# Disaster tweets DL model

In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn import base, feature_extraction, ensemble, model_selection, pipeline, compose, preprocessing, metrics
from sklearn.experimental import enable_halving_search_cv
import tensorflow as tf
from embedding_transformer import Doc2VecTransformer
from scikeras.wrappers import KerasClassifier
import optuna
import pprint

SCRIPT_NAME='DL-06'

2024-04-04 22:19:06.312049: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-04 22:19:06.357977: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_train = pd.read_csv('./train_enriched.csv', index_col='id')
df_train.fillna({'keyword': '', 'location': '', 'country': '', 'state': '', 'city': '', 'url_domains': '', 'clean_text': ''}, inplace=True)
df_train.head()

Unnamed: 0_level_0,keyword,positive_factor,location,country,state,city,missing_location,text,clean_text,text_content,...,punct_factor,ann_count,urls_count,tokens_count,stop_words_factor,clean_tokens_factor,url_domains,url_redirects_count,hashtags_sentiment,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,0.5,,,,,1,Our Deeds are the Reason of this #earthquake M...,deed reason earthquake may allah forgive u,Our Deeds are the Reason of this #earthquake M...,...,0.017544,0,0,13,0.384615,0.615385,,0,1.0,1
1,,0.5,,,,,1,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask canada,Forest fire near La Ronge Sask. Canada,...,0.03125,0,0,7,0.0,1.0,,0,0.0,1
2,,0.5,,,,,1,All residents asked to 'shelter in place' are ...,resident asked shelter place notified officer ...,All residents asked to 'shelter in place' are ...,...,0.026786,0,0,22,0.409091,0.590909,,0,0.0,1
3,,0.5,,,,,1,"13,000 people receive #wildfires evacuation or...",people receive wildfire evacuation order calif...,"13,000 people receive #wildfires evacuation or...",...,0.035088,0,0,9,0.111111,0.888889,,0,1.0,1
4,,0.5,,,,,1,Just got sent this photo from Ruby #Alaska as ...,got sent photo ruby alaska smoke wildfire pour...,Just got sent this photo from Ruby #Alaska as ...,...,0.027778,0,0,17,0.352941,0.647059,,0,0.714286,1


In [3]:
text_embedding = None
with open('./train-text-embeddings.pkl', 'rb') as fin:
    text_embedding = pickle.load(fin)

In [4]:
len(text_embedding), len(text_embedding[0])

(7613, 384)

In [5]:
class ConditionalEmbeddingTransformer(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, data):
        # Store the embeddings and mode
        self.data = data

    def fit(self, X, y=None):
        # No fitting necessary, return self
        return self

    def transform(self, X, y=None):
        return np.concatenate([X, self.data], axis=1)


In [6]:
categorical_features = [
    'country',
    'state',
]
numerical_features = [
    'text_length', 
    # 'ann_count',
    # 'url_redirects_count',
    # 'stop_words_factor',
    'positive_factor',
    # 'hashtags_sentiment'
]

# domains_vec = feature_extraction.text.TfidfVectorizer(max_features=100)
# domains_vec = feature_extraction.text.CountVectorizer(max_features=100)

column_transformer = compose.ColumnTransformer(transformers=[
    # ('domains_vec', domains_vec, 'url_domains'),
    ('one_hot', preprocessing.OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ('numerical', preprocessing.StandardScaler(), numerical_features)
], remainder='drop')

embedding_transformer = ConditionalEmbeddingTransformer(text_embedding)

transformer = pipeline.Pipeline([
    ('columns', column_transformer),
    ('text_embedding', embedding_transformer)
])

transformer.fit(df_train)
X_train = transformer.transform(df_train)
print('X_train shape', X_train.shape)

Y_train = df_train['target']

print(f'X_train type={type(X_train)}, shape={X_train.shape}')
print(f'Y_train shape={Y_train.shape}')


X_train shape (7613, 553)
X_train type=<class 'numpy.ndarray'>, shape=(7613, 553)
Y_train shape=(7613,)


In [17]:
INPUT_SIZE = X_train.shape[1]
OUTPUT_SIZE = 1
NN_SHAPE = [INPUT_SIZE, 64, 64, 1]
BATCH_SIZE= X_train.shape[0]
MAX_EPOCHS = 20


def sigmoid(x):
    return tf.cast(tf.greater(tf.nn.sigmoid(x), .5), tf.int32)


def build_model(layer_dims, use_dropout=False, dropout_rate_1=0.3, dropout_rate_2=0.1,
                learning_rate=1e-3, 
                use_emma=False, emma_momentum=0.99, 
                use_regularizer=False, regularizer=0.01,
                initializer='glorot_normal',
                activation='relu'
                ):
    n_layers = len(layer_dims)
    layers = []

    for l in range(1, n_layers-1):
        layer_kws = {}

        if use_regularizer:
            layer_kws['kernel_regularizer'] = tf.keras.regularizers.l2(regularizer)
        if initializer:
            layer_kws['kernel_initializer'] = initializer
        
        hidden_layer= tf.keras.layers.Dense(layer_dims[l], input_shape=(layer_dims[l-1],), activation=activation, **layer_kws)
        
        layers.append(hidden_layer)
        
        if use_dropout:
            if l==1:
                rate = dropout_rate_1
            elif l==2 and n_layers > 3:
                rate = dropout_rate_2
            else:
                rate = 0.0
            if rate > .0:
                layers.append(tf.keras.layers.Dropout(rate=rate))
    layers.append(tf.keras.layers.Dense(layer_dims[n_layers-1], activation='linear'))
    model = tf.keras.Sequential(layers)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, use_ema=use_emma, ema_momentum=emma_momentum),                 
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics='accuracy')
    return model


In [18]:
def objective(trial):
    use_dropout = False # trial.suggest_categorical('use_dropout', [True, False])
    dropout_rate_1 = .0 # trial.suggest_float('dropout_rate_1', 0.1, 0.4) if use_dropout else .0
    dropout_rate_2 = .0 # trial.suggest_float('dropout_rate_2', 0.0, 0.2) if use_dropout else .0
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    batch_size = trial.suggest_categorical('batch_size', [BATCH_SIZE, BATCH_SIZE//2 + 1, BATCH_SIZE//4 + 1])
    use_emma = False # trial.suggest_categorical('use_emma', [True, False])
    emma_momentum =.0 # trial.suggest_float('emma_momentum', 0.5, 0.9, log=True) if use_emma else 0.999
    regularizer = trial.suggest_float('regularizer', 1e-5, 1e-2, log=True)
    initializer = trial.suggest_categorical('initializer', ['glorot_normal', 'he_normal'])
    activation = trial.suggest_categorical('activation', ['relu', 'gelu', 'leaky_relu'])

    k = 5  # Number of validations
    shuffle_split = model_selection.StratifiedShuffleSplit(n_splits=k, test_size=0.2)
    cvscores = []

    model = build_model(layer_dims=NN_SHAPE, 
                        use_dropout=use_dropout, dropout_rate_1=dropout_rate_1, dropout_rate_2=dropout_rate_2,
                        initializer=initializer,
                        regularizer=regularizer, 
                        activation=activation,
                        learning_rate=learning_rate, use_emma=use_emma, emma_momentum=emma_momentum)
    
    early_stopping = tf.keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)

    for index, datasets in enumerate(shuffle_split.split(X_train, Y_train)):
        train, test = datasets
        X_train_set = X_train[train]
        Y_train_set = Y_train[train]
        X_test_set = X_train[test]
        Y_test_set = Y_train[test]
        # print(f'---- step {index+1} of {k}')
        # print(f'train size: {len(X_train_set)}, test size: {len(X_test_set)}')
        
        model.fit(X_train_set, Y_train_set, batch_size=batch_size, epochs=MAX_EPOCHS, 
                validation_data=(X_test_set, Y_test_set),
                callbacks=[early_stopping],
                verbose=0)

        Y_predict = sigmoid(model.predict(X_test_set))

        f1_score = metrics.f1_score(Y_test_set, Y_predict)
        # print(f"Validation F1: {f1_score}")

        cvscores.append(f1_score)

    score = np.mean(cvscores)
    print(f"Mean cross-validation F1 score: {score}")
    # print(f"Standard deviation of cross-validation score: {tf.math.reduce_std(cvscores)}")
    # print(model.summary())
    
    return score

In [19]:
study_name=SCRIPT_NAME
storage=f"sqlite:///{SCRIPT_NAME}.optuna.db"

# recreate study for new NN architecture
try:
    optuna.delete_study(study_name=study_name, storage=storage)
except:
    pass


In [20]:
study = optuna.create_study(study_name=study_name, storage=storage,
                            direction='maximize', 
                            sampler=optuna.samplers.TPESampler(seed=42, consider_prior=True),
                            load_if_exists=True)
study.optimize(objective, n_trials=10, show_progress_bar=True)

# Print optimal hyperparameters and the corresponding score

trial = study.best_trial
print(f'-> Best score: {trial.value}')
print(f'-> Optimal hyperparameters: ')
pprint.pprint(trial.params)


[I 2024-04-04 22:22:48,460] A new study created in RDB with name: DL-06
  0%|          | 0/10 [00:00<?, ?it/s]



Best trial: 0. Best value: 0.743431:  10%|█         | 1/10 [00:05<00:53,  5.91s/it]

Mean cross-validation F1 score: 0.743430942186669
[I 2024-04-04 22:22:54,363] Trial 0 finished with value: 0.743430942186669 and parameters: {'learning_rate': 0.00031489116479568613, 'batch_size': 7613, 'regularizer': 2.9380279387035334e-05, 'initializer': 'glorot_normal', 'activation': 'relu'}. Best is trial 0 with value: 0.743430942186669.


Best trial: 0. Best value: 0.743431:  20%|██        | 2/10 [00:11<00:45,  5.74s/it]

Mean cross-validation F1 score: 0.699155844249646
[I 2024-04-04 22:22:59,986] Trial 1 finished with value: 0.699155844249646 and parameters: {'learning_rate': 1.2087541473056957e-05, 'batch_size': 7613, 'regularizer': 3.511356313970405e-05, 'initializer': 'he_normal', 'activation': 'relu'}. Best is trial 0 with value: 0.743430942186669.


Best trial: 2. Best value: 0.806932:  30%|███       | 3/10 [00:15<00:33,  4.77s/it]

Mean cross-validation F1 score: 0.8069321037289597
[I 2024-04-04 22:23:03,605] Trial 2 finished with value: 0.8069321037289597 and parameters: {'learning_rate': 0.0028016351587162596, 'batch_size': 1904, 'regularizer': 0.00023345864076016249, 'initializer': 'glorot_normal', 'activation': 'gelu'}. Best is trial 2 with value: 0.8069321037289597.


Best trial: 3. Best value: 0.808402:  40%|████      | 4/10 [00:18<00:25,  4.25s/it]

Mean cross-validation F1 score: 0.8084019932237224
[I 2024-04-04 22:23:07,059] Trial 3 finished with value: 0.8084019932237224 and parameters: {'learning_rate': 0.0026926469100861782, 'batch_size': 1904, 'regularizer': 0.00788671412999049, 'initializer': 'glorot_normal', 'activation': 'gelu'}. Best is trial 3 with value: 0.8084019932237224.


Best trial: 3. Best value: 0.808402:  50%|█████     | 5/10 [00:25<00:26,  5.27s/it]

Mean cross-validation F1 score: 0.2706746754961221
[I 2024-04-04 22:23:14,145] Trial 4 finished with value: 0.2706746754961221 and parameters: {'learning_rate': 3.077180271250682e-05, 'batch_size': 1904, 'regularizer': 5.9750279999602906e-05, 'initializer': 'glorot_normal', 'activation': 'gelu'}. Best is trial 3 with value: 0.8084019932237224.


Best trial: 3. Best value: 0.808402:  60%|██████    | 6/10 [00:28<00:18,  4.59s/it]

Mean cross-validation F1 score: 0.7658286639530932
[I 2024-04-04 22:23:17,397] Trial 5 finished with value: 0.7658286639530932 and parameters: {'learning_rate': 0.07556810141274425, 'batch_size': 3807, 'regularizer': 0.0006218704727769079, 'initializer': 'glorot_normal', 'activation': 'leaky_relu'}. Best is trial 3 with value: 0.8084019932237224.


Best trial: 3. Best value: 0.808402:  70%|███████   | 7/10 [00:34<00:14,  4.92s/it]

Mean cross-validation F1 score: 0.7727282125329069
[I 2024-04-04 22:23:23,008] Trial 6 finished with value: 0.7727282125329069 and parameters: {'learning_rate': 0.00035868164986275477, 'batch_size': 3807, 'regularizer': 6.963114377829287e-05, 'initializer': 'glorot_normal', 'activation': 'leaky_relu'}. Best is trial 3 with value: 0.8084019932237224.


Best trial: 3. Best value: 0.808402:  80%|████████  | 8/10 [00:37<00:08,  4.26s/it]

Mean cross-validation F1 score: 0.8037977494320355
[I 2024-04-04 22:23:25,840] Trial 7 finished with value: 0.8037977494320355 and parameters: {'learning_rate': 0.012273800987852962, 'batch_size': 1904, 'regularizer': 0.001319994226153501, 'initializer': 'he_normal', 'activation': 'gelu'}. Best is trial 3 with value: 0.8084019932237224.


Best trial: 3. Best value: 0.808402:  90%|█████████ | 9/10 [00:40<00:03,  3.76s/it]

Mean cross-validation F1 score: 0.7933505280948496
[I 2024-04-04 22:23:28,501] Trial 8 finished with value: 0.7933505280948496 and parameters: {'learning_rate': 0.028340904295147733, 'batch_size': 7613, 'regularizer': 8.569331925053983e-05, 'initializer': 'he_normal', 'activation': 'gelu'}. Best is trial 3 with value: 0.8084019932237224.


Best trial: 3. Best value: 0.808402: 100%|██████████| 10/10 [00:44<00:00,  4.49s/it]

Mean cross-validation F1 score: 0.6763480087419891
[I 2024-04-04 22:23:33,318] Trial 9 finished with value: 0.6763480087419891 and parameters: {'learning_rate': 3.0086868214458443e-05, 'batch_size': 3807, 'regularizer': 0.002055424552015075, 'initializer': 'he_normal', 'activation': 'relu'}. Best is trial 3 with value: 0.8084019932237224.
-> Best score: 0.8084019932237224
-> Optimal hyperparameters: 
{'activation': 'gelu',
 'batch_size': 1904,
 'initializer': 'glorot_normal',
 'learning_rate': 0.0026926469100861782,
 'regularizer': 0.00788671412999049}





#### Optimal model parameters

In [15]:
print(f'-> Best score: {trial.value}')
pprint.pprint(trial.params)

-> Best score: 0.8618254341828041
{'emma_momentum': 0.9585386269812564,
 'learning_rate': 0.00031489116479568613,
 'regularizer': 2.9380279387035334e-05,
 'use_emma': True}


#### Train model with optimal parameters

In [16]:
def train_best_model(best_params):
    batch_size = best_params.pop('batch_size', BATCH_SIZE)
    early_stopping = tf.keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)
    best_model = build_model(layer_dims=NN_SHAPE, **best_params)
    best_model.fit(X_train, Y_train, batch_size=batch_size, epochs=MAX_EPOCHS, validation_split=0.2,
                callbacks=[early_stopping], verbose=3)
    Y_predict = sigmoid(best_model(X_train))
    f1_score = metrics.f1_score(Y_train, Y_predict)
    print(f'Best model F1={f1_score:.3f}')
    return best_model

best_model = train_best_model(trial.params)

Epoch 1/20




Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Best model F1=0.878


In [17]:
best_model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_40 (Dense)            (None, 2048)              1136640   
                                                                 
 dense_41 (Dense)            (None, 1024)              2098176   
                                                                 
 dense_42 (Dense)            (None, 64)                65600     
                                                                 
 dense_43 (Dense)            (None, 1)                 65        
                                                                 
Total params: 3,300,481
Trainable params: 3,300,481
Non-trainable params: 0
_________________________________________________________________


#### Generate output

In [18]:
best_model.save(SCRIPT_NAME)



INFO:tensorflow:Assets written to: DL-06/assets


INFO:tensorflow:Assets written to: DL-06/assets


In [19]:
df_test = pd.read_csv('./test_enriched.csv', index_col='id')
df_test.fillna({'keyword': '', 'location': '', 'country': '', 'state': '', 'city': '', 'url_domains': ''}, inplace=True)
df_test.shape

(3263, 22)

In [20]:
test_embedding = None
with open('./test-text-embeddings.pkl', 'rb') as fin:
    test_embedding = pickle.load(fin)
len(test_embedding), len(test_embedding[0])

(3263, 384)

In [21]:
embedding_transformer.data = test_embedding
X_test = transformer.transform(df_test)
print('X_test shape', X_test.shape)

Y_test_predict = sigmoid(best_model(X_test))

df_example = pd.read_csv('./sample_submission.csv')
df_example['target'] = Y_test_predict

df_example.to_csv(f'./{SCRIPT_NAME}-submission.csv', index=False)

X_test shape (3263, 554)
