# Disaster tweets DL model

In [45]:
import pickle
import numpy as np
import pandas as pd
from sklearn import base, feature_extraction, ensemble, model_selection, pipeline, compose, preprocessing, metrics
from sklearn.experimental import enable_halving_search_cv
import tensorflow as tf
from embedding_transformer import Doc2VecTransformer
from scikeras.wrappers import KerasClassifier
import optuna
import pprint

SCRIPT_NAME='DL-06'

In [46]:
df_train = pd.read_csv('./train_enriched.csv', index_col='id')
df_train.fillna({'keyword': '', 'location': '', 'country': '', 'state': '', 'city': '', 'url_domains': '', 'clean_text': ''}, inplace=True)
df_train.head()

Unnamed: 0_level_0,keyword,positive_factor,location,country,state,city,missing_location,text,clean_text,text_content,...,punct_factor,ann_count,urls_count,tokens_count,stop_words_factor,clean_tokens_factor,url_domains,url_redirects_count,hashtags_sentiment,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,0.5,,,,,1,Our Deeds are the Reason of this #earthquake M...,deed reason earthquake may allah forgive u,Our Deeds are the Reason of this #earthquake M...,...,0.017544,0,0,13,0.384615,0.615385,,0,1.0,1
1,,0.5,,,,,1,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask canada,Forest fire near La Ronge Sask. Canada,...,0.03125,0,0,7,0.0,1.0,,0,0.0,1
2,,0.5,,,,,1,All residents asked to 'shelter in place' are ...,resident asked shelter place notified officer ...,All residents asked to 'shelter in place' are ...,...,0.026786,0,0,22,0.409091,0.590909,,0,0.0,1
3,,0.5,,,,,1,"13,000 people receive #wildfires evacuation or...",people receive wildfire evacuation order calif...,"13,000 people receive #wildfires evacuation or...",...,0.035088,0,0,9,0.111111,0.888889,,0,1.0,1
4,,0.5,,,,,1,Just got sent this photo from Ruby #Alaska as ...,got sent photo ruby alaska smoke wildfire pour...,Just got sent this photo from Ruby #Alaska as ...,...,0.027778,0,0,17,0.352941,0.647059,,0,0.714286,1


In [47]:
text_embedding = None
with open('./train-text-embeddings.pkl', 'rb') as fin:
    text_embedding = pickle.load(fin)

In [48]:
len(text_embedding), len(text_embedding[0])

(7613, 384)

In [49]:
class ConditionalEmbeddingTransformer(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, data):
        # Store the embeddings and mode
        self.data = data

    def fit(self, X, y=None):
        # No fitting necessary, return self
        return self

    def transform(self, X, y=None):
        return np.concatenate([X, self.data], axis=1)


In [50]:
categorical_features = [
    'country',
    'state',
]
numerical_features = [
    'text_length', 
    # 'ann_count',
    # 'url_redirects_count',
    # 'stop_words_factor',
    'positive_factor',
    'hashtags_sentiment'
]

# domains_vec = feature_extraction.text.TfidfVectorizer(max_features=100)
# domains_vec = feature_extraction.text.CountVectorizer(max_features=100)

column_transformer = compose.ColumnTransformer(transformers=[
    # ('domains_vec', domains_vec, 'url_domains'),
    ('one_hot', preprocessing.OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ('numerical', preprocessing.StandardScaler(), numerical_features)
], remainder='drop')

embedding_transformer = ConditionalEmbeddingTransformer(text_embedding)

transformer = pipeline.Pipeline([
    ('columns', column_transformer),
    ('text_embedding', embedding_transformer)
])

transformer.fit(df_train)
X_train = transformer.transform(df_train)
print('X_train shape', X_train.shape)

Y_train = df_train['target']

print(f'X_train type={type(X_train)}, shape={X_train.shape}')
print(f'Y_train shape={Y_train.shape}')


X_train shape (7613, 554)
X_train type=<class 'numpy.ndarray'>, shape=(7613, 554)
Y_train shape=(7613,)


In [68]:
INPUT_SIZE = X_train.shape[1]
OUTPUT_SIZE = 1
NN_SHAPE = [INPUT_SIZE, 256, 256, 64, 1]
BATCH_SIZE= 8
MAX_EPOCHS = 20


def sigmoid(x):
    return tf.cast(tf.greater(tf.nn.sigmoid(x), .5), tf.int32)


def build_model(layer_dims, use_dropout=False, dropout_rate_1=0.3, dropout_rate_2=0.1,
                learning_rate=1e-3, 
                use_emma=False, emma_momentum=0.99, regularizer=0.01):
    n_layers = len(layer_dims)
    layers = []
    initializer = tf.keras.initializers.HeNormal()
    for l in range(1, n_layers-1):
        tf.keras.layers.Dense(layer_dims[l], input_shape=(layer_dims[l-1],), activation='relu', 
                              kernel_initializer=initializer, kernel_regularizer=tf.keras.regularizers.l2(regularizer))
        if use_dropout:
            if l==1:
                rate = dropout_rate_1
            elif l==2 and n_layers > 3:
                rate = dropout_rate_2
            else:
                rate = 0.0
            layers.append(tf.keras.layers.Dropout(rate=rate))
    layers.append(tf.keras.layers.Dense(layer_dims[n_layers-1], activation='linear'))
    model = tf.keras.Sequential(layers)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, use_ema=use_emma, ema_momentum=emma_momentum),                 
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics='accuracy')
    return model


In [69]:
def objective(trial):
    use_dropout = trial.suggest_categorical('use_dropout', [True, False])
    dropout_rate_1 = trial.suggest_float('dropout_rate_1', 0.1, 0.4) if use_dropout else .0
    dropout_rate_2 = trial.suggest_float('dropout_rate_2', 0.0, 0.2) if use_dropout else .0
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    batch_size = trial.suggest_categorical('batch_size', [2, 4, 8, 16])
    use_emma = False # trial.suggest_categorical('use_emma', [True, False])
    emma_momentum = .0 # trial.suggest_float('emma_momentum', 0.9, 0.9999, log=True) if use_emma else 0.999
    regularizer = False # trial.suggest_float('regularizer', 1e-5, 1e-2, log=True)

    k = 3  # Number of validations
    shuffle_split = model_selection.StratifiedShuffleSplit(n_splits=k, test_size=0.2)
    cvscores = []

    model = build_model(layer_dims=NN_SHAPE, 
                        use_dropout=use_dropout, dropout_rate_1=dropout_rate_1, dropout_rate_2=dropout_rate_2,
                        regularizer=regularizer, 
                        learning_rate=learning_rate, use_emma=use_emma, emma_momentum=emma_momentum)
    early_stopping = tf.keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)

    for index, datasets in enumerate(shuffle_split.split(X_train, Y_train)):
        train, test = datasets
        X_train_set = X_train[train]
        Y_train_set = Y_train[train]
        X_test_set = X_train[test]
        Y_test_set = Y_train[test]
        # print(f'---- step {index+1} of {k}')
        # print(f'train size: {len(X_train_set)}, test size: {len(X_test_set)}')
        
        model.fit(X_train_set, Y_train_set, batch_size=batch_size, epochs=MAX_EPOCHS, 
                validation_data=(X_test_set, Y_test_set),
                callbacks=[early_stopping],
                verbose=0)

        Y_predict = sigmoid(model.predict(X_test_set))

        f1_score = metrics.f1_score(Y_test_set, Y_predict)
        # print(f"Validation F1: {f1_score}")

        cvscores.append(f1_score)

    score = np.mean(cvscores)
    print(f"Mean cross-validation F1 score: {score}")
    # print(f"Standard deviation of cross-validation score: {tf.math.reduce_std(cvscores)}")
    
    return score

In [70]:
study_name=SCRIPT_NAME
storage=f"sqlite:///{SCRIPT_NAME}.optuna.db"

# recreate study for new NN architecture
try:
    optuna.delete_study(study_name=study_name, storage=storage)
except:
    pass


In [71]:
study = optuna.create_study(study_name=study_name, storage=storage,
                            direction='maximize', 
                            sampler=optuna.samplers.TPESampler(seed=42, consider_prior=True),
                            load_if_exists=True)
study.optimize(objective, n_trials=10, show_progress_bar=True)

# Print optimal hyperparameters and the corresponding score

trial = study.best_trial
print(f'-> Best score: {trial.value}')
print(f'-> Optimal hyperparameters: ')
pprint.pprint(trial.params)


[I 2024-04-02 09:08:06,642] A new study created in RDB with name: DL-06
  0%|          | 0/10 [00:00<?, ?it/s]



Best trial: 0. Best value: 0.83223:  10%|█         | 1/10 [00:26<03:58, 26.48s/it]

Mean cross-validation F1 score: 0.8322304785845218
[I 2024-04-02 09:08:33,128] Trial 0 finished with value: 0.8322304785845218 and parameters: {'use_dropout': False, 'learning_rate': 0.008471801418819975, 'batch_size': 2}. Best is trial 0 with value: 0.8322304785845218.


Best trial: 0. Best value: 0.83223:  20%|██        | 2/10 [01:07<04:38, 34.87s/it]

Mean cross-validation F1 score: 0.8067718252186937
[I 2024-04-02 09:09:13,869] Trial 1 finished with value: 0.8067718252186937 and parameters: {'use_dropout': True, 'dropout_rate_1': 0.3124217733388137, 'dropout_rate_2': 0.004116898859160489, 'learning_rate': 0.07579479953348005, 'batch_size': 2}. Best is trial 0 with value: 0.8322304785845218.


Best trial: 0. Best value: 0.83223:  30%|███       | 3/10 [01:54<04:42, 40.38s/it]

Mean cross-validation F1 score: 0.8280592105022032
[I 2024-04-02 09:10:00,800] Trial 2 finished with value: 0.8280592105022032 and parameters: {'use_dropout': False, 'learning_rate': 0.0005342937261279777, 'batch_size': 4}. Best is trial 0 with value: 0.8322304785845218.


Best trial: 0. Best value: 0.83223:  40%|████      | 4/10 [02:03<02:47, 27.95s/it]

Mean cross-validation F1 score: 0.8226004414181091
[I 2024-04-02 09:10:09,711] Trial 3 finished with value: 0.8226004414181091 and parameters: {'use_dropout': False, 'learning_rate': 0.013826232179369865, 'batch_size': 8}. Best is trial 0 with value: 0.8322304785845218.


Best trial: 0. Best value: 0.83223:  50%|█████     | 5/10 [02:53<03:00, 36.17s/it]

Mean cross-validation F1 score: 0.8012561601715259
[I 2024-04-02 09:11:00,467] Trial 4 finished with value: 0.8012561601715259 and parameters: {'use_dropout': True, 'dropout_rate_1': 0.11951547789558387, 'dropout_rate_2': 0.18977710745066667, 'learning_rate': 0.07286653737491042, 'batch_size': 2}. Best is trial 0 with value: 0.8322304785845218.


Best trial: 0. Best value: 0.83223:  60%|██████    | 6/10 [03:14<02:03, 31.00s/it]

Mean cross-validation F1 score: 0.8182585856867198
[I 2024-04-02 09:11:21,404] Trial 5 finished with value: 0.8182585856867198 and parameters: {'use_dropout': True, 'dropout_rate_1': 0.2485530730333811, 'dropout_rate_2': 0.006877704223043679, 'learning_rate': 0.0433792069749094, 'batch_size': 4}. Best is trial 0 with value: 0.8322304785845218.


Best trial: 0. Best value: 0.83223:  70%|███████   | 7/10 [03:25<01:13, 24.38s/it]

Mean cross-validation F1 score: 0.8022261107980642
[I 2024-04-02 09:11:32,171] Trial 6 finished with value: 0.8022261107980642 and parameters: {'use_dropout': True, 'dropout_rate_1': 0.39087538832936763, 'dropout_rate_2': 0.15502656467222292, 'learning_rate': 0.057279044707996205, 'batch_size': 8}. Best is trial 0 with value: 0.8322304785845218.


Best trial: 0. Best value: 0.83223:  80%|████████  | 8/10 [05:56<02:09, 64.69s/it]

Mean cross-validation F1 score: 0.8262679449822562
[I 2024-04-02 09:14:03,179] Trial 7 finished with value: 0.8262679449822562 and parameters: {'use_dropout': True, 'dropout_rate_1': 0.19759909922897934, 'dropout_rate_2': 0.07773545793789641, 'learning_rate': 0.00012172958098369953, 'batch_size': 2}. Best is trial 0 with value: 0.8322304785845218.


Best trial: 0. Best value: 0.83223:  90%|█████████ | 9/10 [09:02<01:42, 102.66s/it]

Mean cross-validation F1 score: 0.7825249394868652
[I 2024-04-02 09:17:09,304] Trial 8 finished with value: 0.7825249394868652 and parameters: {'use_dropout': False, 'learning_rate': 1.9870215385428627e-05, 'batch_size': 2}. Best is trial 0 with value: 0.8322304785845218.


Best trial: 0. Best value: 0.83223: 100%|██████████| 10/10 [10:02<00:00, 60.20s/it]

Mean cross-validation F1 score: 0.7657339541864617
[I 2024-04-02 09:18:08,653] Trial 9 finished with value: 0.7657339541864617 and parameters: {'use_dropout': True, 'dropout_rate_1': 0.3187021504122962, 'dropout_rate_2': 0.15425406933718916, 'learning_rate': 1.97778285124627e-05, 'batch_size': 8}. Best is trial 0 with value: 0.8322304785845218.
-> Best score: 0.8322304785845218
-> Optimal hyperparameters: 
{'batch_size': 2, 'learning_rate': 0.008471801418819975, 'use_dropout': False}





#### Optimal model parameters

In [17]:
print(f'-> Best score: {trial.value}')
pprint.pprint(trial.params)

-> Best score: 0.833765885392289
{'batch_size': 4, 'learning_rate': 0.004835952776465951, 'use_dropout': False}


#### Train model with optimal parameters

In [29]:
def train_best_model(best_params):
    batch_size = best_params.pop('batch_size', BATCH_SIZE)
    early_stopping = tf.keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)
    best_model = build_model(layer_dims=NN_SHAPE, **best_params)
    best_model.fit(X_train, Y_train, batch_size=batch_size, epochs=MAX_EPOCHS, validation_split=0.2,
                callbacks=[early_stopping], verbose=3)
    Y_predict = sigmoid(best_model(X_train))
    f1_score = metrics.f1_score(Y_train, Y_predict)
    print(f'Best model F1={f1_score:.3f}')
    return best_model

best_model = train_best_model(trial.params)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Best model F1=0.826


In [74]:
best_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 1024)              568320    
                                                                 
 dense_4 (Dense)             (None, 1024)              1049600   
                                                                 
 dense_5 (Dense)             (None, 1)                 1025      
                                                                 
Total params: 1,618,945
Trainable params: 1,618,945
Non-trainable params: 0
_________________________________________________________________


#### Generate output

In [75]:
best_model.save(SCRIPT_NAME)



INFO:tensorflow:Assets written to: DL-06/assets


INFO:tensorflow:Assets written to: DL-06/assets


In [76]:
df_test = pd.read_csv('./test_enriched.csv', index_col='id')
df_test.fillna({'keyword': '', 'location': '', 'country': '', 'state': '', 'city': '', 'url_domains': ''}, inplace=True)
df_test.shape

(3263, 22)

In [77]:
test_embedding = None
with open('./test-text-embeddings.pkl', 'rb') as fin:
    test_embedding = pickle.load(fin)
len(test_embedding), len(test_embedding[0])

(3263, 384)

In [78]:
embedding_transformer.data = test_embedding
X_test = transformer.transform(df_test)
print('X_test shape', X_test.shape)

Y_test_predict = sigmoid(best_model(X_test))

df_example = pd.read_csv('./sample_submission.csv')
df_example['target'] = Y_test_predict

df_example.to_csv(f'./{SCRIPT_NAME}-submission.csv', index=False)

X_test shape (3263, 554)
