# Disaster tweets DL model

In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn import base, feature_extraction, ensemble, model_selection, pipeline, compose, preprocessing, metrics
from sklearn.experimental import enable_halving_search_cv
import tensorflow as tf
from embedding_transformer import Doc2VecTransformer
from scikeras.wrappers import KerasClassifier
import optuna
import pprint

SCRIPT_NAME='DL-05'

2024-03-27 14:41:55.769099: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-27 14:41:55.789354: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_train = pd.read_csv('./train_enriched.csv', index_col='id')
df_train.fillna({'keyword': '', 'location': '', 'country': '', 'state': '', 'city': '', 'url_domains': '', 'clean_text': ''}, inplace=True)
df_train.head()

Unnamed: 0_level_0,keyword,positive_factor,location,country,state,city,missing_location,text,clean_text,text_content,...,punct_factor,ann_count,urls_count,tokens_count,stop_words_factor,clean_tokens_factor,url_domains,url_redirects_count,hashtags_sentiment,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,0.5,,,,,1,Our Deeds are the Reason of this #earthquake M...,deed reason earthquake may allah forgive u,Our Deeds are the Reason of this #earthquake M...,...,0.017544,0,0,13,0.384615,0.615385,,0,1.0,1
1,,0.5,,,,,1,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask canada,Forest fire near La Ronge Sask. Canada,...,0.03125,0,0,7,0.0,1.0,,0,0.0,1
2,,0.5,,,,,1,All residents asked to 'shelter in place' are ...,resident asked shelter place notified officer ...,All residents asked to 'shelter in place' are ...,...,0.026786,0,0,22,0.409091,0.590909,,0,0.0,1
3,,0.5,,,,,1,"13,000 people receive #wildfires evacuation or...",people receive wildfire evacuation order calif...,"13,000 people receive #wildfires evacuation or...",...,0.035088,0,0,9,0.111111,0.888889,,0,1.0,1
4,,0.5,,,,,1,Just got sent this photo from Ruby #Alaska as ...,got sent photo ruby alaska smoke wildfire pour...,Just got sent this photo from Ruby #Alaska as ...,...,0.027778,0,0,17,0.352941,0.647059,,0,0.714286,1


In [3]:
text_embedding = None
with open('./train-text-embeddings.pkl', 'rb') as fin:
    text_embedding = pickle.load(fin)

In [4]:
len(text_embedding), len(text_embedding[0])

(7613, 384)

In [5]:
class ConditionalEmbeddingTransformer(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, data):
        # Store the embeddings and mode
        self.data = data

    def fit(self, X, y=None):
        # No fitting necessary, return self
        return self

    def transform(self, X, y=None):
        return np.concatenate([X, self.data], axis=1)


In [6]:
categorical_features = [
    'country',
    'state',
]
numerical_features = [
    'text_length', 
    # 'ann_count',
    # 'url_redirects_count',
    # 'stop_words_factor',
    'positive_factor',
    'hashtags_sentiment'
]

# text_vec = feature_extraction.text.TfidfVectorizer(max_features=2000)
# text_vec = Doc2VecTransformer(vector_size=2000)
# text_vec = feature_extraction.text.CountVectorizer(max_features=1000)
# domains_vec = feature_extraction.text.TfidfVectorizer(max_features=100)
# domains_vec = feature_extraction.text.CountVectorizer(max_features=100)

column_transformer = compose.ColumnTransformer(transformers=[
    # ('text_vec', text_vec, 'clean_text'),
    # ('domains_vec', domains_vec, 'url_domains'),
    ('one_hot', preprocessing.OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ('numerical', preprocessing.StandardScaler(), numerical_features)
], remainder='drop')

embedding_transformer = ConditionalEmbeddingTransformer(text_embedding)

transformer = pipeline.Pipeline([
    ('columns', column_transformer),
    ('text_embedding', embedding_transformer)
])

transformer.fit(df_train)
X_train = transformer.transform(df_train)
print('X_train shape', X_train.shape)

Y_train = df_train['target']

print(f'X_train type={type(X_train)}, shape={X_train.shape}')
print(f'Y_train shape={Y_train.shape}')


X_train shape (7613, 554)
X_train type=<class 'numpy.ndarray'>, shape=(7613, 554)
Y_train shape=(7613,)


In [16]:
INPUT_SIZE = X_train.shape[1]
OUTPUT_SIZE = 1
HIDDEN_LAYER_SIZE = 16
BATCH_SIZE= 2 # int(0.1*X_train.shape[0])
MAX_EPOCHS = 20


def sigmoid(x):
    return tf.cast(tf.greater(tf.nn.sigmoid(x), .5), tf.int32)


def build_model(hidden_layers=HIDDEN_LAYER_SIZE, use_dropout=False, dropout_rate=0.1, learning_rate=1e-3, use_emma=False, emma_momentum=0.99):
    layers = [
        tf.keras.layers.Dense(hidden_layers, input_shape=(INPUT_SIZE,), activation='relu'),
        tf.keras.layers.Dense(hidden_layers, activation='relu'),
        tf.keras.layers.Dense(OUTPUT_SIZE, activation='linear')
    ]
    if use_dropout:
        layers = [
            layers[0],
            tf.keras.layers.Dropout(rate=dropout_rate),
            layers[1],
            tf.keras.layers.Dropout(rate=dropout_rate),
            layers[2]
        ]
    model = tf.keras.Sequential(layers)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, use_ema=use_emma, ema_momentum=emma_momentum),                 
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics='accuracy')
    return model


In [26]:
from sklearn.model_selection import StratifiedKFold

def objective(trial):
    use_dropout = True # trial.suggest_categorical('use_dropout', [True, False])
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.4) if use_dropout else .0
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
    hidden_layer_size = HIDDEN_LAYER_SIZE
    batch_size = trial.suggest_categorical('batch_size', [2, 4, 8])
    use_emma = False # trial.suggest_categorical('use_emma', [True, False])
    emma_momentum = .999 # trial.suggest_float('emma_momentum', 0.9, 0.9999, log=True) if use_emma else 0.999

    k = 5  # Number of folds
    kfold = StratifiedKFold(n_splits=k, shuffle=True)
    cvscores = []

    model = build_model(hidden_layers=hidden_layer_size, use_dropout=use_dropout, dropout_rate=dropout_rate, 
                        learning_rate=learning_rate, use_emma=use_emma, emma_momentum=emma_momentum)
    early_stopping = tf.keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)

    for index, datasets in enumerate(kfold.split(X_train, Y_train)):
        train, test = datasets
        X_train_set = X_train[train]
        Y_train_set = Y_train[train]
        X_test_set = X_train[test]
        Y_test_set = Y_train[test]
        # print(f'---- step {index+1} of {k}')
        # print(f'train size: {len(X_train_set)}, test size: {len(X_test_set)}')
        
        model.fit(X_train_set, Y_train_set, batch_size=batch_size, epochs=MAX_EPOCHS, 
                validation_data=(X_test_set, Y_test_set),
                callbacks=[early_stopping], 
                verbose=0)

        Y_predict = sigmoid(model(X_test_set))

        f1_score = metrics.f1_score(Y_test_set, Y_predict)

        # print(f"Validation F1: {f1_score}")

        cvscores.append(f1_score)

    score = np.mean(cvscores)
    # print(f"Mean cross-validation F1 score: {score}")
    # print(f"Standard deviation of cross-validation score: {tf.math.reduce_std(cvscores)}")
    
    return score

In [27]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10, n_jobs=4, show_progress_bar=True)

# Print the best hyperparameters and the corresponding accuracy
print('dir(study)', dir(study))

print('Best trial:')
trial = study.best_trial

print(f'-> Best score: {trial.value}')
print(f'-> Optimal hyperparameters: ')
pprint.pprint(trial.params)


[I 2024-03-27 17:11:17,704] A new study created in memory with name: no-name-1bedf0b7-ebc1-4980-9cd8-d0477840ccec
Best trial: 0. Best value: 0.840581:  10%|█         | 1/10 [01:43<15:32, 103.61s/it]

[I 2024-03-27 17:13:01,301] Trial 0 finished with value: 0.8405809951239277 and parameters: {'dropout_rate': 0.2582012910782223, 'learning_rate': 0.0004393653708624279, 'batch_size': 4}. Best is trial 0 with value: 0.8405809951239277.


Best trial: 0. Best value: 0.840581:  20%|██        | 2/10 [02:50<10:56, 82.04s/it] 

[I 2024-03-27 17:14:08,250] Trial 1 finished with value: 0.834120088998499 and parameters: {'dropout_rate': 0.1946338351937873, 'learning_rate': 5.6261992845547525e-05, 'batch_size': 4}. Best is trial 0 with value: 0.8405809951239277.


Best trial: 0. Best value: 0.840581:  30%|███       | 3/10 [03:02<05:51, 50.23s/it]

[I 2024-03-27 17:14:20,631] Trial 3 finished with value: 0.8319793933959241 and parameters: {'dropout_rate': 0.32295303347736726, 'learning_rate': 0.001105322306357548, 'batch_size': 2}. Best is trial 0 with value: 0.8405809951239277.


Best trial: 0. Best value: 0.840581:  40%|████      | 4/10 [03:20<03:44, 37.38s/it]

[I 2024-03-27 17:14:38,303] Trial 4 finished with value: 0.8383829852075013 and parameters: {'dropout_rate': 0.15430687116186914, 'learning_rate': 0.00334043129016256, 'batch_size': 4}. Best is trial 0 with value: 0.8405809951239277.


Best trial: 0. Best value: 0.840581:  50%|█████     | 5/10 [04:27<03:59, 47.92s/it]

[I 2024-03-27 17:15:44,917] Trial 5 finished with value: 0.8385629229713294 and parameters: {'dropout_rate': 0.10749707959041259, 'learning_rate': 0.001304653524994113, 'batch_size': 4}. Best is trial 0 with value: 0.8405809951239277.


Best trial: 0. Best value: 0.840581:  60%|██████    | 6/10 [04:51<02:38, 39.72s/it]

[I 2024-03-27 17:16:08,711] Trial 7 finished with value: 0.8370456798570107 and parameters: {'dropout_rate': 0.11080684372265459, 'learning_rate': 0.00011485897383928687, 'batch_size': 8}. Best is trial 0 with value: 0.8405809951239277.


Best trial: 0. Best value: 0.840581:  70%|███████   | 7/10 [05:09<01:38, 32.93s/it]

[I 2024-03-27 17:16:27,672] Trial 6 finished with value: 0.8322296484079039 and parameters: {'dropout_rate': 0.18917524964141058, 'learning_rate': 0.00012194559158878622, 'batch_size': 4}. Best is trial 0 with value: 0.8405809951239277.


Best trial: 0. Best value: 0.840581:  80%|████████  | 8/10 [05:57<01:14, 37.49s/it]

[I 2024-03-27 17:17:14,923] Trial 8 finished with value: 0.8404178969080466 and parameters: {'dropout_rate': 0.2809698089557967, 'learning_rate': 0.00043355846691961236, 'batch_size': 4}. Best is trial 0 with value: 0.8405809951239277.


Best trial: 0. Best value: 0.840581:  90%|█████████ | 9/10 [07:34<00:56, 56.09s/it]

[I 2024-03-27 17:18:51,926] Trial 9 finished with value: 0.823986402491796 and parameters: {'dropout_rate': 0.17509720367918657, 'learning_rate': 2.0272016135619728e-05, 'batch_size': 4}. Best is trial 0 with value: 0.8405809951239277.


Best trial: 0. Best value: 0.840581: 100%|██████████| 10/10 [07:43<00:00, 46.38s/it]

[I 2024-03-27 17:19:01,510] Trial 2 finished with value: 0.8275270725048054 and parameters: {'dropout_rate': 0.13672153764063116, 'learning_rate': 1.4160299726084621e-05, 'batch_size': 2}. Best is trial 0 with value: 0.8405809951239277.
dir(study) ['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_ask', '_directions', '_get_trials', '_is_multi_objective', '_log_completed_trial', '_pop_waiting_trial_id', '_should_skip_enqueue', '_stop_flag', '_storage', '_study_id', '_tell', '_thread_local', 'add_trial', 'add_trials', 'ask', 'best_params', 'best_trial', 'best_trials', 'best_value', 'direction', 'directions', 'enqueue_trial', 'get_trials', 'metric_names', 'optimize'




#### Optimal model parameters

In [28]:
pprint.pprint(trial.params)

{'batch_size': 4,
 'dropout_rate': 0.2582012910782223,
 'learning_rate': 0.0004393653708624279}


#### Train model with optimal parameters

In [29]:
def train_best_model(best_params):
    batch_size = best_params.pop('batch_size', BATCH_SIZE)
    early_stopping = tf.keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)
    best_model = build_model(**best_params)
    best_model.fit(X_train, Y_train, batch_size=batch_size, epochs=MAX_EPOCHS, validation_split=0.2,
                callbacks=[early_stopping], verbose=3)
    Y_predict = sigmoid(best_model(X_train))
    f1_score = metrics.f1_score(Y_train, Y_predict)
    print(f'Best model F1={f1_score:.3f}')
    return best_model

best_model = train_best_model(trial.params)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Best model F1=0.839


In [25]:
best_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 16)                8880      
                                                                 
 dropout_2 (Dropout)         (None, 16)                0         
                                                                 
 dense_4 (Dense)             (None, 16)                272       
                                                                 
 dropout_3 (Dropout)         (None, 16)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 17        
                                                                 
Total params: 9,169
Trainable params: 9,169
Non-trainable params: 0
_________________________________________________________________


#### Generate output

In [21]:
best_model.save(SCRIPT_NAME)

2024-03-27 17:02:38.629705: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,16]
	 [[{{node inputs}}]]
2024-03-27 17:02:38.637405: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,16]
	 [[{{node inputs}}]]
2024-03-27 17:02:38.720839: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,16]
	 [[{{node inputs}}]]
2024-03-27 17:02:38

INFO:tensorflow:Assets written to: DL-05/assets


INFO:tensorflow:Assets written to: DL-05/assets


In [22]:
df_test = pd.read_csv('./test_enriched.csv', index_col='id')
df_test.fillna({'keyword': '', 'location': '', 'country': '', 'state': '', 'city': '', 'url_domains': ''}, inplace=True)
df_test.shape

(3263, 22)

In [23]:
test_embedding = None
with open('./test-text-embeddings.pkl', 'rb') as fin:
    test_embedding = pickle.load(fin)
len(test_embedding), len(test_embedding[0])

(3263, 384)

In [24]:
embedding_transformer.data = test_embedding
X_test = transformer.transform(df_test)
print('X_test shape', X_test.shape)

Y_test_predict = sigmoid(best_model(X_test))

df_example = pd.read_csv('./sample_submission.csv')
df_example['target'] = Y_test_predict

df_example.to_csv(f'./{SCRIPT_NAME}-submission.csv', index=False)

X_test shape (3263, 554)
