In [18]:
import pickle
import numpy as np
import pandas as pd
from sklearn import base, feature_extraction, ensemble, model_selection, pipeline, compose, preprocessing, metrics
from sklearn.experimental import enable_halving_search_cv
import tensorflow as tf
from embedding_transformer import Doc2VecTransformer
from scikeras.wrappers import KerasClassifier
import optuna
import pprint

SCRIPT_NAME='DL-07'

In [35]:
df_dataset = pd.read_csv('./train_enriched.csv', index_col='id')
df_train, df_validation = model_selection.train_test_split(df_dataset, test_size=0.1)
df_train.shape, df_validation.shape

((6851, 25), (762, 25))

In [36]:
df_train.fillna({'keyword': '', 'location': '', 'country': '', 'state': '', 'city': '', 'url_domains': '', 'clean_text': ''}, inplace=True)
df_train.head()

Unnamed: 0_level_0,keyword,positive_factor,location,country,state,city,missing_location,text,clean_text,text_content,...,urls_count,tokens_count,stop_words_factor,clean_tokens_factor,url_domains,url_redirects_count,hashtags_sentiment,token_sentiment,token_sentiment_2,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1416,casualties,0.8,USA,United States,,,0,Another day has passed and THANKFULLY Central ...,another day has passed and thankfully central ...,Another day has passed and THANKFULLY Central ...,...,1,19,0.263158,0.684211,facebook.com,2,0.0,4.9454,0.274744,1
5736,rescuers,0.914286,,,,,1,Fears over missing migrants in Med: Rescuers s...,fears over missing migrants in med rescuers se...,Fears over missing migrants in Med: Rescuers s...,...,1,20,0.35,0.6,abouthub.info,2,0.0,9.730616,0.54059,1
4968,meltdown,0.151515,,,,,1,Currently: Uncontrollable meltdown number 2,currently uncontrollable meltdown number,Currently: Uncontrollable meltdown number 2,...,0,5,0.0,1.0,,0,0.0,-1.117501,-0.279375,0
662,blaze,0.131579,,,,,1,looks like a year of writing and computers is ...,looks like a year of writing and computers is ...,looks like a year of writing and computers is ...,...,1,11,0.363636,0.545455,twitter.com,1,0.0,-1.728852,-0.172885,0
6805,tragedy,0.611111,India,India,,,0,Rly tragedy in MP: Some live to recount horror...,rly tragedy in mp some live to recount horror ...,Rly tragedy in MP: Some live to recount horror...,...,0,26,0.269231,0.730769,,0,0.0,7.879189,0.303046,1


In [37]:
categorical_features = [
    'country',
    'state',
]
numerical_features = [
    'text_length', 
    'positive_factor',
    'token_sentiment_2'
]

column_transformer = compose.ColumnTransformer(transformers=[
    ('one_hot', preprocessing.OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ('numerical', preprocessing.StandardScaler(), numerical_features)
], remainder='drop')

transformer = pipeline.Pipeline([
    ('columns', column_transformer)
])

transformer.fit(df_train)
X_train = transformer.transform(df_train)
print('X_train shape', X_train.shape)

Y_train = df_train['target']

print(f'X_train type={type(X_train)}, shape={X_train.shape}')
print(f'Y_train shape={Y_train.shape}')


X_train shape (6851, 169)
X_train type=<class 'numpy.ndarray'>, shape=(6851, 169)
Y_train shape=(6851,)


In [38]:
X_validation = transformer.transform(df_validation)
print('X_validation shape', X_validation.shape)

Y_validation = df_validation['target']
print('Y_validation shape', Y_validation.shape)

X_validation shape (762, 169)
Y_validation shape (762,)


In [39]:
INPUT_SIZE = X_train.shape[1]
OUTPUT_SIZE = 1
NN_SHAPE = [INPUT_SIZE, 32, 32, 16, 1]
BATCH_SIZE= X_train.shape[0]
MAX_EPOCHS = 40


def sigmoid(x):
    return tf.cast(tf.greater(tf.nn.sigmoid(x), .5), tf.int32)


def build_model(layer_dims, use_dropout=False, dropout_rate_1=0.3, dropout_rate_2=0.1,
                learning_rate=1e-3, 
                use_emma=False, emma_momentum=0.99, 
                use_regularizer=False, regularizer=0.01,
                initializer='glorot_normal',
                activation='relu'
                ):
    n_layers = len(layer_dims)
    layers = []

    for l in range(1, n_layers-1):
        layer_kws = {}

        if use_regularizer:
            layer_kws['kernel_regularizer'] = tf.keras.regularizers.l2(regularizer)
        if initializer:
            layer_kws['kernel_initializer'] = initializer
        
        hidden_layer= tf.keras.layers.Dense(layer_dims[l], input_shape=(layer_dims[l-1],), activation=activation, **layer_kws)
        
        layers.append(hidden_layer)
        
        if use_dropout:
            if l==1:
                rate = dropout_rate_1
            elif l==2 and n_layers > 3:
                rate = dropout_rate_2
            else:
                rate = 0.0
            if rate > .0:
                layers.append(tf.keras.layers.Dropout(rate=rate))
    layers.append(tf.keras.layers.Dense(layer_dims[n_layers-1], activation='linear'))
    model = tf.keras.Sequential(layers)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, use_ema=use_emma, ema_momentum=emma_momentum),                 
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics='accuracy')
    return model


In [40]:
def objective(trial):
    use_dropout = False # trial.suggest_categorical('use_dropout', [True, False])
    dropout_rate_1 = .0 # trial.suggest_float('dropout_rate_1', 0.1, 0.4) if use_dropout else .0
    dropout_rate_2 = .0 # trial.suggest_float('dropout_rate_2', 0.0, 0.2) if use_dropout else .0
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    batch_size = trial.suggest_categorical('batch_size', [BATCH_SIZE, BATCH_SIZE//2 + 1, BATCH_SIZE//4 + 1])
    use_emma = False # trial.suggest_categorical('use_emma', [True, False])
    emma_momentum =.0 # trial.suggest_float('emma_momentum', 0.5, 0.9, log=True) if use_emma else 0.999
    regularizer = trial.suggest_float('regularizer', 1e-5, 1e-2, log=True)
    initializer = trial.suggest_categorical('initializer', ['glorot_normal', 'he_normal'])
    activation = trial.suggest_categorical('activation', ['relu', 'gelu', 'leaky_relu'])

    model = build_model(layer_dims=NN_SHAPE, 
                        use_dropout=use_dropout, dropout_rate_1=dropout_rate_1, dropout_rate_2=dropout_rate_2,
                        initializer=initializer,
                        regularizer=regularizer, 
                        activation=activation,
                        learning_rate=learning_rate, use_emma=use_emma, emma_momentum=emma_momentum)
    
    early_stopping = tf.keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)

    # print(f'---- step {index+1} of {k}')
    # print(f'train size: {len(X_train_set)}, test size: {len(X_test_set)}')
    
    model.fit(X_train, Y_train, batch_size=batch_size, epochs=MAX_EPOCHS, 
            validation_data=(X_validation, Y_validation),
            callbacks=[early_stopping],
            verbose=0)

    Y_predict = sigmoid(model.predict(X_validation))

    f1_score = metrics.f1_score(Y_validation, Y_predict)
    # print(f"Validation F1: {f1_score}")

    print(f"Validation F1 score: {f1_score}")
    # print(f"Standard deviation of cross-validation score: {tf.math.reduce_std(cvscores)}")
    # print(model.summary())
    
    return f1_score

In [41]:
study_name=SCRIPT_NAME
storage=f"sqlite:///{SCRIPT_NAME}.optuna.db"

# recreate study for new NN architecture
try:
    optuna.delete_study(study_name=study_name, storage=storage)
except:
    pass


In [42]:
study = optuna.create_study(study_name=study_name, storage=storage,
                            direction='maximize', 
                            sampler=optuna.samplers.TPESampler(seed=42, consider_prior=True),
                            load_if_exists=True)
study.optimize(objective, n_trials=10, show_progress_bar=True)

# Print optimal hyperparameters and the corresponding score

trial = study.best_trial
print(f'-> Best score: {trial.value}')
print(f'-> Optimal hyperparameters: ')
pprint.pprint(trial.params)


[I 2024-04-08 22:12:07,851] A new study created in RDB with name: DL-07
  0%|          | 0/10 [00:00<?, ?it/s]



Best trial: 0. Best value: 0.856742:  10%|█         | 1/10 [00:01<00:11,  1.32s/it]

Validation F1 score: 0.8567415730337078
[I 2024-04-08 22:12:09,166] Trial 0 finished with value: 0.8567415730337078 and parameters: {'learning_rate': 0.00031489116479568613, 'batch_size': 6851, 'regularizer': 2.9380279387035334e-05, 'initializer': 'glorot_normal', 'activation': 'relu'}. Best is trial 0 with value: 0.8567415730337078.


Best trial: 0. Best value: 0.856742:  20%|██        | 2/10 [00:02<00:10,  1.31s/it]

Validation F1 score: 0.26048565121412803
[I 2024-04-08 22:12:10,464] Trial 1 finished with value: 0.26048565121412803 and parameters: {'learning_rate': 1.2087541473056957e-05, 'batch_size': 6851, 'regularizer': 3.511356313970405e-05, 'initializer': 'he_normal', 'activation': 'relu'}. Best is trial 0 with value: 0.8567415730337078.


Best trial: 2. Best value: 0.909361:  30%|███       | 3/10 [00:03<00:08,  1.20s/it]

Validation F1 score: 0.9093610698365527
[I 2024-04-08 22:12:11,541] Trial 2 finished with value: 0.9093610698365527 and parameters: {'learning_rate': 0.0028016351587162596, 'batch_size': 1713, 'regularizer': 0.00023345864076016249, 'initializer': 'glorot_normal', 'activation': 'gelu'}. Best is trial 2 with value: 0.9093610698365527.


Best trial: 3. Best value: 0.913947:  40%|████      | 4/10 [00:04<00:07,  1.20s/it]

Validation F1 score: 0.913946587537092
[I 2024-04-08 22:12:12,751] Trial 3 finished with value: 0.913946587537092 and parameters: {'learning_rate': 0.0026926469100861782, 'batch_size': 1713, 'regularizer': 0.00788671412999049, 'initializer': 'glorot_normal', 'activation': 'gelu'}. Best is trial 3 with value: 0.913946587537092.


Best trial: 3. Best value: 0.913947:  50%|█████     | 5/10 [00:06<00:06,  1.36s/it]

Validation F1 score: 0.49593495934959353
[I 2024-04-08 22:12:14,382] Trial 4 finished with value: 0.49593495934959353 and parameters: {'learning_rate': 3.077180271250682e-05, 'batch_size': 1713, 'regularizer': 5.9750279999602906e-05, 'initializer': 'glorot_normal', 'activation': 'gelu'}. Best is trial 3 with value: 0.913946587537092.


Best trial: 3. Best value: 0.913947:  60%|██████    | 6/10 [00:07<00:04,  1.15s/it]

Validation F1 score: 0.9112081513828238
[I 2024-04-08 22:12:15,123] Trial 5 finished with value: 0.9112081513828238 and parameters: {'learning_rate': 0.07556810141274425, 'batch_size': 3426, 'regularizer': 0.0006218704727769079, 'initializer': 'glorot_normal', 'activation': 'leaky_relu'}. Best is trial 3 with value: 0.913946587537092.


Best trial: 3. Best value: 0.913947:  70%|███████   | 7/10 [00:08<00:03,  1.22s/it]

Validation F1 score: 0.829971181556196
[I 2024-04-08 22:12:16,497] Trial 6 finished with value: 0.829971181556196 and parameters: {'learning_rate': 0.00035868164986275477, 'batch_size': 3426, 'regularizer': 6.963114377829287e-05, 'initializer': 'glorot_normal', 'activation': 'leaky_relu'}. Best is trial 3 with value: 0.913946587537092.


Best trial: 7. Best value: 0.917031:  80%|████████  | 8/10 [00:09<00:02,  1.11s/it]

Validation F1 score: 0.9170305676855894
[I 2024-04-08 22:12:17,353] Trial 7 finished with value: 0.9170305676855894 and parameters: {'learning_rate': 0.012273800987852962, 'batch_size': 1713, 'regularizer': 0.001319994226153501, 'initializer': 'he_normal', 'activation': 'gelu'}. Best is trial 7 with value: 0.9170305676855894.


Best trial: 7. Best value: 0.917031:  90%|█████████ | 9/10 [00:10<00:01,  1.20s/it]

Validation F1 score: 0.9112426035502958
[I 2024-04-08 22:12:18,767] Trial 8 finished with value: 0.9112426035502958 and parameters: {'learning_rate': 0.028340904295147733, 'batch_size': 6851, 'regularizer': 8.569331925053983e-05, 'initializer': 'he_normal', 'activation': 'gelu'}. Best is trial 7 with value: 0.9170305676855894.


Best trial: 7. Best value: 0.917031: 100%|██████████| 10/10 [00:13<00:00,  1.35s/it]

Validation F1 score: 0.7011494252873564
[I 2024-04-08 22:12:21,385] Trial 9 finished with value: 0.7011494252873564 and parameters: {'learning_rate': 3.0086868214458443e-05, 'batch_size': 3426, 'regularizer': 0.002055424552015075, 'initializer': 'he_normal', 'activation': 'relu'}. Best is trial 7 with value: 0.9170305676855894.
-> Best score: 0.9170305676855894
-> Optimal hyperparameters: 
{'activation': 'gelu',
 'batch_size': 1713,
 'initializer': 'he_normal',
 'learning_rate': 0.012273800987852962,
 'regularizer': 0.001319994226153501}





#### Optimal model parameters

In [43]:
print(f'-> Best score: {trial.value}')
pprint.pprint(trial.params)

-> Best score: 0.9170305676855894
{'activation': 'gelu',
 'batch_size': 1713,
 'initializer': 'he_normal',
 'learning_rate': 0.012273800987852962,
 'regularizer': 0.001319994226153501}


#### Train model with optimal parameters

In [44]:
def train_best_model(best_params):
    batch_size = best_params.pop('batch_size', BATCH_SIZE)
    early_stopping = tf.keras.callbacks.EarlyStopping(patience=4, restore_best_weights=True)
    best_model = build_model(layer_dims=NN_SHAPE, **best_params)
    best_model.fit(X_train, Y_train, batch_size=batch_size, epochs=40, validation_split=0.2,
                callbacks=[early_stopping], verbose=3)
    Y_predict = sigmoid(best_model(X_train))
    f1_score = metrics.f1_score(Y_train, Y_predict)
    print(f'Best model F1={f1_score:.3f}')
    return best_model

best_model = train_best_model(trial.params)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Best model F1=0.918


In [45]:
best_model.summary()

Model: "sequential_33"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_132 (Dense)           (None, 32)                5440      
                                                                 
 dense_133 (Dense)           (None, 32)                1056      
                                                                 
 dense_134 (Dense)           (None, 16)                528       
                                                                 
 dense_135 (Dense)           (None, 1)                 17        
                                                                 
Total params: 7,041
Trainable params: 7,041
Non-trainable params: 0
_________________________________________________________________


In [46]:
best_model.save(SCRIPT_NAME)



INFO:tensorflow:Assets written to: DL-07/assets


INFO:tensorflow:Assets written to: DL-07/assets


Score on validation dataset

In [47]:
Y_validation_predict = sigmoid(best_model.predict(X_validation))
f1_score_test = metrics.f1_score(Y_validation, Y_validation_predict)
print(f'F1 score: {f1_score_test}')

F1 score: 0.9069767441860465


### Generate submission file

In [48]:
df_test = pd.read_csv('./test_enriched.csv', index_col='id')
df_test.fillna({'keyword': '', 'location': '', 'country': '', 'state': '', 'city': '', 'url_domains': ''}, inplace=True)
df_test.shape

(3263, 24)

In [49]:
X_test = transformer.transform(df_test)
print('X_test shape', X_test.shape)

Y_test_predict = sigmoid(best_model(X_test))

df_example = pd.read_csv('./sample_submission.csv')
df_example['target'] = Y_test_predict

df_example.to_csv(f'./{SCRIPT_NAME}-submission.csv', index=False)

X_test shape (3263, 169)
