# Disaster tweets DL model

In [1]:
import numpy as np
import pandas as pd
from sklearn import feature_extraction, ensemble, model_selection, pipeline, compose, preprocessing, metrics
from sklearn.experimental import enable_halving_search_cv
import tensorflow as tf
from embedding_transformer import Doc2VecTransformer

SCRIPT_NAME='DL-02'

2024-03-18 17:59:02.759892: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-18 17:59:02.779487: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df_train = pd.read_csv('./train_enriched.csv', index_col='id')
df_train.fillna({'keyword': '', 'location': '', 'country': '', 'state': '', 'city': ''}, inplace=True)
df_train.head()

Unnamed: 0_level_0,keyword,positive_factor,location,country,state,city,missing_location,text,clean_text,text_length,upper_text_factor,tags_count,punct_factor,ann_count,urls_count,tokens_count,stop_words_factor,clean_tokens_factor,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,,0.5,,,,,1,Our Deeds are the Reason of this #earthquake M...,deed reason earthquake may allah forgive u,57,0.175439,1,0.017544,0,0,13,0.384615,0.615385,1
1,,0.5,,,,,1,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask canada,32,0.15625,0,0.03125,0,0,7,0.0,1.0,1
2,,0.5,,,,,1,All residents asked to 'shelter in place' are ...,resident asked shelter place notified officer ...,112,0.017857,0,0.026786,0,0,22,0.409091,0.590909,1
3,,0.5,,,,,1,"13,000 people receive #wildfires evacuation or...",people receive wildfire evacuation order calif...,57,0.017544,1,0.035088,0,0,9,0.111111,0.888889,1
4,,0.5,,,,,1,Just got sent this photo from Ruby #Alaska as ...,got sent photo ruby alaska smoke wildfire pour...,72,0.041667,2,0.027778,0,0,17,0.352941,0.647059,1


In [18]:
categorical_features = [
    'country',
    'state',
]
numerical_features = [
    'text_length', 
    'urls_count',
    'stop_words_factor',
    'clean_tokens_factor'
]

# vc_text = feature_extraction.text.TfidfVectorizer(max_features=2000)
doc2vec = Doc2VecTransformer(vector_size=2000)

transformer = compose.ColumnTransformer(transformers=[
    # ('text_vector', vc_text, 'clean_text'),
    ('doc2vec', doc2vec, 'clean_text'),
    ('one_hot', preprocessing.OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ('numerical', preprocessing.StandardScaler(), numerical_features)
], remainder='drop')

X_train = transformer.fit_transform(df_train)
print('X_train shape', X_train.shape)

Y_train = df_train['target']

X_train.shape, Y_train.shape


X_train shape (7613, 2177)


((7613, 2177), (7613,))

In [19]:
INPUT_SIZE = X_train.shape[1]
OUTPUT_SIZE = 1
HIDDEN_LAYER_SIZE = 4096
BATCH_SIZE= 64
MAX_EPOCHS = 200

def baseline_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(2*HIDDEN_LAYER_SIZE, input_shape=(INPUT_SIZE,), activation='relu'),
        tf.keras.layers.Dense(2*HIDDEN_LAYER_SIZE, activation='relu'),
        tf.keras.layers.Dense(HIDDEN_LAYER_SIZE, activation='relu'),
        tf.keras.layers.Dense(OUTPUT_SIZE, activation='linear')
    ])
    model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics='accuracy')
    return model

In [20]:
model = baseline_model()
early_stopping = tf.keras.callbacks.EarlyStopping(patience=4, restore_best_weights=True)
history = model.fit(X_train, Y_train, batch_size=BATCH_SIZE, epochs=MAX_EPOCHS, validation_split=0.2, 
          callbacks=[early_stopping],
          verbose=2)

Y_predict = tf.cast(tf.greater(tf.nn.sigmoid(model(X_train)), .5), tf.int32)

df_wrong_predictions = pd.DataFrame({'target': tf.squeeze(Y_train), 'predict': tf.squeeze(Y_predict), 'keyword': df_train['keyword'], 'location': df_train['location'], 'text': df_train['text']}).query('target != predict')

f1_score = metrics.f1_score(Y_train, Y_predict)
accuracy = metrics.accuracy_score(Y_train, Y_predict)
precision = metrics.precision_score(Y_train, Y_predict)

print(f'F1={f1_score}, accuracy={accuracy}, precision={precision}')


Epoch 1/200
96/96 - 100s - loss: 0.5626 - accuracy: 0.7374 - val_loss: 0.4741 - val_accuracy: 0.7649 - 100s/epoch - 1s/step
Epoch 2/200
96/96 - 88s - loss: 0.4814 - accuracy: 0.7640 - val_loss: 0.5274 - val_accuracy: 0.7433 - 88s/epoch - 921ms/step
Epoch 3/200
96/96 - 86s - loss: 0.4668 - accuracy: 0.7698 - val_loss: 0.4827 - val_accuracy: 0.7840 - 86s/epoch - 892ms/step
Epoch 4/200
96/96 - 91s - loss: 0.4461 - accuracy: 0.7834 - val_loss: 0.4829 - val_accuracy: 0.7636 - 91s/epoch - 945ms/step
Epoch 5/200
96/96 - 93s - loss: 0.4375 - accuracy: 0.7875 - val_loss: 0.4903 - val_accuracy: 0.7754 - 93s/epoch - 964ms/step
F1=0.7238222074246713, accuracy=0.7820832786023907, precision=0.7945906432748538


In [32]:
with pd.option_context('display.max_colwidth', 200):
    print(df_wrong_predictions.query('target==1').sample(n=10))

      target  predict            keyword                      location  \
id                                                                       
5122       1        0  nuclear%20reactor                                 
4318       1        0           hellfire                                 
1279       1        0             burned                   Oakland, CA   
1624       1        0           collapse                United Kingdom   
6844       1        0             trauma                                 
6030       1        0            seismic                        ??????   
2220       1        0             deluge  Enniscrone & Aughris, Sligo    
317        1        0         armageddon     California, United States   
2009       1        0             damage  261 5th Avenue New York, NY    
3519       1        0         eyewitness                     Stay Fly?   

                                                                                                               

#### Generate output

In [86]:
model.save(SCRIPT_NAME)



INFO:tensorflow:Assets written to: C-DL-1/assets


INFO:tensorflow:Assets written to: C-DL-1/assets


In [87]:
X_test = transformer.transform(pd.read_csv('./test_enriched.csv', index_col='id')).todense()
print('X_test shape', X_test.shape)

Y_test_predict = tf.cast(tf.greater(tf.nn.sigmoid(model(X_test)), .5), tf.int32)

df_example = pd.read_csv('./sample_submission.csv')
df_example['target'] = Y_test_predict

df_example.to_csv(f'./{SCRIPT_NAME}-submission.csv', index=False)

X_test shape (3263, 2172)
