# Disaster tweets DL model

In [1]:
import numpy as np
import pandas as pd
from sklearn import feature_extraction, ensemble, model_selection, pipeline, compose, preprocessing, metrics
from sklearn.experimental import enable_halving_search_cv
import tensorflow as tf
from embedding_transformer import Doc2VecTransformer

SCRIPT_NAME='DL-04'

2024-03-21 20:07:44.952996: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-21 20:07:44.972782: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
df_train = pd.read_csv('./train_enriched.csv', index_col='id')
df_train.fillna({'keyword': '', 'location': '', 'country': '', 'state': '', 'city': '', 'url_domains': '', 'clean_text': ''}, inplace=True)
df_train.head()

Unnamed: 0_level_0,keyword,positive_factor,location,country,state,city,missing_location,text,clean_text,text_length,...,punct_factor,ann_count,urls_count,tokens_count,stop_words_factor,clean_tokens_factor,url_domains,url_redirects_count,hashtags_sentiment,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,0.5,,,,,1,Our Deeds are the Reason of this #earthquake M...,deed reason earthquake may allah forgive u,57,...,0.017544,0,0,13,0.384615,0.615385,,0,1.0,1
1,,0.5,,,,,1,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask canada,32,...,0.03125,0,0,7,0.0,1.0,,0,0.0,1
2,,0.5,,,,,1,All residents asked to 'shelter in place' are ...,resident asked shelter place notified officer ...,112,...,0.026786,0,0,22,0.409091,0.590909,,0,0.0,1
3,,0.5,,,,,1,"13,000 people receive #wildfires evacuation or...",people receive wildfire evacuation order calif...,57,...,0.035088,0,0,9,0.111111,0.888889,,0,1.0,1
4,,0.5,,,,,1,Just got sent this photo from Ruby #Alaska as ...,got sent photo ruby alaska smoke wildfire pour...,72,...,0.027778,0,0,17,0.352941,0.647059,,0,0.714286,1


In [47]:
categorical_features = [
    'country',
    'state',
]
numerical_features = [
    'text_length', 
    # 'ann_count',
    # 'url_redirects_count',
    # 'stop_words_factor',
    'positive_factor',
    'hashtags_sentiment'
]

# text_vec = feature_extraction.text.TfidfVectorizer(max_features=2000)
# text_vec = Doc2VecTransformer(vector_size=2000)
text_vec = feature_extraction.text.CountVectorizer(max_features=1000)
# domains_vec = feature_extraction.text.TfidfVectorizer(max_features=100)
domains_vec = feature_extraction.text.CountVectorizer(max_features=100)

transformer = compose.ColumnTransformer(transformers=[
    ('text_vec', text_vec, 'clean_text'),
    ('domains_vec', domains_vec, 'url_domains'),
    ('one_hot', preprocessing.OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ('numerical', preprocessing.StandardScaler(), numerical_features)
], remainder='drop')

X_train = transformer.fit_transform(df_train).todense()
print('X_train shape', X_train.shape)

Y_train = df_train['target']

X_train.shape, Y_train.shape


X_train shape (7613, 1270)


((7613, 1270), (7613,))

In [48]:
class F1Score(tf.keras.metrics.Metric):
    def __init__(self, name='f1_score', **kwargs):
        super(F1Score, self).__init__(name=name, **kwargs)
        self.true_positives = self.add_weight(name='tp', initializer='zeros')
        self.false_positives = self.add_weight(name='fp', initializer='zeros')
        self.false_negatives = self.add_weight(name='fn', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_pred = tf.round(y_pred)
        y_true = tf.cast(y_true, 'float32')  # Convert y_true to float32
        y_pred = tf.cast(y_pred, 'float32')  # Ensure y_pred is also float32

        true_positives = tf.reduce_sum(y_true * y_pred)
        false_positives = tf.reduce_sum((1 - y_true) * y_pred)
        false_negatives = tf.reduce_sum(y_true * (1 - y_pred))

        # Update state
        self.true_positives.assign_add(true_positives)
        self.false_positives.assign_add(false_positives)
        self.false_negatives.assign_add(false_negatives)

    def result(self):
        precision = self.true_positives / (self.true_positives + self.false_positives + tf.keras.backend.epsilon())
        recall = self.true_positives / (self.true_positives + self.false_negatives + tf.keras.backend.epsilon())
        f1 = 2 * (precision * recall) / (precision + recall + tf.keras.backend.epsilon())
        return f1

    def reset_states(self):
        self.true_positives.assign(0)
        self.false_positives.assign(0)
        self.false_negatives.assign(0)

In [51]:
INPUT_SIZE = X_train.shape[1]
OUTPUT_SIZE = 1
HIDDEN_LAYER_SIZE = 128
BATCH_SIZE= 2 # int(0.1*X_train.shape[0])
MAX_EPOCHS = 20

def baseline_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(HIDDEN_LAYER_SIZE, input_shape=(INPUT_SIZE,), activation='relu'),
        tf.keras.layers.Dense(int(0.5*HIDDEN_LAYER_SIZE), activation='relu'),
        tf.keras.layers.Dense(OUTPUT_SIZE, activation='linear')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4, use_ema=True, ema_momentum=0.98),                 
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics='accuracy')
    return model

In [52]:
model = baseline_model()
print(model.summary())

early_stopping = tf.keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)
history = model.fit(X_train, Y_train, batch_size=BATCH_SIZE, epochs=MAX_EPOCHS, validation_split=0.2, 
          callbacks=[early_stopping],
          workers=2,
          verbose=2)

Y_predict = tf.cast(tf.greater(tf.nn.sigmoid(model(X_train)), .5), tf.int32)

df_wrong_predictions = pd.DataFrame({'target': tf.squeeze(Y_train), 'predict': tf.squeeze(Y_predict), 'keyword': df_train['keyword'], 'location': df_train['location'], 'text': df_train['text']}).query('target != predict')

f1_score = metrics.f1_score(Y_train, Y_predict)
accuracy = metrics.accuracy_score(Y_train, Y_predict)
precision = metrics.precision_score(Y_train, Y_predict)
recall = metrics.recall_score(Y_train, Y_predict)

print(f'F1={f1_score}, accuracy={accuracy}, precision={precision}, recall={recall}')


Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_48 (Dense)            (None, 128)               162688    
                                                                 
 dense_49 (Dense)            (None, 64)                8256      
                                                                 
 dense_50 (Dense)            (None, 1)                 65        
                                                                 
Total params: 171,009
Trainable params: 171,009
Non-trainable params: 0
_________________________________________________________________


None
Epoch 1/20
3045/3045 - 4s - loss: 0.4742 - accuracy: 0.7568 - val_loss: 0.3893 - val_accuracy: 0.8109 - 4s/epoch - 1ms/step
Epoch 2/20
3045/3045 - 6s - loss: 0.3594 - accuracy: 0.8407 - val_loss: 0.3738 - val_accuracy: 0.8306 - 6s/epoch - 2ms/step
Epoch 3/20
3045/3045 - 9s - loss: 0.3156 - accuracy: 0.8608 - val_loss: 0.3784 - val_accuracy: 0.8365 - 9s/epoch - 3ms/step
Epoch 4/20
3045/3045 - 7s - loss: 0.2793 - accuracy: 0.8803 - val_loss: 0.3903 - val_accuracy: 0.8372 - 7s/epoch - 2ms/step
F1=0.8754931355531008, accuracy=0.8963614869302509, precision=0.9047619047619048, recall=0.8480586976459799


In [9]:
history.history

{'loss': [0.47814974188804626,
  0.3706640899181366,
  0.3284318745136261,
  0.28984159231185913,
  0.2545904517173767,
  0.22112438082695007],
 'accuracy': [0.7435139417648315,
  0.832348108291626,
  0.8538587689399719,
  0.8750410676002502,
  0.8896551728248596,
  0.9100164175033569],
 'val_loss': [0.40050220489501953,
  0.38775569200515747,
  0.38331034779548645,
  0.3898771107196808,
  0.4109799563884735,
  0.43590298295021057],
 'val_accuracy': [0.8220617175102234,
  0.8312541246414185,
  0.829940915107727,
  0.8325672745704651,
  0.8253447413444519,
  0.8240315318107605]}

In [10]:
with pd.option_context('display.max_colwidth', 200):
    print(df_wrong_predictions.query('target==1').sample(n=10))

      target  predict      keyword         location  \
id                                                    
4698       1        0    landslide                    
7231       1        0      weapons                    
7242       1        0      weapons    Hawthorne, NE   
6829       1        0      trapped     876 Jamrock.   
2905       1        0        drown         Portugal   
2085       1        0         dead       Spare 'Oom   
2033       1        0       danger  Lahar & Gwalior   
2554       1        0      destroy  Jerseyville, IL   
300        1        0   apocalypse                    
229        1        0  annihilated                    

                                                                                                                                                 text  
id                                                                                                                                                     
4698                               

#### Generate output

In [11]:
model.save(SCRIPT_NAME)



INFO:tensorflow:Assets written to: DL-04/assets


INFO:tensorflow:Assets written to: DL-04/assets


In [12]:
df_test = pd.read_csv('./test_enriched.csv', index_col='id')
df_test.fillna({'keyword': '', 'location': '', 'country': '', 'state': '', 'city': '', 'url_domains': ''}, inplace=True)
X_test = transformer.transform(df_test).todense()
print('X_test shape', X_test.shape)

Y_test_predict = tf.cast(tf.greater(tf.nn.sigmoid(model(X_test)), .5), tf.int32)

df_example = pd.read_csv('./sample_submission.csv')
df_example['target'] = Y_test_predict

df_example.to_csv(f'./{SCRIPT_NAME}-submission.csv', index=False)

X_test shape (3263, 1273)
