<a href="https://www.kaggle.com/code/dendyandra/disaster-tweet-classification-with-cnn?scriptVersionId=137429037" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Disaster Tweet Classification

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load Dataset

In [None]:
# Read the train and test CSV files from the Kaggle input directory.
# For a local run, point these paths at 'train.csv' and 'test.csv' instead.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
    )
)

# EDA

## General Info About Dataset

In [None]:
# Report the (rows, columns) shape of the train frame, then the test frame.
for frame in (train_df, test_df):
    print(frame.shape)

In [None]:
# Preview the first rows of the training frame (rendered as the cell output).
train_df.head()

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() appends a stray "None" line.
for label, frame in (('Train', train_df), ('Test', test_df)):
    print(f'----- {label} Dataset Info -----')
    frame.info()

## Data Distribution

In [None]:
f, ax = plt.subplots(2, 2, figsize=(16, 14))

# Order the class counts by label (0, then 1) instead of by frequency, so the
# hard-coded names and colours below are tied to the class itself and do not
# silently swap if the class balance changes.
target_counts = train_df['target'].value_counts().sort_index()
target_colors = ['#7797ec', '#ec9777']

# target distribution in train df with pie chart
target_counts.plot(
    kind='pie',
    ax=ax[0, 0],
    labels=['Not Disaster', 'Disaster'],
    autopct='%1.1f%%',
    colors=target_colors,
)
ax[0, 0].set_title('Percentage of Target Data')

# target distribution in train df with bar chart, annotated with the counts
target_counts.plot(kind='bar', ax=ax[0, 1], color=target_colors, rot=0)
ax[0, 1].bar_label(ax[0, 1].containers[0], label_type='edge')
ax[0, 1].set_title('Amount of Target Data')
ax[0, 1].set_ylabel('num of data')
ax[0, 1].set_xlabel('target')

# Ten most frequent keywords / locations as horizontal bars; the values are
# sorted ascending so the largest bar ends up at the top of the chart.
for column, axis, bar_color in (
    ('keyword', ax[1, 0], '#6357d3'),
    ('location', ax[1, 1], '#dbe246'),
):
    top_values = train_df[column].value_counts().head(10).sort_values(ascending=True)
    top_values.plot(kind='barh', ax=axis, color=bar_color)
    # annotate bar label
    axis.bar_label(axis.containers[0], label_type='edge')
    axis.set_title(f'Top 10 {column.capitalize()}')
    axis.set_ylabel(column)
    axis.set_xlabel('num of data')

plt.suptitle('Data Distribution in Train Dataset')
plt.tight_layout()
plt.show()

## Wordcloud Preview

In [None]:
from wordcloud import WordCloud

In [None]:
def generate_wordcloud(text_list):
    """Build a word cloud from an iterable of texts.

    Every value is converted to ``str``, split on whitespace and lower-cased.
    The tokens of all values are combined into one space-separated corpus,
    which is passed to ``WordCloud.generate``.

    Args:
        text_list: Iterable of values to visualise (e.g. a column of tweets).

    Returns:
        The generated ``WordCloud`` object (800x800, white background,
        at most 50 words, minimum font size 10).
    """
    # Assemble the corpus with str.join rather than repeated ``+=`` so the
    # cost stays linear in the total text size. Each text is followed by a
    # single space so that consecutive texts never run into each other.
    corpus = "".join(
        " ".join(token.lower() for token in str(val).split()) + " "
        for val in text_list
    )

    return WordCloud(
        max_words=50,
        width=800,
        height=800,
        background_color='white',
        min_font_size=10,
    ).generate(corpus)

In [None]:
f, ax = plt.subplots(1, 3, figsize=(16, 8))
f.suptitle('Wordcloud Visualization', y=0.85, fontsize=14)

# One panel per subset of the raw text: every tweet, then the rows with
# target == 0, then the rows with target == 1.
panels = (
    ('General Wordcloud', train_df.text),
    ('Not Disaster Wordcloud', train_df[train_df.target == 0].text),
    ('Disaster Wordcloud', train_df[train_df.target == 1].text),
)
for axis, (panel_title, texts) in zip(ax, panels):
    axis.set_title(panel_title)
    wordcloud = generate_wordcloud(texts)
    axis.imshow(wordcloud)
    axis.axis('off')

plt.tight_layout(pad=2)
plt.show()

# Preprocessing Data

## Handling Missing Data

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Fill missing keyword/location values with the most frequent value of the
# corresponding *training* column. A fresh imputer is fitted per column on
# train_df and only applied (transform) to test_df, so both splits receive the
# same fill value and no statistic is estimated from the test data.
for column in ('keyword', 'location'):
    imputer = SimpleImputer(strategy='most_frequent')
    train_df[column] = imputer.fit_transform(train_df[column].values.reshape(-1, 1)).flatten()
    test_df[column] = imputer.transform(test_df[column].values.reshape(-1, 1)).flatten()

# Print the remaining missing-value count per column for both splits.
print(train_df.isna().sum())
print(test_df.isna().sum())


## Cleaning Text

In [None]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

In [None]:
def clean_text(text):
    """Normalise a raw tweet into lower-case alphanumeric words.

    The text is lower-cased, then three substitutions run in order: URLs
    (``http`` up to the next whitespace) become a space, every character
    other than ``a-z``, ``0-9`` or a space becomes a space, and runs of
    spaces collapse into one. Leading/trailing whitespace is stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r'http\S+', ' '),
        (r'[^a-z0-9 ]', ' '),
        (' +', ' '),
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def stopword_removal(text, stopword_list):
    """Remove stop words from a whitespace-separated text.

    Args:
        text: The string to filter; it is split on whitespace.
        stopword_list: Iterable of words to drop (exact, case-sensitive match).

    Returns:
        The remaining words joined by single spaces; an empty string when no
        word is left.
    """
    # Materialise the stop words as a set once: lookups become constant-time
    # and a one-shot iterable (e.g. a generator) is consumed a single time
    # instead of being exhausted after the first word.
    stop_set = set(stopword_list)

    # Tokens produced by split() never contain whitespace, so the joined
    # result needs no additional strip().
    return ' '.join(word for word in text.split() if word not in stop_set)


In [None]:
# train_df.text.iloc[7459]

In [None]:
# tmp = clean_text(train_df.text.iloc[7459])
# stopword_removal(tmp, stopword_list)

In [None]:
# Extra tokens to drop from the cleaned tweets.
# NOTE(review): presumably tweet artefacts (e.g. 'rt', 'via') and fragments
# left behind once punctuation is stripped (e.g. 'amp', 'm', 's') - confirm.
custom_stopword = ['w', 'rt', 'amp', 'u', 'via', 'im', 'm', 's', 'd']

In [None]:
# Stop words removed from the cleaned text: a copy of the custom tokens only.
stopword_list = list(custom_stopword)
# NLTK's English stop words are currently left disabled:
# stopword_list += list(stopwords.words('english'))

In [None]:
# Add a 'clean_text' column to both splits: normalise the raw tweet first,
# then drop the stop words from the normalised text.
for frame in (train_df, test_df):
    normalised = frame.text.apply(clean_text)
    frame['clean_text'] = normalised.apply(
        lambda words: stopword_removal(words, stopword_list)
    )

In [None]:
f, ax = plt.subplots(1, 2, figsize=(12, 10))
f.suptitle('Wordcloud Visualization After Preprocessing Text', y=0.84, fontsize=14)

# One panel per class, built from the cleaned text: target == 0 on the left,
# target == 1 on the right.
panels = (
    ('Not Disaster Wordcloud', 0),
    ('Disaster Wordcloud', 1),
)
for axis, (panel_title, target_value) in zip(ax, panels):
    axis.set_title(panel_title)
    wordcloud = generate_wordcloud(train_df[train_df.target == target_value].clean_text)
    axis.imshow(wordcloud)
    axis.axis('off')

plt.tight_layout(pad=2)
plt.show()

## Split Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features are the cleaned tweets; labels are the target column.
X, y = train_df.clean_text, train_df.target

# Hold out 20% of the rows for validation, stratified on the target and with
# a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=.2,
    stratify=y,
    random_state=1,
)

# Cleaned text of the test split, encoded later for the submission.
X_test = test_df.clean_text

In [None]:
# Class counts of the training labels, then of the validation labels.
for labels in (y_train, y_val):
    print(labels.value_counts())

## BERT Sentence Transformer

In [None]:
%pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer

In [None]:
# Load the pre-trained sentence-transformers model and encode the training
# and validation texts into embedding arrays.
transformer = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
X_train_e, X_val_e = (
    transformer.encode(texts.tolist()) for texts in (X_train, X_val)
)

In [None]:
# Shape of the encoded training data (rendered as the cell output).
X_train_e.shape

# Model Training

## Build Machine Learning Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
# Baseline classifiers trained on the sentence embeddings.
lr_model = LogisticRegression()
lr_model.fit(X_train_e, y_train)

svm_model = SVC()
svm_model.fit(X_train_e, y_train)

# A random forest is randomised; pin the seed (same value as the one used
# for the train/validation split) so its score is reproducible between runs.
rf_model = RandomForestClassifier(random_state=1)
rf_model.fit(X_train_e, y_train)

### Evaluate Models

In [None]:
from sklearn.metrics import f1_score

# Predict the validation embeddings with each fitted baseline ...
lr_pred, svm_pred, rf_pred = (
    fitted.predict(X_val_e) for fitted in (lr_model, svm_model, rf_model)
)

# ... and report the F1 score of each against the validation labels.
for display_name, predictions in (
    ('Logistic Regression', lr_pred),
    ('Support Vector Machine', svm_pred),
    ('Random Forest', rf_pred),
):
    print(f"{display_name} F1 Score: {f1_score(y_val, predictions):.3f}")

## Build Neural Network Model

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, MaxPooling1D, Conv1D, LSTM, Embedding, Bidirectional, Flatten
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.metrics import AUC, Precision, Recall
from keras import backend as K

In [None]:
# EMBEDDING_DIM = X_train_e.shape[1] #gensim_weight_matrix.shape[1]
# class_num = 1
# # num_words = tokenizer.num_words #len(tokenizer.word_index)

# print(EMBEDDING_DIM)
# # print(num_words)
# # print(len(tokenizer.word_index))

In [None]:
def recall_m(y_true, y_pred):
    """Recall computed with Keras backend ops.

    Both the element-wise product ``y_true * y_pred`` and ``y_true`` are
    clipped to [0, 1] and rounded before being summed, giving the number of
    true positives and of actual positives respectively.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predictions.

    Returns:
        true positives / (actual positives + epsilon); the epsilon guards
        against division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    positive_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(positive_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision computed with Keras backend ops.

    Both the element-wise product ``y_true * y_pred`` and ``y_pred`` are
    clipped to [0, 1] and rounded before being summed, giving the number of
    true positives and of predicted positives respectively.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predictions.

    Returns:
        true positives / (predicted positives + epsilon); the epsilon guards
        against division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score built from ``precision_m`` and ``recall_m``.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predictions.

    Returns:
        2 * (precision * recall) / (precision + recall + epsilon); the
        epsilon guards against division by zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
print(X_train_e.shape)

class_num = 1
# Derive the embedding width from the encoded data instead of hard-coding it,
# so the network input stays in sync with whatever produced X_train_e.
EMBEDDING_DIM = X_train_e.shape[1]
input_length = EMBEDDING_DIM

print(EMBEDDING_DIM)
print(input_length)


### CNN Model

In [None]:
# The inputs are already fixed-size sentence embeddings, so there is no
# Embedding layer: Reshape only appends a channel axis, turning each
# (EMBEDDING_DIM,) vector into the (EMBEDDING_DIM, 1) input Conv1D receives.
model = Sequential([
    tf.keras.layers.Reshape((EMBEDDING_DIM, 1), input_shape=(EMBEDDING_DIM,)),
    Conv1D(
        filters=64,
        kernel_size=3,
        strides=1,
        padding='same',
        activation='relu',
    ),
    MaxPooling1D(pool_size=2),
    Flatten(),
    # L2-regularised dense stack with dropout after the first two layers.
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.4),
    Dense(256, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.4),
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    # Single sigmoid unit: the output is a probability, hence from_logits=False.
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Early stopping, learning-rate reduction and best-model checkpointing
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Stop training once val_loss has not improved for 3 epochs.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=3,
    verbose=1,
)
# Multiply the learning rate by 0.1 (never below 1e-5) when val_loss plateaus.
# NOTE(review): this uses the same monitor and patience as the early stopping
# above, so the reduction will usually fire on the epoch training stops -
# confirm whether a shorter patience was intended here.
rlr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=3,
    min_lr=0.00001,
    verbose=1,
)
# Keep only the weights of the epoch with the highest val_accuracy.
mc = ModelCheckpoint(
    './model/model.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

In [None]:
# Fit the CNN on the training embeddings under the '/GPU:0' device scope,
# using the held-out split as validation data.
with tf.device('/GPU:0'):
    history_embedding = model.fit(
        X_train_e,
        y_train,
        validation_data=(X_val_e, y_val),
        epochs=20,
        batch_size=8,
        callbacks=[es, mc, rlr],
    )

## Model Evaluation

In [None]:
# Training (blue) vs. validation (red) loss recorded in the fit history.
for history_key, line_color, line_label in (
    ('loss', 'b', 'train loss'),
    ('val_loss', 'r', 'validation loss'),
):
    plt.plot(history_embedding.history[history_key], c=line_color, label=line_label)
plt.legend(loc='upper right')
plt.show()

In [None]:
# Training (blue) vs. validation (red) accuracy recorded in the fit history.
for history_key, line_color, line_label in (
    ('accuracy', 'b', 'train accuracy'),
    ('val_accuracy', 'r', 'validation accuracy'),
):
    plt.plot(history_embedding.history[history_key], c=line_color, label=line_label)
plt.legend(loc='lower right')
plt.show()

In [None]:
def load_model(modelname):
    """Load a pickled object from the local ``./model`` directory.

    Args:
        modelname: File name, relative to ``./model/``, of the pickle to load.

    Returns:
        The object stored in the file.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # Imported locally so the function does not depend on a notebook-level
    # ``import pickle`` having been executed beforehand.
    import pickle

    # NOTE: unpickling can execute arbitrary code - only load files that were
    # created by a trusted source.
    with open(f'./model/{modelname}', 'rb') as f:
        return pickle.load(f)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Reload the best checkpoint from the same relative path the ModelCheckpoint
# callback writes to ('./model/model.h5'), so the cell works from any working
# directory and not only from /kaggle/working.
opt_model = tf.keras.models.load_model('./model/model.h5')

# The metrics below are computed on the validation split (X_val_e / y_val),
# so they are labelled as validation results rather than test results.
loss, accuracy = opt_model.evaluate(X_val_e, y_val)
print("Validation Loss: ", loss)
print(f'Validation Accuracy: {round(accuracy*100, 2)}%')

In [None]:
val_probs = opt_model.predict(X_val_e)
# Convert the predicted probabilities into class labels: 1 when the
# probability is strictly greater than 0.5, otherwise 0.
val_labels = tf.cast(tf.greater(val_probs, 0.5), tf.int32)
# Take the single prediction column as a Series.
y_pred = pd.DataFrame(val_labels.numpy())[0]
print(classification_report(y_val, y_pred))

# Submission

In [None]:
# Display the cleaned test texts (rendered as the cell output).
X_test

In [None]:
# Encode the cleaned test texts and predict with the reloaded checkpoint.
X_test_e = transformer.encode(X_test.to_list())
test_probs = opt_model.predict(X_test_e)

# Convert the predicted probabilities into class labels: 1 when the
# probability is strictly greater than 0.5, otherwise 0.
test_labels = tf.cast(tf.greater(test_probs, 0.5), tf.int32)
y_pred = pd.DataFrame(test_labels.numpy(), columns=['target'])

# Place the test ids and the predicted targets side by side.
id_frame = pd.DataFrame({'id': test_df['id']})
submission_df = pd.concat([id_frame, y_pred], axis=1)
submission_df

In [None]:
# Write the submission frame to 'submission.csv' without the index column.
submission_df.to_csv('submission.csv', index=False)