In [6]:
!pip install keras-core --upgrade
!pip install -q keras-nlp --upgrade

# This sample uses Keras Core, the multi-backend version of Keras.
# The selected backend is TensorFlow (other supported backends are 'jax' and 'torch')
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'



In [7]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
import keras_core as keras
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Brief description of the problem and data 

We wanto identify tweets by whether they accurately classify a disaster or not. The data are tweets that are either classified as accurately classifies a disaster, (1), or is just a random text (classified as a 0).

In [8]:
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")


In [9]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [10]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Explore the dataset

In [11]:
train["length"] = train["text"].apply(lambda x : len(x))
test["length"] = test["text"].apply(lambda x : len(x))

print("Train Length Stat")
print(train["length"].describe())
print()

print("Test Length Stat")
print(test["length"].describe())

Train Length Stat
count    7613.000000
mean      101.037436
std        33.781325
min         7.000000
25%        78.000000
50%       107.000000
75%       133.000000
max       157.000000
Name: length, dtype: float64

Test Length Stat
count    3263.000000
mean      102.108183
std        33.972158
min         5.000000
25%        78.000000
50%       109.000000
75%       134.000000
max       151.000000
Name: length, dtype: float64


# Preprocess the data

In [12]:
import nltk
from nltk.corpus import stopwords
import re

# Download necessary NLTK data
nltk.download('stopwords')

# Define a function to preprocess the text
def preprocess_text(text):
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])

    # Remove punctuation and special characters
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)

    return text

# Apply preprocessing to the text column
train['text'] = train['text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
from sklearn.model_selection import train_test_split
# Ensure labels are numeric
train['target'] = train['target'].astype(int)
print(f"Labels dtype: {train['target'].dtype}")

X = train['text']
y = train['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3)


Labels dtype: int64


In [14]:
X_train

4665     Bilsko suddenly I m inundated research   humo...
1241    Such beautiful architecture  NYC I love fire e...
1190    Leicester Merc   ICYMI    Ashes 2015  Australi...
3028    1 43 earthquake occurred near Mount St  Helens...
3581        Fatal attraction common n common  pain       
                              ...                        
3150    Emergency root canal    tookitlikeaman  lovemy...
7305     Jennife29916207 I thinking today I reading wi...
6766                                       Ayshun Tornado
2104    http   t co lMA39ZRWoY There way seemeth right...
4676                  Beyond bounds  till inundation rise
Name: text, Length: 5329, dtype: object

## Word Embeddings

In [15]:
from gensim.models import Word2Vec
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Step 1: Ensure tokenization is done correctly
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['text'])

# Convert the text to sequences of integers
sequences = tokenizer.texts_to_sequences(train['text'])

# Step 2: Pad sequences
max_sequence_length = max([len(seq) for seq in sequences])
data = pad_sequences(sequences, maxlen=max_sequence_length)


word_index = tokenizer.word_index

# Word2Vec model
w2v_model = Word2Vec(sentences=sequences, vector_size=100, window=5, min_count=1, workers=4)
w2v_model.train(sequences, total_examples=len(train['text']), epochs=10)

# Convert words to vectors
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 100))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]


## Model creation

A little bit about the strategy: We're using word2vec and then LSTM to be able to classify these tweets. The first step it to have an embedding layer, and then add an LSTM which is used for longer word sequences so that we still maintain context between the words. After that, we use a sigmooid activation function to classify as either 0 or 1 since its a binary task.

From my research, I found that this model arch is the best because it captures the meaning of the individual words while also maintaining context throughout the training.

In [16]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

max_sequence_length = max([len(seq) for seq in sequences])
data = pad_sequences(sequences, maxlen=max_sequence_length)

model = Sequential()
model.add(Embedding(input_dim=len(word_index) + 1,
                    output_dim=100,
                    weights=[embedding_matrix],
                    input_length=max_sequence_length,
                    trainable=False))
model.add(LSTM(128, return_sequences=False))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [17]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

def create_model(lstm_units=128, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=len(word_index) + 1,
                        output_dim=100,
                        weights=[embedding_matrix],
                        input_length=max_sequence_length,
                        trainable=False))
    model.add(LSTM(lstm_units, return_sequences=False))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

model = KerasClassifier(build_fn=create_model, epochs=5, batch_size=32, verbose=1)


param_grid = {
    'lstm_units': [64, 128, 256],
    'optimizer': ['adam', 'rmsprop'],
    'batch_size': [32, 64],
    'epochs': [5, 10]
}

grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid_result = grid.fit(data, train['target'])

print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")


  model = KerasClassifier(build_fn=create_model, epochs=5, batch_size=32, verbose=1)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
E

In [22]:
test_sequences = tokenizer.texts_to_sequences(test['text']) 
test_data = pad_sequences(test_sequences, maxlen=max_sequence_length)
predictions = grid_result.best_estimator_.predict(test_data)



# Generate the submission file 

For each tweets in the test set, we predict if the given tweet is about a real disaster or not. If so, predict a 1. If not, predict a 0.

The `submission.csv` file uses the following format:
`id,target`

In [25]:
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

In [26]:
sample_submission["target"] = np.argmax(grid_result.best_estimator_.predict(test_data), axis=1)



In [28]:
sample_submission.to_csv("submission.csv", index=False)

## Conclusion
In conclusion, I found that the higher epochs (as expected) resulted in higher accuracy. I feel like if I was not approaching the GPU quota for the week, i would have been able to experiment with more hyperparameter tuning and run for way more epochs. 

In the future, I also feel like I'd explore using upsampling to generate more data, because these sort of models always work better with more data. I would also like to have better GPUs for running more data.