In [26]:
import numpy as np 
import pandas as pd 
import re 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [27]:
train = pd.read_csv('./data/disaster/train.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [28]:
test = pd.read_csv('./data/disaster/test.csv')
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [29]:
sample_submission = pd.read_csv('./data/disaster/sample_submission.csv')
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [30]:
train = train.drop(['id', 'location', 'keyword'], axis=1)

In [31]:
train = train.dropna(subset=['text', 'target'])

In [32]:
train.drop_duplicates(subset=['text'], inplace=True)

In [33]:
train.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [36]:
import nltk
nltk.download('words')
english_words = set(nltk.corpus.words.words())
for i in range(len(train)):
    for j in range(len(train.columns)):

        if isinstance(train.iloc[i, j], str):
            words = train.iloc[i, j].split()
            english_words_only = [word for word in words if word.lower() in english_words]
            train.iloc[i, j] = ' '.join(english_words_only)

[nltk_data] Downloading package words to /Users/faa/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [37]:
# lemmatization 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


# Define a function to remove stopwords and lemmatize the text
def process_text(text):
    # Tokenize the text into individual words
    words = nltk.word_tokenize(text)
    
    # Remove stopwords from the text
    words = [word for word in words if word.lower() not in stopwords.words('english')]
    
    # Lemmatize the words using WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Join the words back into a string and return it
    return ' '.join(words)

# Apply the process_text function to each cell of the 'text' column
train['text'] = train['text'].apply(process_text)
train.head()

[nltk_data] Downloading package punkt to /Users/faa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/faa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/faa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/faa/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,text,target
0,Reason May Forgive u,1
1,Forest fire near La Canada,1
2,notified evacuation shelter place,1
3,people receive evacuation,1
4,got sent photo Ruby smoke school,1


In [38]:
import re

# Define a function to remove symbols
def process_text(text):
    # Remove all non-alphanumeric characters using regular expressions
    text = re.sub(r'\W+', ' ', text)
    
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    
    # Return the processed text
    return text.strip()

# Apply the process_text function to each cell of the 'text' column
train['text'] = train['text'].apply(process_text)

train.head()

Unnamed: 0,text,target
0,Reason May Forgive u,1
1,Forest fire near La Canada,1
2,notified evacuation shelter place,1
3,people receive evacuation,1
4,got sent photo Ruby smoke school,1


In [39]:
train['text'] = train['text'].str.lower()

In [40]:
train.shape

(7503, 2)

In [41]:
from sklearn.model_selection import train_test_split

# Split the DataFrame into a train set (80%) and a temporary set (20%)
train_df, temp_df = train_test_split(train, test_size=0.2, random_state=42)

# Split the temporary set into a validation set (50%) and a train set (50%)
test_df, val_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Print the number of samples in each set
print('Train set:', len(train_df))
print('Validation set:', len(val_df))
print('Test set:', len(test_df))


Train set: 6002
Validation set: 751
Test set: 750


Bag of words

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer object to convert the text data to bag-of-words features
vectorizer = CountVectorizer()

# Fit the vectorizer to the train data and transform the train, validation, and test data into bag-of-words features
X_train = vectorizer.fit_transform(train_df['text'])
X_val = vectorizer.transform(val_df['text'])
X_test = vectorizer.transform(test_df['text'])

# Define the target variable
y_train = train_df['target']
y_val = val_df['target']
y_test = test_df['target']

In [47]:
from sklearn.tree import DecisionTreeClassifier

# Create a DecisionTreeClassifier object and fit it to the train data
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the validation data and compute the accuracy
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print('Validation accuracy:', accuracy)

Validation accuracy: 0.7456724367509987


In [49]:
from sklearn.ensemble import RandomForestClassifier

# Create a RandomForestClassifier object and fit it to the train data
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the validation data and compute the accuracy
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print('Validation accuracy:', accuracy)

# Make predictions on the test data and compute the accuracy
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Test accuracy:', accuracy)


Validation accuracy: 0.7536617842876165
Test accuracy: 0.7306666666666667


In [62]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score


# Create an XGBClassifier object and fit it to the train data
clf = XGBClassifier(n_estimators=300, learning_rate=0.15, max_depth=15, random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the validation data and compute the F1 score
y_pred = clf.predict(X_val)
f1 = f1_score(y_val, y_pred)
print('Validation F1 score:', f1)


Validation F1 score: 0.6939501779359432


In [63]:
import gensim.downloader as api
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np

# Load the pre-trained GloVe word embeddings
wv = api.load('glove-twitter-100')

# Split the DataFrame into a train set, a validation set, and a test set
train_df, val_df, test_df = np.split(train.sample(frac=1, random_state=42), [int(.8*len(train)), int(.9*len(train))])

len(train_df), len(val_df), len(test_df)



(6002, 750, 751)

In [65]:

# Convert the text data to sentence vectors by averaging the word embeddings
def convert_text_to_sentence_vector(text):
    words = text.lower().split()
    vec = np.zeros(100)
    count = 0
    for word in words:
        if word in wv.key_to_index:
            vec += wv.get_vector(word)
            count += 1
    if count > 0:
        return vec / count
    else:
        return vec



X_train = np.array([convert_text_to_sentence_vector(text) for text in train_df['text']])
X_val = np.array([convert_text_to_sentence_vector(text) for text in val_df['text']])

# Define the target variable
y_train = train_df['target']
y_val = val_df['target']

# Create a flat version of the input data for use with XGBoost
X_train_flat = X_train.reshape((X_train.shape[0], -1))
X_val_flat = X_val.reshape((X_val.shape[0], -1))

# Create an XGBClassifier object and fit it to the train data
clf = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=10, random_state=42)
clf.fit(X_train_flat, y_train)

# Make predictions on the validation data and compute the F1 score
y_pred = clf.predict(X_val_flat)
f1 = f1_score(y_val, y_pred)
print('Validation F1 score:', f1)


Validation F1 score: 0.7120954003407155


In [66]:
from sklearn.metrics.pairwise import cosine_distances

# Convert the text data to sentence vectors by averaging the word embeddings
def convert_text_to_sentence_vector(text):
    words = text.lower().split()
    vec = np.zeros(100)
    count = 0
    for word in words:
        if word in wv.key_to_index:
            vec += wv.get_vector(word)
            count += 1
    if count > 0:
        return list(vec / count)
    else:
        return list(vec)

# Convert the train, validation, and test data to sentence vectors
X_train = [convert_text_to_sentence_vector(text) for text in train_df['text']]
X_val = [convert_text_to_sentence_vector(text) for text in val_df['text']]
X_test = [convert_text_to_sentence_vector(text) for text in test_df['text']]

# Define the target variable
y_train = train_df['target']
y_val = val_df['target']
y_test = test_df['target']

# Compute the cosine distance between each pair of sentence vectors in the train data
dist_train = cosine_distances(X_train)

# Create a function to predict the label of a tweet based on the similarity to the train data
def predict_label(text, X_train, y_train, dist_train):
    vec = convert_text_to_sentence_vector(text)
    if vec == [0]*100:
        return 0
    dist = cosine_distances([vec], X_train)[0]
    idx = np.argmin(dist)
    return y_train.iloc[idx]

# Make predictions on the validation data and compute the F1 score
y_pred = [predict_label(text, X_train, y_train, dist_train) for text in val_df['text']]
f1 = f1_score(y_val, y_pred)
print('Validation F1 score:', f1)

# Make predictions on the test data and compute the F1 score
y_pred = [predict_label(text, X_train, y_train, dist_train) for text in test_df['text']]
f1 = f1_score(y_test, y_pred)
print('Test F1 score:', f1)


Validation F1 score: 0.7069767441860466
Test F1 score: 0.7151335311572701
