In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import csv
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.naive_bayes import GaussianNB,BernoulliNB, ComplementNB, MultinomialNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier


# EDA

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train_X = train.drop(labels = 'target', axis = 1)
train_y = train.drop(labels = ['id','keyword','location','text'], axis = 1)

In [4]:
print("Dimention of train data : ",train_X.shape)
print("Dimention of train label : ",train_y.shape)
print("Dimention of test data : ",test.shape)

Dimention of train data :  (7613, 4)
Dimention of train label :  (7613, 1)
Dimention of test data :  (3263, 4)


In [5]:
#Null Data Field
for c in train_X.columns:   
    print("Number of Null Cells in ---",c,"----are ",train_X[c].isnull().sum())

Number of Null Cells in --- id ----are  0
Number of Null Cells in --- keyword ----are  61
Number of Null Cells in --- location ----are  2533
Number of Null Cells in --- text ----are  0


In [6]:
Total_data = pd.concat([train_X, test], ignore_index=True)

In [7]:
Total_data

Unnamed: 0,id,keyword,location,text
0,1,,,Our Deeds are the Reason of this #earthquake M...
1,4,,,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...
...,...,...,...,...
10871,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
10872,10865,,,Storm in RI worse than last hurricane. My city...
10873,10868,,,Green Line derailment in Chicago http://t.co/U...
10874,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


# PreProcessing

In [8]:
def tags(data):
    pattern = re.compile(r'[@|#][a-zA-Z]+')
    matches = pattern.findall(data)
    tags = [match[1:].lower() for match in matches]
    return ' '.join(tags)

def striplinks(data):
    cleanr = re.compile(r'https?://\S+|www\.\S+')
    clean = re.sub(cleanr, ' ', data)
    return clean
    
def alphanumeric(data):-*+6
    cleanr = re.compile(r'[^A-Za-z]+')
    clean = re.sub(cleanr, ' ', data)
    return clean


In [9]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Yash
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Yash
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
stop_words = set(stopwords.words('english'))
stemmer  = SnowballStemmer('english')

In [11]:
start = datetime.now()
with open('processed_total_data.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['tweets'])
    for i in tqdm(range(10876)):
        
        #raw data
        keyword = str(Total_data['keyword'][i])
        location = str(Total_data['location'][i])
        text = str(Total_data['text'][i])
   
        #extract tags 
        tags_str = stemmer.stem(tags(text))
        #print(tags_str)
        #strip links and alphanumberic charcters
        text_sl = striplinks(text)
        text_sl_an = alphanumeric(text_sl)
        #print(text_sl)
        #print(text_sl_an)
        #removing stopwords and stem the text       
        text_words = word_tokenize(text_sl_an.lower())
        text_sl_an_sw_st = ' '.join(stemmer.stem(j) for j in text_words if j not in stop_words )
        #print(text_words)
        #print(text_sl_an_sw_st)
        #assigning weights
        processed_list = []
        if(tags_str != ''):
            processed_list.extend([tags_str,tags_str,tags_str,text_sl_an_sw_st])
        else:
            processed_list.extend([text_sl_an_sw_st])
        if (keyword != 'nan'):
            #print(keyword)
            processed_list.extend([keyword,keyword])
        if (location != 'nan'):
            location_words = word_tokenize(alphanumeric(location))
            location_str = " ".join(location_words)
            processed_list.extend([location_str])
        
        #saving string
        
        processed_string = " ".join(processed_list)

        writer.writerow([processed_string])
print("Time to run this subsection :", datetime.now() - start)

100%|██████████████████████████████████████████████████████████████████████████| 10876/10876 [00:06<00:00, 1600.90it/s]


Time to run this subsection : 0:00:06.801662


In [12]:
Total_data['keyword'][0] == float('nan')

False

In [13]:
str(Total_data['keyword'][0]) == 'nan'

True

In [26]:
print(stop_words)

{"weren't", 'such', "she's", 'these', 'if', 'those', 'and', 'here', 'should', 'its', 'itself', 'i', 'll', 'ours', 'then', 'ma', 'further', 'to', "haven't", 't', 'each', 'hadn', 'my', 'while', 'she', 'into', 'him', "you're", 'me', 'so', "hadn't", "isn't", 'd', 'out', 'myself', 'do', 'himself', "aren't", 'ourselves', 'when', 'why', 'in', 'too', 'doing', 'until', "you'd", "shouldn't", 'isn', 'than', 're', 'her', 'not', "couldn't", 'are', "it's", 'be', 'it', 'just', "needn't", 'yourselves', 'had', 'below', 'their', 'up', 'your', "that'll", 'aren', 'was', 'm', 'only', 'or', 'that', 'did', 'a', "wasn't", "you've", 'by', 'between', 'more', 'we', 'being', 'same', 'herself', 'am', 'down', 'don', 'y', 'wouldn', 'no', 's', 'against', 'very', 'any', 'theirs', 'how', 'ain', 'through', 'hasn', "mightn't", 'haven', 'all', 'which', 'other', "you'll", 'at', 'them', "hasn't", 'an', 'does', 'now', 'his', 'some', 'themselves', 'whom', 'once', 'shouldn', 'will', 'most', 'couldn', 'been', 'the', 'but', 'own

# Machine learning

## Featurizing data with TF-IDF and Sampling Training Data

In [14]:
processed_total_data = pd.read_csv("processed_total_data.csv")
processed_total_data.replace(np.nan, '', regex=True, inplace = True)

In [15]:
processed_total_data

Unnamed: 0,tweets
0,earthquak earthquak earthquak deed reason eart...
1,forest fire near la rong sask canada
2,resid ask shelter place notifi offic evacu she...
3,wildfir wildfir wildfir peopl receiv wildfir e...
4,alaska wildfir alaska wildfir alaska wildfir g...
...,...
10871,earthquak safeti los angel safeti fasten xrwn
10872,storm ri wors last hurrican citi amp other har...
10873,green line derail chicago
10874,meg issu hazard weather outlook hwo


In [51]:
start = datetime.now()
#smooth_idf=True by default so smoothing is done by defult.
#norm is l2 by default.
#subliner is used False by default.
vectorizer = TfidfVectorizer(min_df = 0.0005, 
                             max_features = 100000, 
                             tokenizer = lambda x: x.split(),
                             ngram_range = (1,4))

train_tfidf = processed_total_data['tweets'][0:7613]
X_train, X_test, y_train, y_test = train_test_split(train_tfidf, train_y, 
                                                    test_size = 0.2, random_state = 42)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
print("Time to run this Subsection ", datetime.now() - start)

Time to run this Subsection  0:00:00.897227


In [52]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(6090, 6773) (1523, 6773) (6090, 1) (1523, 1)


## Sampling Training Data

## Supervised Learning Algorithms

### Logistic Regression

Model 1: C=1;penalty='l2'

In [41]:
Model1 = LogisticRegression(C=1,penalty='l2')
Model1.fit(X_train, y_train)
y_pred = Model1.predict(X_test)

print("Acuracy", accuracy_score(y_test['target'], y_pred))
print("Macro precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='macro'))
print("Micro precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='micro'))
print("Weighted precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='weighted'))

Acuracy 0.7905449770190414
Macro precision_recall_fscore_support
(0.7882430331300083, 0.7804173292479542, 0.7832177442708563, None)
Micro precision_recall_fscore_support
(0.7905449770190414, 0.7905449770190414, 0.7905449770190414, None)
Weighted precision_recall_fscore_support
(0.789789114846523, 0.7905449770190414, 0.7891056991577907, None)


### Naive Bayes 

Model 2.1: Gaussian Naive Bayes
Model 2.2: Bernoulli Naive Bayes
Model 2.3: Complement Naive Bayes
Model 2.4: Multinomial Naive Bayes

In [42]:
Model21 = GaussianNB()
Model21.fit(X_train.toarray(), y_train)
y_pred = Model21.predict(X_test.toarray())

print("Acuracy", accuracy_score(y_test['target'], y_pred))
print("Macro precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='macro'))
print("Micro precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='micro'))
print("Weighted precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='weighted'))

Acuracy 0.7386736703873933
Macro precision_recall_fscore_support
(0.7874817830041712, 0.7017044352691872, 0.7012559832193861, None)
Micro precision_recall_fscore_support
(0.7386736703873933, 0.7386736703873933, 0.7386736703873934, None)
Weighted precision_recall_fscore_support
(0.7746074273901324, 0.7386736703873933, 0.7168756114435079, None)


In [43]:
Model22 = BernoulliNB()
Model22.fit(X_train.toarray(), y_train)
y_pred = Model22.predict(X_test.toarray())

print("Acuracy", accuracy_score(y_test['target'], y_pred))
print("Macro precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='macro'))
print("Micro precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='micro'))
print("Weighted precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='weighted'))

Acuracy 0.793827971109652
Macro precision_recall_fscore_support
(0.8218583808292901, 0.7678077168535998, 0.7749594362103822, None)
Micro precision_recall_fscore_support
(0.793827971109652, 0.793827971109652, 0.793827971109652, None)
Weighted precision_recall_fscore_support
(0.8122589254458524, 0.793827971109652, 0.7845862397304179, None)


In [44]:
Model23 = ComplementNB()
Model23.fit(X_train.toarray(), y_train)
y_pred = Model23.predict(X_test.toarray())

print("Acuracy", accuracy_score(y_test['target'], y_pred))
print("Macro precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='macro'))
print("Micro precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='micro'))
print("Weighted precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='weighted'))

Acuracy 0.7826657912015759
Macro precision_recall_fscore_support
(0.7786407766990291, 0.7747423425583453, 0.7763466878557299, None)
Micro precision_recall_fscore_support
(0.7826657912015759, 0.7826657912015759, 0.7826657912015759, None)
Weighted precision_recall_fscore_support
(0.7817962758734995, 0.7826657912015759, 0.7819005872807898, None)


In [45]:
Model24= MultinomialNB()
Model24.fit(X_train.toarray(), y_train)
y_pred = Model24.predict(X_test.toarray())

print("Acuracy", accuracy_score(y_test['target'], y_pred))
print("Macro precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='macro'))
print("Micro precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='micro'))
print("Weighted precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='weighted'))

Acuracy 0.7898883782009193
Macro precision_recall_fscore_support
(0.7939205039918271, 0.7738952375243731, 0.7789357563793653, None)
Micro precision_recall_fscore_support
(0.7898883782009193, 0.7898883782009193, 0.7898883782009194, None)
Weighted precision_recall_fscore_support
(0.7919177925062768, 0.7898883782009193, 0.7862051956414586, None)


### Support Vector Machines

In [46]:
Model31 = svm.SVC(kernel='rbf') #rbf by default svm.SVC()
Model31.fit(X_train, y_train)
y_pred = Model31.predict(X_test)

print("Acuracy", accuracy_score(y_test['target'], y_pred))
print("Macro precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='macro'))
print("Micro precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='micro'))
print("Weighted precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='weighted'))

Acuracy 0.5738673670387393
Macro precision_recall_fscore_support
(0.28693368351936965, 0.5, 0.3646224447225699, None)
Micro precision_recall_fscore_support
(0.5738673670387393, 0.5738673670387393, 0.5738673670387393, None)
Weighted precision_recall_fscore_support
(0.32932375495197513, 0.5738673670387393, 0.4184898446323389, None)


In [47]:
Model32 = svm.SVC(kernel='linear')
Model32.fit(X_train, y_train)
y_pred = Model32.predict(X_test)

print("Acuracy", accuracy_score(y_test['target'], y_pred))
print("Macro precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='macro'))
print("Micro precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='micro'))
print("Weighted precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='weighted'))

Acuracy 0.7898883782009193
Macro precision_recall_fscore_support
(0.7876262488893724, 0.7796469132233008, 0.7824837273405671, None)
Micro precision_recall_fscore_support
(0.7898883782009193, 0.7898883782009193, 0.7898883782009194, None)
Weighted precision_recall_fscore_support
(0.789136572613699, 0.7898883782009193, 0.7884127182429842, None)


### RandomForestClassifier

Decision Tree

In [48]:
Model41 = RandomForestClassifier(random_state=0)
Model41.fit(X_train, y_train)
y_pred = Model41.predict(X_test)

print("Acuracy", accuracy_score(y_test['target'], y_pred))
print("Macro precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='macro'))
print("Micro precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='micro'))
print("Weighted precision_recall_fscore_support")
print(precision_recall_fscore_support(y_test['target'], y_pred, average='weighted'))

Acuracy 0.7642810242941562
Macro precision_recall_fscore_support
(0.764564844912849, 0.7488073536826592, 0.7528161866592522, None)
Micro precision_recall_fscore_support
(0.7642810242941562, 0.7642810242941562, 0.7642810242941562, None)
Weighted precision_recall_fscore_support
(0.7644166786965941, 0.7642810242941562, 0.7606807856466101, None)


# Deep Learning Models