## Kaggle - Natural Language Processing with Disaster Tweets
Solution by Chris & Naru 

https://www.kaggle.com/learn/natural-language-processing

### Useful Resources
https://pandas.pydata.org/docs/

https://towardsdatascience.com/how-to-combine-textual-and-numerical-features-for-machine-learning-in-python-dc1526ca94d9

In [1]:
import pandas as pd
file_loc = "data\\"

df = pd.read_csv(file_loc + "train.csv")
df_test = pd.read_csv(file_loc + "test.csv", index_col=False)

In [2]:
print(df.columns)
print("\n")
print(df.shape)

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')


(7613, 5)


In [3]:
def output_matrix(pred, actual):  
    """output_matrix computes accuracy information,
    and a confusion matrix"""
    
    df = pd.DataFrame({"pred":pred, "actual":actual, "count":0})
    # Rounds each predicted number to either 0 or 1.
    df.pred = df.pred.apply((lambda x : 0 if 0 >= round(x) else 1))
    
    correct = len(df[df.pred == df.actual])
    incorrect = len(df[df.pred != df.actual])
    accuracy = correct / (correct + incorrect)
    
    print(f"Correct: {correct}\nIncorrect: {incorrect}\nAccuracy: {accuracy}")

    print(df.groupby(["actual","pred"]).count())

In [4]:
# First of 5 disaster tweets and 5 non-disaster tweets
# 0 means non-disaster
# 1 means disaster
sample_df_0 = df[df.target==0].head(5)
sample_df_1 = df[df.target==1].head(5)
sample_df = sample_df_0.append(sample_df_1)

sample_df.reset_index(drop=True)

Unnamed: 0,id,keyword,location,text,target
0,23,,,What's up man?,0
1,24,,,I love fruits,0
2,25,,,Summer is lovely,0
3,26,,,My car is so fast,0
4,28,,,What a goooooooaaaaaal!!!!!!,0
5,1,,,Our Deeds are the Reason of this #earthquake M...,1
6,4,,,Forest fire near La Ronge Sask. Canada,1
7,5,,,All residents asked to 'shelter in place' are ...,1
8,6,,,"13,000 people receive #wildfires evacuation or...",1
9,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
df[["id","target"]].groupby("target").count().rename(columns={"id":"Count"})

Unnamed: 0_level_0,Count
target,Unnamed: 1_level_1
0,4342
1,3271


In [6]:
# Calculating the length of text
df["text_length"]=df["text"].str.len()
df[["target","text_length"]].groupby("target").describe()

Unnamed: 0_level_0,text_length,text_length,text_length,text_length,text_length,text_length,text_length,text_length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,4342.0,95.832105,35.969652,7.0,68.0,101.0,130.0,157.0
1,3271.0,108.236319,29.369513,14.0,88.0,115.0,136.0,151.0


In [7]:
from scipy.stats import ttest_ind
target_0 = df[df.target==0]["text_length"]
target_1 = df[df.target==1]["text_length"]
ttest_ind(target_0, target_1, equal_var=False)



Ttest_indResult(statistic=-16.551040413805307, pvalue=1.8059359097919938e-60)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
wordCounts = {}

for x in df.text:
    for y in x.split(" "):
        
        word = y.lower().replace("?","").replace(".","")
        
        setVal = wordCounts.get(y.lower(),0) + 1
        wordCounts[y.lower()] = setVal

popularWords = list(filter(lambda x: (x[1] > 20), wordCounts.items()))
popularWords = [x[0] for x in popularWords]
len(popularWords)

752

In [10]:
tfVect = TfidfVectorizer(vocabulary = popularWords)


test2 = tfVect.fit_transform(df["text"])
ndf = pd.DataFrame(test2.toarray(), columns = tfVect.get_feature_names())

ndf["text_length"] = df.text_length



test2 = tfVect.fit_transform(df_test["text"])
ndf_test = pd.DataFrame(test2.toarray(), columns = tfVect.get_feature_names())

ndf_test["text_length"] = df.text_length





In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(ndf, df.target, test_size=0.5, random_state=42)

lRegModel = LinearRegression()

lRegModel.fit(x_train, y_train)

predictionResult = lRegModel.predict(x_test)

testPred = lRegModel.predict(ndf_test)

#output_matrix(predictionResult, y_test)


finaldf = pd.DataFrame({"id":df_test.id, "target":testPred})

finaldf.target = finaldf.target.apply((lambda x : 0 if 0 >= round(x) else 1))

finaldf.to_csv("submit_res.csv", index=False)

print(finaldf)

#print(testPred)

         id  target
0         0       1
1         2       0
2         3       1
3         9       0
4        11       1
...     ...     ...
3258  10861       1
3259  10865       1
3260  10868       0
3261  10874       1
3262  10875       0

[3263 rows x 2 columns]


In [12]:
from sklearn.ensemble import RandomForestClassifier

x_train, x_test, y_train, y_test = train_test_split(ndf, df.target, test_size=0.5, random_state=42)

rfcModel = RandomForestClassifier()

rfcModel.fit(x_train, y_train)

predictionResult = rfcModel.predict(x_test)

predictionResult

output_matrix(predictionResult, y_test)

Correct: 2910
Incorrect: 897
Accuracy: 0.764381402679275
             count
actual pred       
0      0      1923
       1       262
1      0       635
       1       987


In [13]:
from sklearn.naive_bayes import MultinomialNB

x_train, x_test, y_train, y_test = train_test_split(ndf, df.target, test_size=0.5, random_state=42)

mnbModel = MultinomialNB()

mnbModel.fit(x_train, y_train)

predictionResult = mnbModel.predict(x_test)

predictionResult

output_matrix(predictionResult, y_test)

Correct: 2971
Incorrect: 836
Accuracy: 0.7804045179931705
             count
actual pred       
0      0      1924
       1       261
1      0       575
       1      1047
