In [1]:
import gc
import re
import string
import operator
from collections import defaultdict

import pandas as pd
import numpy as np

# Widen pandas' display limits so large frames are not truncated in cell output.
# set_option accepts alternating option-name / value pairs in a single call.
pd.set_option(
    "display.max_rows", 500,
    "display.max_columns", 500,
    "display.width", 1000,
)

import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook's cell output.
%matplotlib inline
# Apply matplotlib's "fivethirtyeight" style sheet to all subsequent plots.
plt.style.use("fivethirtyeight")

In [2]:
# Locations of the two CSV splits, relative to the notebook's working directory.
TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"

# Read each split into its own DataFrame.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

In [3]:
# Preview the first five rows of the training split.
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Preview the first five rows of the test split.
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
# (rows, columns) of the training split.
train.shape

(7613, 5)

In [6]:
# (rows, columns) of the test split.
test.shape

(3263, 4)

# 1. Keyword and Location

In [7]:
# Number of missing values per column in the training split.
train.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [8]:
# Number of missing values per column in the test split.
test.isna().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

In [9]:
# Columns that contain missing values in both splits (see the isna() counts above).
missing_cols = ['keyword', 'location']

# Replace NaN with an explicit per-column placeholder ("no_keyword" / "no_location")
# so that missingness is kept as its own category instead of staying null.
for df in [train, test]:
    # Iterate the list defined above rather than repeating the literal, so the
    # set of filled columns is declared in exactly one place.
    for col in missing_cols:
        df[col] = df[col].fillna(f"no_{col}")

In [10]:
# Re-check the training split: after the placeholder fill, every count should be zero.
train.isna().sum()

id          0
keyword     0
location    0
text        0
target      0
dtype: int64

In [11]:
# Report how many distinct values each categorical column takes in the
# training and test splits. One loop replaces the per-column copy-paste;
# the printed text is unchanged.
for col in ["keyword", "location"]:
    print(f'Number of unique values in {col} = {train[col].nunique()} (Training) - {test[col].nunique()} (Test)')

Number of unique values in keyword = 222 (Training) - 222 (Test)
Number of unique values in location = 3342 (Training) - 1603 (Test)


In [12]:
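# Optional check: uncomment the next line to list how often each keyword occurs in the training split.
# train.keyword.value_counts()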

In [13]:
# Add simple length features to both splits:
#   word_count        - number of whitespace-separated tokens in the tweet
#   unique_word_count - number of distinct tokens (case- and punctuation-sensitive)
# Both frames get identical treatment, so loop over them instead of
# duplicating each line, and tokenize every text once rather than twice.
for df in [train, test]:
    tokens = df['text'].apply(lambda x: str(x).split())
    df['word_count'] = tokens.apply(len)
    df['unique_word_count'] = tokens.apply(lambda words: len(set(words)))

In [14]:
# Preview the training split again, now including the word-count columns.
train.head()

Unnamed: 0,id,keyword,location,text,target,word_count,unique_word_count
0,1,no_keyword,no_location,Our Deeds are the Reason of this #earthquake M...,1,13,13
1,4,no_keyword,no_location,Forest fire near La Ronge Sask. Canada,1,7,7
2,5,no_keyword,no_location,All residents asked to 'shelter in place' are ...,1,22,20
3,6,no_keyword,no_location,"13,000 people receive #wildfires evacuation or...",1,8,8
4,7,no_keyword,no_location,Just got sent this photo from Ruby #Alaska as ...,1,16,15


In [28]:
from wordcloud import STOPWORDS


def _clean_tokens(text):
    """Return the lower-cased, whitespace-split tokens of ``text`` that are
    not in wordcloud's STOPWORDS and consist only of alphabetic characters.
    """
    kept = []
    for token in str(text).lower().split():
        if token in STOPWORDS:
            continue
        # NOTE(review): isalpha() rejects any token with attached punctuation
        # or a leading '#', so e.g. "#earthquake" and "Sask." are dropped
        # entirely (visible in the rows displayed below) -- confirm intended.
        if token.isalpha():
            kept.append(token)
    return kept


# Store the filtered token list for every training tweet, then preview it.
train["clean_text"] = train["text"].apply(_clean_tokens)
train.head()

Unnamed: 0,id,keyword,location,text,target,word_count,unique_word_count,clean_text
0,1,no_keyword,no_location,Our Deeds are the Reason of this #earthquake M...,1,13,13,"[deeds, reason, may, allah, forgive, us]"
1,4,no_keyword,no_location,Forest fire near La Ronge Sask. Canada,1,7,7,"[forest, fire, near, la, ronge, canada]"
2,5,no_keyword,no_location,All residents asked to 'shelter in place' are ...,1,22,20,"[residents, asked, notified, evacuation, shelt..."
3,6,no_keyword,no_location,"13,000 people receive #wildfires evacuation or...",1,8,8,"[people, receive, evacuation, orders, california]"
4,7,no_keyword,no_location,Just got sent this photo from Ruby #Alaska as ...,1,16,15,"[got, sent, photo, ruby, smoke, pours, school]"


In [29]:
from collections import Counter

# Token frequencies over the cleaned text of all training tweets with target == 0.
count = Counter(
    token
    for tokens in train.loc[train["target"] == 0, "clean_text"]
    for token in tokens
)

In [30]:
# The 15 most frequent cleaned tokens among target == 0 tweets, as (token, count) pairs.
count.most_common(15)

[('will', 177),
 ('new', 163),
 ('one', 116),
 ('now', 116),
 ('body', 106),
 ('via', 97),
 ('love', 85),
 ('got', 82),
 ('people', 81),
 ('full', 81),
 ('see', 79),
 ('know', 78),
 ('video', 76),
 ('back', 75),
 ('emergency', 75)]

In [31]:
# Token frequencies over the cleaned text of all training tweets with target == 1.
count_1 = Counter(
    token
    for tokens in train.loc[train["target"] == 1, "clean_text"]
    for token in tokens
)

In [33]:
# The 20 most frequent cleaned tokens among target == 1 tweets, as (token, count) pairs.
count_1.most_common(20)

[('fire', 151),
 ('via', 117),
 ('suicide', 103),
 ('disaster', 97),
 ('police', 94),
 ('people', 93),
 ('killed', 92),
 ('california', 88),
 ('families', 81),
 ('will', 77),
 ('two', 71),
 ('storm', 71),
 ('train', 71),
 ('bomb', 67),
 ('emergency', 66),
 ('crash', 65),
 ('one', 63),
 ('nuclear', 63),
 ('bombing', 63),
 ('news', 63)]