In [71]:
import pandas as pd
import seaborn as sns
import numpy as np
import re

In [72]:
# Load the labelled training tweets and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [73]:
# Number of missing values in each column.
df.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [74]:
# Frequency of each keyword in the training data (missing keywords are not counted).
df.loc[:, 'keyword'].value_counts()

fatalities               45
armageddon               42
deluge                   42
harm                     41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [75]:
# Load the unlabelled test tweets and preview the first five rows.
test = pd.read_csv(filepath_or_buffer='test.csv')
test.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [76]:
# Frequency of each keyword in the test data, for comparison with the training set.
test.loc[:, 'keyword'].value_counts()

deluged                  23
demolished               22
rubble                   22
first%20responders       21
annihilation             21
                         ..
forest%20fire             5
threat                    5
radiation%20emergency     5
inundation                4
epicentre                 1
Name: keyword, Length: 221, dtype: int64

In [77]:
# Target-class proportions among the rows that have no keyword.
df.loc[df['keyword'].isna(), 'target'].value_counts(normalize=True)

1    0.688525
0    0.311475
Name: target, dtype: float64

Tweets with no keyword have a 69% chance of being in the positive (target = 1) class.

In [78]:
# Flag missing keywords with a sentinel token so they survive vectorization.
# Assign the filled column back rather than calling fillna(inplace=True) on
# df['keyword']: the in-place call on a selected column is chained assignment,
# which pandas warns about and which leaves df unchanged under Copy-on-Write.
df['keyword'] = df['keyword'].fillna('nokeyword')

In [79]:
# Target-class proportions among tweets whose keyword contains '%20';
# na=False makes a missing keyword count as "no match".
df.loc[df['keyword'].str.contains('%20', na=False), 'target'].value_counts(normalize=True)

1    0.583691
0    0.416309
Name: target, dtype: float64

So keywords containing '%20' fall in the positive class 58% of the time and in the negative class 42% of the time.  

I should probably try to find a way to keep it in as a helpful feature

In [80]:
# Preview only: swap the literal '%20' for a standalone token and inspect the counts.
(
    df['keyword']
    .str.replace('%20', ' percenttwenty ', regex=False)
    .value_counts()
)

nokeyword                            61
fatalities                           45
deluge                               42
armageddon                           42
damage                               41
                                     ..
forest percenttwenty fire            19
epicentre                            12
threat                               11
inundation                           10
radiation percenttwenty emergency     9
Name: keyword, Length: 222, dtype: int64

In [81]:
# Make the replacement permanent: the literal '%20' becomes the token ' percenttwenty '.
df['keyword'] = df['keyword'].str.replace('%20', ' percenttwenty ', regex=False)

In [82]:
# Confirm the stored keyword column now carries the replacement token.
df.loc[:, 'keyword'].value_counts()

nokeyword                            61
fatalities                           45
deluge                               42
armageddon                           42
damage                               41
                                     ..
forest percenttwenty fire            19
epicentre                            12
threat                               11
inundation                           10
radiation percenttwenty emergency     9
Name: keyword, Length: 222, dtype: int64

In [83]:
# Re-count missing values per column after filling the keyword column.
df.isna().sum()

id             0
keyword        0
location    2533
text           0
target         0
dtype: int64

In [62]:
# Peek at five random location values. The fixed random_state makes the same
# rows appear on every run, so the preview is reproducible after a kernel restart.
df['location'].sample(n=5, random_state=0)

786     Daruka (near Tamworth) NSW
1361       Spying on your thoughts
1322                           NaN
160                       Thrissur
4465                      Heathrow
Name: location, dtype: object

In [46]:
# Target-class proportions among tweets with no location.
df.loc[df['location'].isna(), 'target'].value_counts(normalize=True)

0    0.575602
1    0.424398
Name: target, dtype: float64

If no location is given, the tweet has a 58% chance of being in the negative class

In [48]:
# Target-class proportions among tweets that do have a location.
df.loc[df['location'].notna(), 'target'].value_counts(normalize=True)

0    0.567717
1    0.432283
Name: target, dtype: float64

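If a location is given, the tweet has a 57% chance of being in the negative class. That is almost the same as when the location is missing (58%), so the mere presence of a location carries little signal.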

In [86]:
# How many non-missing locations contain a literal '#'.
df['location'].str.contains('#', regex=False).value_counts()

False    5028
True       52
Name: location, dtype: int64

In [97]:
# Drop every literal '#' from the location strings.
df['location'] = df['location'].str.replace('#', '', regex=False)

In [98]:
# Number of tweets still missing a location.
df['location'].isna().sum()

2533

In [99]:
# Fill empty locations with a flag token.
# Assign the result back rather than calling fillna(inplace=True) on
# df['location']: the in-place call on a selected column is chained assignment,
# which pandas warns about and which leaves df unchanged under Copy-on-Write.
df['location'] = df['location'].fillna('nolocgiven')

In [100]:
# Target-class proportions among tweets whose text contains '#';
# na=False makes a missing text count as "no match".
df.loc[df['text'].str.contains('#', na=False), 'target'].value_counts(normalize=True)

0    0.503123
1    0.496877
Name: target, dtype: float64

The hash symbol is fairly well distributed across both positive and negative class.

I'd like to remove the symbol so that the words following it are accounted for during vectorization.

In [102]:
# Remove the literal '#' so the hashtag words are tokenized like ordinary words.
df['text'] = df['text'].str.replace('#', '', regex=False)

In [103]:
# Target-class proportions among tweets whose text contains '@';
# na=False makes a missing text count as "no match".
df.loc[df['text'].str.contains('@', na=False), 'target'].value_counts(normalize=True)

0    0.668465
1    0.331535
Name: target, dtype: float64

Hm, tweets that included the '@' symbol also seem to heavily lean in favor of the negative class.  

I'm going to use the word 'atsymbol' to replace that symbol so again it will be picked up on by the vectorizer.

In [104]:
# Sanity check: the replacement token 'atsymbol' must not already occur in any tweet.
df['text'].str.contains('atsymbol', regex=False).value_counts()

False    7613
Name: text, dtype: int64

In [105]:
# Swap each literal '@' for the token 'atsymbol ' (with trailing space) in the tweet text.
df['text'] = df['text'].str.replace('@', 'atsymbol ', regex=False)

In [106]:
# Final per-column count of missing values after the fills above.
df.isna().sum()

id          0
keyword     0
location    0
text        0
target      0
dtype: int64

Okay, that's all the null values handled; time to concatenate the keyword, location, and text columns into one string per tweet and then model.

In [107]:
# List the column names available before building the combined text column.
df.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [108]:
# Join keyword, location and text into one space-separated string per row.
df['all_text'] = df['keyword'].str.cat([df['location'], df['text']], sep=' ')

In [109]:
# Preview the first five rows, now including the all_text column.
df.head(n=5)

Unnamed: 0,id,keyword,location,text,target,all_text
0,1,nokeyword,nolocgiven,Our Deeds are the Reason of this earthquake Ma...,1,nokeyword nolocgiven Our Deeds are the Reason ...
1,4,nokeyword,nolocgiven,Forest fire near La Ronge Sask. Canada,1,nokeyword nolocgiven Forest fire near La Ronge...
2,5,nokeyword,nolocgiven,All residents asked to 'shelter in place' are ...,1,nokeyword nolocgiven All residents asked to 's...
3,6,nokeyword,nolocgiven,"13,000 people receive wildfires evacuation ord...",1,"nokeyword nolocgiven 13,000 people receive wil..."
4,7,nokeyword,nolocgiven,Just got sent this photo from Ruby Alaska as s...,1,nokeyword nolocgiven Just got sent this photo ...


In [125]:
# Spot-check five random combined strings. The fixed random_state makes the
# same rows appear on every run, so the preview is reproducible after a kernel restart.
df['all_text'].sample(n=5, random_state=0)

6023    seismic Somalia Exploration takes seismic shif...
3171    emergency percenttwenty plan Alexandria, VA, U...
4563    injuries Orlando,FL  USA Official kinesiology ...
3385    evacuation ÌÏT: 43.631838,-79.55807 INK Entert...
5372    panic Topeka, KS The good thing is that the Ro...
Name: all_text, dtype: object

In [126]:
# Write the cleaned training frame to disk, leaving out the row index.
df.to_csv(path_or_buf='training_clean_2.csv', index=False)