In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import PorterStemmer
import re

In [4]:
# Read the raw train/test splits from the local data directory.
train, test = map(pd.read_csv, ('./data/train.csv', './data/test.csv'))

In [5]:
# Lower-case the free text and the keyword so later matching is case-insensitive,
# and mark rows without a keyword with the '-' placeholder.
# The filled Series is assigned back to the frame instead of calling
# fillna(..., inplace=True) on a column selection: the in-place form acts on an
# intermediate object and is not guaranteed to update the frame (it warns on
# pandas 2.x and has no effect under Copy-on-Write).
train['text'] = train['text'].str.lower()
train['keyword'] = train['keyword'].str.lower().fillna('-')

test['text'] = test['text'].str.lower()
test['keyword'] = test['keyword'].str.lower().fillna('-')

In [7]:
# URL-matching pattern taken from StackOverflow:
# https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python
# Compiled once so both splits are guaranteed to use the exact same expression.
URL_PATTERN = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*')

# Vectorised removal of every URL match; missing text values are passed through
# unchanged instead of raising inside a Python-level loop.
train['text_nourl'] = train['text'].str.replace(URL_PATTERN, '', regex=True)
test['text_nourl'] = test['text'].str.replace(URL_PATTERN, '', regex=True)

In [8]:
# Keywords contain URL-encoded spaces ('%20'); turn them into real spaces.
# regex=False pins literal matching, because the default of `regex` differs
# between pandas versions.
train['keyword'] = train['keyword'].str.replace('%20', ' ', regex=False)
test['keyword'] = test['keyword'].str.replace('%20', ' ', regex=False)

In [9]:
# Distinct keyword values seen in the training split, in ascending order.
keywords_sorted = list(set(train['keyword']))
keywords_sorted.sort()

In [20]:
def fill_keys(df, keywords=None):
    """Back-fill placeholder keywords from the text of each row, in place.

    For every row whose 'keyword' equals the '-' placeholder, the row's 'text'
    is searched for each candidate keyword using plain substring containment
    (so a keyword also matches inside a longer word). When several candidates
    occur in the text, the one that comes last in `keywords` is kept; with an
    ascending-sorted list that is the alphabetically last match.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'keyword' and 'text' columns. Modified in place.
    keywords : iterable of str, optional
        Candidate keywords. Defaults to the notebook-level `keywords_sorted`.

    Returns
    -------
    None
    """
    if keywords is None:
        keywords = keywords_sorted

    # Walk the candidates back-to-front so the first hit equals the last match
    # of a forward scan. The '-' placeholder is dropped: writing it back would
    # not change the row. Non-string entries cannot be substring-tested.
    candidates = [
        word for word in reversed(list(keywords))
        if isinstance(word, str) and word != '-'
    ]

    missing = df['keyword'] == '-'
    for index, text in df.loc[missing, 'text'].items():
        if not isinstance(text, str):
            # Missing / non-text value: nothing to search, keep the placeholder.
            continue
        match = next((word for word in candidates if word in text), None)
        if match is not None:
            df.loc[index, 'keyword'] = match
# Back-fill placeholder keywords in both splits; each frame is updated in place.
for frame in (train, test):
    fill_keys(frame)

In [21]:
# English Snowball stemmer used to collapse keyword variants.
snowstem = SnowballStemmer('english')

# The whole keyword string is handed to the stemmer as a single token, so a
# multi-word keyword is stemmed as one unit (e.g. 'forest fire' -> 'forest fir').
train['keywords_stemmed'] = train['keyword'].map(snowstem.stem)
test['keywords_stemmed'] = test['keyword'].map(snowstem.stem)


In [22]:
# Preview the first training rows to check the added 'text_nourl' and
# 'keywords_stemmed' columns.
train.head()

Unnamed: 0,id,keyword,location,text,target,text_nourl,keywords_stemmed
0,1,earthquake,,our deeds are the reason of this #earthquake m...,1,our deeds are the reason of this #earthquake m...,earthquak
1,4,forest fire,,forest fire near la ronge sask. canada,1,forest fire near la ronge sask. canada,forest fir
2,5,evacuation,,all residents asked to 'shelter in place' are ...,1,all residents asked to 'shelter in place' are ...,evacu
3,6,wildfire,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or...",wildfir
4,7,wildfire,,just got sent this photo from ruby #alaska as ...,1,just got sent this photo from ruby #alaska as ...,wildfir


In [23]:
# Preview the first test rows to check the added 'text_nourl' and
# 'keywords_stemmed' columns.
test.head()

Unnamed: 0,id,keyword,location,text,text_nourl,keywords_stemmed
0,0,crash,,just happened a terrible car crash,just happened a terrible car crash,crash
1,2,earthquake,,"heard about #earthquake is different cities, s...","heard about #earthquake is different cities, s...",earthquak
2,3,forest fire,,"there is a forest fire at spot pond, geese are...","there is a forest fire at spot pond, geese are...",forest fir
3,9,wildfire,,apocalypse lighting. #spokane #wildfires,apocalypse lighting. #spokane #wildfires,wildfir
4,11,typhoon,,typhoon soudelor kills 28 in china and taiwan,typhoon soudelor kills 28 in china and taiwan,typhoon


In [27]:
# Persist the cleaned splits without the positional index column.
for frame, path in (
    (train, './data/train_clean.csv'),
    (test, './data/test_clean.csv'),
):
    frame.to_csv(path, index=False)