In [175]:
import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords
import spacy
from spacy.tokenizer import Tokenizer
# Load the spaCy pipeline named 'en_core_web_sm'; remove_stop_words() reads
# its default stop-word set through sp.Defaults.stop_words.
# NOTE(review): presumably the 'en_core_web_sm' model package must already be
# installed in the kernel's environment — confirm.
sp = spacy.load('en_core_web_sm')
# English Snowball stemmer instance shared by stemming().
stemmer = nltk.SnowballStemmer(language='english')

In [184]:

# Load the training set and discard the 'keyword' and 'location' columns,
# which the rest of the notebook does not use. The drop is chained (no
# inplace=True) so `data` is built in a single expression, and each column is
# listed exactly once.
data = pd.read_csv('../data/train.csv').drop(columns=['keyword', 'location'])

# Preview the first 50 rows.
data.head(50)

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1
5,8,#RockyFire Update => California Hwy. 20 closed...,1
6,10,#flood #disaster Heavy rain causes flash flood...,1
7,13,I'm on top of the hill and I can see a fire in...,1
8,14,There's an emergency evacuation happening now ...,1
9,15,I'm afraid that the tornado is coming to our a...,1


In [177]:

def clean_text(text):
    '''Normalise a piece of text before tokenisation.

    The steps run in this order:
      1. convert the input to ``str`` and lowercase it;
      2. remove anything enclosed in square brackets;
      3. remove links starting with http://, https:// or www.;
      4. remove angle-bracket tags;
      5. remove every character listed in ``string.punctuation``;
      6. remove newline characters (nothing is inserted in their place,
         so words on adjacent lines end up joined);
      7. remove words that contain at least one digit.

    Whitespace is not collapsed, so a removed token can leave two
    consecutive spaces behind.

    Args:
        text: Value to clean; it is passed through ``str()`` first, so
            non-string input is accepted.

    Returns:
        The cleaned, lowercased string.
    '''
    text = str(text).lower()
    # Raw strings keep regex escapes such as \[ and \S from being parsed as
    # (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Punctuation goes before the digit filter, so "2.56%" becomes "256" and
    # is then dropped as a word containing digits.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = text.replace('\n', '')
    text = re.sub(r'\w*\d\w*', '', text)
    return text
# Sanity check: run clean_text() on one sample string and print the text
# before and after cleaning.
tweet = 'Image prediction: throne Confidence: 2.56% Submission by @Phobiafool'
tweet_cleaned = clean_text(tweet)
print(f'Before cleaning: {tweet}')
print(f'After cleaning: {tweet_cleaned}')


Before cleaning: Image prediction: throne Confidence: 2.56% Submission by @Phobiafool
After cleaning: image prediction throne confidence  submission by phobiafool


In [178]:
# Clean every entry of the 'text' column; the column is overwritten with the
# cleaned strings.
data['text'] = data['text'].apply(clean_text)
data

Unnamed: 0,id,text,target
0,1,our deeds are the reason of this earthquake ma...,1
1,4,forest fire near la ronge sask canada,1
2,5,all residents asked to shelter in place are be...,1
3,6,people receive wildfires evacuation orders in...,1
4,7,just got sent this photo from ruby alaska as s...,1
...,...,...,...
7608,10869,two giant cranes holding a bridge collapse int...,1
7609,10870,ariaahrary thetawniest the out of control wild...,1
7610,10871,s of volcano hawaii,1
7611,10872,police investigating after an ebike collided w...,1


In [179]:
def remove_stop_words(text):
    """Return `text` with spaCy's default stop words removed.

    The text is split on single spaces. A token is kept when it is non-empty
    and is not a member of ``sp.Defaults.stop_words``; the comparison is an
    exact string match. Kept tokens are joined back with single spaces.

    Args:
        text: String to filter.

    Returns:
        The filtered string.
    """
    stop_words = sp.Defaults.stop_words
    kept = []
    for token in text.split(' '):
        # Empty tokens come from consecutive spaces in the input.
        if token and token not in stop_words:
            kept.append(token)
    return ' '.join(kept)
# Sanity check: run remove_stop_words() on a sample sentence and print the
# text before and after. The sentence is passed in as-is (no clean_text()
# call first), so its capitalisation and punctuation are still present.
test = "Nick likes to play football, however he is not too fond of tennis."
result = remove_stop_words(test)
print(f'Before removing stopwords: {test}')
print(f'After removing stopwords: {result}')

Before removing stopwords: Nick likes to play football, however he is not too fond of tennis.
After removing stopwords: Nick likes play football, fond tennis.


In [180]:
def stemming(text):
    """Stem every space-separated token of `text`.

    The text is split on single spaces, each piece is passed to the
    module-level ``stemmer``'s ``stem`` method, and the results are joined
    back with single spaces.

    Args:
        text: String whose tokens should be stemmed.

    Returns:
        The string of stemmed tokens.
    """
    return ' '.join(map(stemmer.stem, text.split(' ')))

In [181]:
# Remove stop words first, then stem what is left; the 'text' column is
# overwritten with the result.
data['text'] = data['text'].apply(remove_stop_words).apply(stemming)
data

Unnamed: 0,id,text,target
0,1,deed reason earthquak allah forgiv,1
1,4,forest fire near la rong sask canada,1
2,5,resid ask shelter place notifi offic evacu she...,1
3,6,peopl receiv wildfir evacu order california,1
4,7,got sent photo rubi alaska smoke wildfir pour ...,1
...,...,...,...
7608,10869,giant crane hold bridg collaps nearbi home,1
7609,10870,ariaahrari thetawniest control wild fire calif...,1
7610,10871,s volcano hawaii,1
7611,10872,polic investig ebik collid car littl portug eb...,1


In [183]:
# Split the processed data into training and test sets.
from sklearn.model_selection import train_test_split

# Features are the processed text, labels are the 'target' column.
x, y = data['text'], data['target']

# A fixed random_state keeps the split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

# Print the size of each split; features and labels should match in length.
print(len(x_train), len(y_train))
print(len(x_test), len(y_test))

5709 5709
1904 1904
