In [61]:
import nltk
import re
import unidecode
import pandas as pd
import numpy as np
import sys
sys.path.append('../')

import warnings
warnings.filterwarnings('ignore')

In [62]:
# Read the training and test splits from the shared data directory.
train, test = map(pd.read_csv, ('../data/train.csv', '../data/test.csv'))

In [63]:
# Peek at the first ten raw tweets before any cleaning.
train['text'].head(10)

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
5    #RockyFire Update => California Hwy. 20 closed...
6    #flood #disaster Heavy rain causes flash flood...
7    I'm on top of the hill and I can see a fire in...
8    There's an emergency evacuation happening now ...
9    I'm afraid that the tornado is coming to our a...
Name: text, dtype: object

In [64]:
# Matches an http(s):// or bare www. link together with any whitespace directly
# in front of it, so removing the match leaves no dangling space behind.
# The raw string keeps \s, \S and \. as regex escapes instead of (invalid)
# Python string escapes, and the non-capturing group makes the leading
# whitespace apply to both alternatives rather than only to the http branch.
url = re.compile(r'\s*(?:https?://\S+|www\.\S+)')

In [65]:
# Sanity check on a sample tweet: the trailing link should disappear.
sample_tweet = 'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ'
url.sub('', sample_tweet)

'M1.94 [01:04 UTC]?5km S of Volcano Hawaii.'

In [66]:
# NOTE(review): same call on the same input as the cell right above — looks like
# a leftover re-run; consider deleting this cell.
url.sub('', 'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ')

'M1.94 [01:04 UTC]?5km S of Volcano Hawaii.'

In [67]:
import string

In [117]:
# Show the ASCII punctuation characters that preprocessing() deletes from each tweet.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [120]:
# Character class matching any single punctuation mark; re.escape keeps
# characters that are special inside [...] (such as ], ^, \ and -) literal.
'[' + re.escape(string.punctuation) + ']'

'[!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}\\~]'

In [121]:
# Compiled once at definition time instead of being rebuilt on every call.
# All patterns are raw strings so that \[, \S, \w and \d reach the regex
# engine as regex escapes rather than as invalid Python string escapes.
_BRACKETED_RE = re.compile(r'\[.*?\]')
_LINK_RE = re.compile(r'https?://\S+|www\.\S+')
_ANGLE_TAG_RE = re.compile(r'<.*?>+')
_PUNCTUATION_RE = re.compile(f'[{re.escape(string.punctuation)}]')
_LINE_BREAK_RE = re.compile(r'[\r\n]+')
_WORD_WITH_DIGIT_RE = re.compile(r'\w*\d\w*')


def preprocessing(text: str) -> str:
    """Normalize one raw tweet into lowercase text ready for tokenization.

    Steps, in order: lowercase, transliterate to ASCII with ``unidecode``,
    drop ``[...]`` spans, drop http(s)/www links, drop ``<...>`` spans,
    delete ASCII punctuation, turn line breaks into a space, and drop every
    word that contains a digit.

    Args:
        text: The raw tweet text.

    Returns:
        The cleaned text. Whitespace is not collapsed, so removed pieces may
        leave consecutive spaces behind.
    """
    text = text.lower()
    text = unidecode.unidecode(text)  # strip accents / transliterate to ASCII

    text = _BRACKETED_RE.sub('', text)  # remove bracketed spans, e.g. "[hello mom]"
    text = _LINK_RE.sub('', text)  # remove links
    text = _ANGLE_TAG_RE.sub('', text)  # remove <xxx>-style expressions
    text = _PUNCTUATION_RE.sub('', text)  # delete every character in string.punctuation

    # Replace line breaks with a space: deleting them outright would glue the
    # last word of one line to the first word of the next.
    text = _LINE_BREAK_RE.sub(' ', text)
    text = _WORD_WITH_DIGIT_RE.sub('', text)  # remove words containing digits (codes, ids)

    return text

In [79]:
# Clean the text column of both splits with the same preprocessing function.
for frame in (train, test):
    frame.text = frame.text.apply(preprocessing)

In [83]:
# Inspect the first ten tweets again to check the result of this step.
train['text'].head(10)

0         deeds reason earthquake may allah forgive us
1                forest fire near la ronge sask canada
2    residents asked shelter place notified officer...
3    people receive wildfires evacuation orders cal...
4    got sent photo ruby alaska smoke wildfires pou...
5    rockyfire update california hwy closed directi...
6    flood disaster heavy rain causes flash floodin...
7                           im top hill see fire woods
8    theres emergency evacuation happening building...
9                        im afraid tornado coming area
Name: text, dtype: object

## Tokenization

In [71]:
# Tokenizer that yields each run of word characters as one token.
tokenizer = nltk.tokenize.RegexpTokenizer(pattern=r'\w+')

In [72]:
# Split every tweet into a list of word tokens, for both splits.
train['text'], test['text'] = (
    split['text'].apply(tokenizer.tokenize) for split in (train, test)
)

In [73]:
# Spot-check five random rows; no random_state is passed, so the rows differ between runs.
train.sample(n=5)

Unnamed: 0,id,keyword,location,text,target
2867,4121,drought,"Charlotte, NC","[blog, rain, much, needed, as, drought, condit...",1
5387,7687,panic,Toronto,"[tomorrows, going, to, be, a, year, since, i, ...",1
2612,3748,destruction,Patra-Greece.,"[new, ran, report, from, the, frontlines, of, ...",1
2578,3696,destroy,,"[engineermatarai, ate, mataas, kc, ratingbut, ...",0
5110,7289,nuclear%20disaster,Under Santa Barbara Skies,"[years, ago, today, hiroshima, was, the, first...",1


## Remove stopwords

In [74]:
# Fetch the NLTK stopwords corpus used by the filtering step below
# (the log output shows it is a no-op when the package is already up to date).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [75]:
# Build the stopword collection once. Calling stopwords.words('english')
# inside the comprehension would run it again for every single token, and a
# set gives constant-time membership tests instead of a linear list scan.
english_stopwords = set(nltk.corpus.stopwords.words('english'))


def remove_stopwords(tokens):
    """Return the tokens of one tweet without English stopwords, order preserved."""
    return [token for token in tokens if token not in english_stopwords]


train.text = train.text.apply(remove_stopwords)
test.text = test.text.apply(remove_stopwords)

In [76]:
# Glue each token list back into a single space-separated string.
train.text = train.text.apply(' '.join)
test.text = test.text.apply(' '.join)

In [77]:
# Final look at the first ten tweets after stopword removal and re-joining.
train['text'].head(10)

0         deeds reason earthquake may allah forgive us
1                forest fire near la ronge sask canada
2    residents asked shelter place notified officer...
3    people receive wildfires evacuation orders cal...
4    got sent photo ruby alaska smoke wildfires pou...
5    rockyfire update california hwy closed directi...
6    flood disaster heavy rain causes flash floodin...
7                           im top hill see fire woods
8    theres emergency evacuation happening building...
9                        im afraid tornado coming area
Name: text, dtype: object