# Ejemplo sobre limpieza de texto

Fuentes: Medium, el libro *Natural Language Processing with Python* y código propio del curso.

Datos de Kaggle: [NLP with Disaster Tweets](https://www.kaggle.com/c/nlp-getting-started/data)

In [1]:
import pandas as pd

In [2]:
import re

In [3]:
import nltk.corpus

In [4]:
from nltk.corpus import stopwords

In [5]:
from nltk.stem import PorterStemmer

In [6]:
from nltk.tokenize import word_tokenize

In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [8]:
from nltk.stem import WordNetLemmatizer

In [9]:
train_data = pd.read_csv('./data/train.csv')

In [10]:
cols_to_drop = ['id', 'keyword', 'location']

In [11]:
# Keep only the columns used below by removing the ones listed in cols_to_drop.
train_data = train_data.drop(columns=cols_to_drop)

In [12]:
train_data.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


#### Normalization

In [13]:
def clean_text(df, text_field, new_text_field_name):
    """Add a normalized copy of a text column to ``df``.

    The text is lower-cased; then URLs, @mentions, a leading retweet marker
    ("rt") and every character that is not an ASCII letter, digit, space or
    tab are removed; finally all digits are removed. Matches are replaced by
    the empty string, so the surrounding whitespace is left untouched.
    Missing values stay missing.

    Args:
        df: DataFrame holding the raw text. It is modified in place.
        text_field: Name of the column containing the raw text.
        new_text_field_name: Name of the column that receives the cleaned
            text (created, or overwritten if it already exists).

    Returns:
        The same ``df`` object, with ``new_text_field_name`` filled in.
    """
    # Compiled once per call rather than looked up for every row.
    noise = re.compile(
        r"\w+://\S+"          # complete URLs (scheme://rest)
        r"|@\w+"              # @mentions; \w also covers '_' inside handles
        r"|^rt\b"             # leading retweet marker, only as a whole word
        r"|http\S*"           # any leftover token that starts with "http"
        r"|[^0-9A-Za-z \t]"   # remaining punctuation and symbols
    )
    digits = re.compile(r"\d+")

    cleaned = df[text_field].str.lower()
    # The .str accessor skips missing values, where re.sub via apply would raise.
    cleaned = cleaned.str.replace(noise, "", regex=True)
    # Digits go last so that numbers inside URLs/mentions vanish with them.
    cleaned = cleaned.str.replace(digits, "", regex=True)

    df[new_text_field_name] = cleaned
    return df

In [14]:
data_clean = clean_text(train_data, 'text', 'text_clean')

In [15]:
data_clean.head()

Unnamed: 0,text,target,text_clean
0,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...
3,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in...
4,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...


#### Stop words

In [16]:
stop = stopwords.words('english')

In [17]:
# `stop` is a list, so each membership test would scan it linearly; build a
# set once and reuse it for every word of every row.
stop_set = set(stop)
data_clean['text_clean'] = data_clean['text_clean'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_set))

In [18]:
data_clean.head()

Unnamed: 0,text,target,text_clean
0,Our Deeds are the Reason of this #earthquake M...,1,deeds reason earthquake may allah forgive us
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,1,residents asked shelter place notified officer...
3,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders cal...
4,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfires pou...


#### Stemming

In [19]:
word_list = ['rains', 'raining', 'rain', 'rained']

In [20]:
ps = PorterStemmer()

In [21]:
# Print the Porter stem of each sample word, one per line.
for stemmed in map(ps.stem, word_list):
    print(stemmed)

rain
rain
rain
rain


In [22]:
# Split every cleaned text into a list of word tokens.
data_clean['text_tokens'] = data_clean['text_clean'].apply(word_tokenize)

In [23]:
data_clean.head()

Unnamed: 0,text,target,text_clean,text_tokens
0,Our Deeds are the Reason of this #earthquake M...,1,deeds reason earthquake may allah forgive us,"[deeds, reason, earthquake, may, allah, forgiv..."
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[forest, fire, near, la, ronge, sask, canada]"
2,All residents asked to 'shelter in place' are ...,1,residents asked shelter place notified officer...,"[residents, asked, shelter, place, notified, o..."
3,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders cal...,"[people, receive, wildfires, evacuation, order..."
4,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfires pou...,"[got, sent, photo, ruby, alaska, smoke, wildfi..."


In [24]:
def word_stemmer(text):
    """Return the Porter stem of every token in ``text``.

    Args:
        text: Iterable of word tokens (for example one row of the
            ``text_tokens`` column).

    Returns:
        A list with one stemmed string per input token, in input order.
    """
    # One stemmer per call instead of a fresh instance for every token.
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in text]

In [25]:
# Stem the token list of every row.
data_clean['text_tokens_stem'] = data_clean['text_tokens'].apply(word_stemmer)

In [26]:
data_clean.head()

Unnamed: 0,text,target,text_clean,text_tokens,text_tokens_stem
0,Our Deeds are the Reason of this #earthquake M...,1,deeds reason earthquake may allah forgive us,"[deeds, reason, earthquake, may, allah, forgiv...","[deed, reason, earthquak, may, allah, forgiv, us]"
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, rong, sask, canada]"
2,All residents asked to 'shelter in place' are ...,1,residents asked shelter place notified officer...,"[residents, asked, shelter, place, notified, o...","[resid, ask, shelter, place, notifi, offic, ev..."
3,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders cal...,"[people, receive, wildfires, evacuation, order...","[peopl, receiv, wildfir, evacu, order, califor..."
4,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfires pou...,"[got, sent, photo, ruby, alaska, smoke, wildfi...","[got, sent, photo, rubi, alaska, smoke, wildfi..."


#### Lemmatization

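La lematización es una técnica que realiza un análisis morfológico completo para encontrar con mayor precisión la raíz o "lema" de una palabra. A diferencia del stemming, el resultado suele ser una palabra real (por ejemplo, "earthquake" en lugar de "earthquak").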

In [None]:
# Fetch the NLTK data this notebook relies on: the English stop-word list and
# the WordNet corpus that WordNetLemmatizer looks words up in.
nltk.download('stopwords')
nltk.download('wordnet')

In [27]:
def word_lemmatizer(text):
    """Return the WordNet lemma of every token in ``text``.

    No part-of-speech tag is passed to the lemmatizer, so each token is
    lemmatized with the lemmatizer's default part of speech.

    Args:
        text: Iterable of word tokens (for example one row of the
            ``text_tokens`` column).

    Returns:
        A list with one lemma per input token, in input order.
    """
    # One lemmatizer per call instead of a fresh instance for every token.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in text]

In [28]:
# Lemmatize the token list of every row.
data_clean['text_tokens_lemma'] = data_clean['text_tokens'].apply(word_lemmatizer)

In [29]:
data_clean.head()

Unnamed: 0,text,target,text_clean,text_tokens,text_tokens_stem,text_tokens_lemma
0,Our Deeds are the Reason of this #earthquake M...,1,deeds reason earthquake may allah forgive us,"[deeds, reason, earthquake, may, allah, forgiv...","[deed, reason, earthquak, may, allah, forgiv, us]","[deed, reason, earthquake, may, allah, forgive..."
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, rong, sask, canada]","[forest, fire, near, la, ronge, sask, canada]"
2,All residents asked to 'shelter in place' are ...,1,residents asked shelter place notified officer...,"[residents, asked, shelter, place, notified, o...","[resid, ask, shelter, place, notifi, offic, ev...","[resident, asked, shelter, place, notified, of..."
3,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders cal...,"[people, receive, wildfires, evacuation, order...","[peopl, receiv, wildfir, evacu, order, califor...","[people, receive, wildfire, evacuation, order,..."
4,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfires pou...,"[got, sent, photo, ruby, alaska, smoke, wildfi...","[got, sent, photo, rubi, alaska, smoke, wildfi...","[got, sent, photo, ruby, alaska, smoke, wildfi..."


#### Part of Speech (POS) tagging and chunking

In [30]:
def word_pos_tagger(text):
    """Tag the tokens in ``text`` with their part of speech.

    Args:
        text: List of word tokens.

    Returns:
        Whatever ``nltk.pos_tag`` returns for ``text``; in this notebook that
        is a list of ``(token, tag)`` pairs.
    """
    return nltk.pos_tag(text)

In [31]:
# Attach a part-of-speech tag to every token of every row.
data_clean['text_tokens_pos_tagged'] = data_clean['text_tokens'].apply(word_pos_tagger)

In [32]:
data_clean.head()

Unnamed: 0,text,target,text_clean,text_tokens,text_tokens_stem,text_tokens_lemma,text_tokens_pos_tagged
0,Our Deeds are the Reason of this #earthquake M...,1,deeds reason earthquake may allah forgive us,"[deeds, reason, earthquake, may, allah, forgiv...","[deed, reason, earthquak, may, allah, forgiv, us]","[deed, reason, earthquake, may, allah, forgive...","[(deeds, NNS), (reason, NN), (earthquake, NN),..."
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, rong, sask, canada]","[forest, fire, near, la, ronge, sask, canada]","[(forest, JJS), (fire, NN), (near, IN), (la, J..."
2,All residents asked to 'shelter in place' are ...,1,residents asked shelter place notified officer...,"[residents, asked, shelter, place, notified, o...","[resid, ask, shelter, place, notifi, offic, ev...","[resident, asked, shelter, place, notified, of...","[(residents, NNS), (asked, VBD), (shelter, JJ)..."
3,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders cal...,"[people, receive, wildfires, evacuation, order...","[peopl, receiv, wildfir, evacu, order, califor...","[people, receive, wildfire, evacuation, order,...","[(people, NNS), (receive, VBP), (wildfires, NN..."
4,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfires pou...,"[got, sent, photo, ruby, alaska, smoke, wildfi...","[got, sent, photo, rubi, alaska, smoke, wildfi...","[got, sent, photo, ruby, alaska, smoke, wildfi...","[(got, VBD), (sent, JJ), (photo, NN), (ruby, N..."
