In [1]:
import pandas as pd
import numpy as np
import re
import string
from spellchecker import SpellChecker

In [2]:
df = pd.read_csv('train.csv', encoding='ISO-8859-1')

In [100]:
df['target'] = df['target'].astype(np.uint8)
df.set_index('id', inplace = True)

In [3]:
df.sample(10, random_state=42)

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÂÃÃ...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0
5559,7934,rainstorm,,@Calum5SOS you look like you got caught in a r...,0
1765,2538,collision,,my favorite lady came to our volunteer meeting...,1
1817,2611,crashed,,@brianroemmele UX fail of EMV - people want to...,1
6810,9756,tragedy,"Los Angeles, CA",Can't find my ariana grande shirt this is a f...,0
4398,6254,hijacking,"Athens,Greece",The Murderous Story Of AmericaÂÃÂªs First Hi...,1


In [4]:
sentence = df.loc[4398, 'text']
sentence

'The Murderous Story Of AmericaÂ\x89Ã\x9bÂªs First Hijacking http://t.co/EYUGk6byxr'

### Cosas a tener en cuenta.
- La columna `id` puede no ser indicativa de nada para la predicción. Puede ser conveniente usarlo como índice;
- Hay 7613 casos en el train, si se quitan los nulos en `keyword` solo quedan los nulos en `location` y no se superponen.

In [5]:
# Función para limpiar el texto de los mensajes.
def clean_text(text):
    # Se convierte el texto a minúsculas.
    text = text.lower()
    # Se quitan los '#'.
    text = re.sub('#', '', text)
    # Se quitan los números.
    text = re.sub('\w*\d\w*', '', text)
    # Se quitan las llaves y atributos de html.
    text = re.sub(r'<.*?>', '', text)
    # Se quitan los saltos de línea.
    text = re.sub('\n', ' ', text)
    # Se eliminan las referencias a usuarios '@user'.
    text = re.sub('@\S*', '', text)
    # Se quitan vínculos URL.
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # Se simplifican múltiples espacios a uno solo.
    text = re.sub('(\ ){2,7}', ' ',text)
    # Se quitan los signos de puntuación.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Se quitan los caracteres no ASCII
    text = re.sub(r'[^\x00-\x7F]+','ñ', text)
    return text

df['text'] = df['text'].apply(lambda x: clean_text(x))

#======= MUESTRA ALEATORIA =======
# Se elige un ancho para la visualización de columnas
pd.set_option('max_colwidth', 150)

display(df.sample(10, random_state=42))

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,so you have a new weapon that can cause unimaginable destruction,1
2227,3185,deluge,,the famp things i do for gishwhes just got soaked in a deluge going for pads and tampons thx,0
5448,7769,police,UK,dt rt ñthe col police can catch a pickpocket in liverpool stree,1
132,191,aftershock,,aftershock back to school kick off was great i want to thank everyone for making it possible what a great night,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma children of addicts develop a defensive self one that decreases vulnerability,0
5559,7934,rainstorm,,you look like you got caught in a rainstorm this is amazing and disgusting at the same time,0
1765,2538,collision,,my favorite lady came to our volunteer meeting hopefully joining her youth collision and i am excite,1
1817,2611,crashed,,ux fail of emv people want to insert and remove quickly like a gas pump stripe reader person told me it crashed the pos,1
6810,9756,tragedy,"Los Angeles, CA",cant find my ariana grande shirt this is a fucking tragedy,0
4398,6254,hijacking,"Athens,Greece",the murderous story of americañs first hijacking,1


### Se corrigen las palabras

In [6]:
corrector = SpellChecker()

In [7]:
# Las palabras deben estar en minúscula para que funcione
def corregir_terminos(texto):
    lista_palabras = texto.split()
    corregidos = []
    a_corregir = corrector.unknown(lista_palabras)
    
    for palabra in lista_palabras:
        if palabra in a_corregir:
            corregidos.append(corrector.correction(palabra))
        else:
            corregidos.append(palabra)
    return " ".join(corregidos)

In [None]:
df['text'] = df['text'].apply(lambda x: corregir_terminos(x))