# Features 

En este notebook se irán generando distintos features que luego deberán ser probados.

In [1]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer

In [2]:
# Load the training set from a CSV file expected in the notebook's working directory.
df = pd.read_csv('train.csv')

In [3]:
# Shrink the two integer columns to small unsigned dtypes and replace every
# missing value with an empty string, all in one chained expression.
df = df.astype({'id': np.uint16, 'target': np.uint8}).fillna("")

## Features a partir de la keyword 

#### One Hot Encoding de las keywords 

In [4]:
# One indicator column per distinct keyword value (the empty string, which
# stands for a missing keyword after fillna, gets its own column too).
ohe_keyword = pd.get_dummies(df["keyword"])

In [5]:
# Prefix every one-hot column with "ohe_keyword_" so its origin stays visible
# once it is merged into df. The names are built in a single pass: growing a
# Series with Series.append inside a loop is quadratic and Series.append no
# longer exists in pandas >= 2.0.
nuevas_col = ["ohe_keyword_" + col for col in ohe_keyword.columns]
ohe_keyword.columns = nuevas_col

In [6]:
# Append the keyword one-hot columns to df side by side (axis=1); rows are aligned on the index.
df = pd.concat(objs=[df, ohe_keyword], axis=1)

#### Count Encoding de las keywords 

In [7]:
# Number of rows per distinct keyword value, indexed by keyword.
keywords = df["keyword"].value_counts()

In [8]:
# Count encoding: look each row's keyword up in the frequency table.
df["keyword_count"] = df["keyword"].map(keywords)

Versión normalizada:

In [9]:
# Divide by the count of the most frequent keyword, so the largest value is 1.
df["keyword_count_norm"] = df["keyword_count"]/keywords.max()

#### Mean Encoding de las keywords 

In [10]:
# Rows per (keyword, target) pair, as a Series with a two-level index.
# "id" is selected before counting so that only that column is aggregated;
# counting every column of df (which already carries the one-hot columns) and
# then keeping just "id" gives the same Series at a much higher cost.
keywords_target = df.groupby(["keyword", "target"])["id"].count()

In [None]:
def keyword_mean(x):
    """Return the share of rows with keyword ``x`` whose target is 0.

    Reads the module-level ``keywords_target`` Series (row counts indexed by
    keyword and target).

    NOTE(review): this is the proportion of target 0, i.e. the complement of
    the usual mean-of-target encoding, assuming target only takes 0 and 1.

    Parameters
    ----------
    x : str
        Keyword to encode.

    Returns
    -------
    float or int
        0.5 when ``x`` is not present in ``keywords_target``; 0 when no row
        with that keyword has target 0; otherwise count(target 0) / total.
    """
    try:
        targets = keywords_target[x]
    except KeyError:
        # Keyword never seen: fall back to a neutral value.
        return 0.5
    if 0 not in targets.index:
        return 0
    return targets[0] / targets.sum()

In [11]:
# Encode every row's keyword with keyword_mean, one value at a time.
df["keyword_mean"] = df["keyword"].map(keyword_mean)

##### Feature que indica si la keyword está o no en el tweet: 

In [13]:
# 1 when the lowercased keyword equals one of the whitespace-separated tokens
# of the lowercased tweet, else 0. An empty keyword or one containing
# whitespace can never match, since split() only yields non-empty,
# whitespace-free tokens.
df["keyword_en_tweet"] = [
    1 if palabra.lower() in tweet.lower().split() else 0
    for palabra, tweet in zip(df["keyword"], df["text"])
]

## Features a partir de la location 

#### One Hot Encoding de las locations

In [14]:
# One indicator column per distinct location value (the empty string, which
# stands for a missing location after fillna, gets its own column too).
ohe_location = pd.get_dummies(df["location"])

In [15]:
# Prefix every one-hot column with "ohe_location_" so its origin stays visible
# once it is merged into df. The names are built in a single pass: growing a
# Series with Series.append inside a loop is quadratic (there is one column per
# distinct location) and Series.append no longer exists in pandas >= 2.0.
nuevas_col = ["ohe_location_" + col for col in ohe_location.columns]
ohe_location.columns = nuevas_col

In [16]:
# Append the location one-hot columns to df side by side (axis=1); rows are aligned on the index.
df = pd.concat(objs=[df, ohe_location], axis=1)

#### Count Encoding de las locations

In [17]:
# Number of rows per distinct location value, indexed by location.
locations = df["location"].value_counts()

In [18]:
# Count encoding: look each row's location up in the frequency table.
df["location_count"] = df["location"].map(locations)

Versión normalizada:

In [19]:
# Divide by the count of the most frequent location, so the largest value is 1.
df["location_count_norm"] = df["location_count"]/locations.max()

#### Mean Encoding de las locations

In [20]:
# Rows per (location, target) pair, as a Series with a two-level index.
# "id" is selected before counting so that only that column is aggregated;
# counting every column of df (which already carries one one-hot column per
# keyword and per location) and then keeping just "id" gives the same Series
# at a much higher cost.
locations_target = df.groupby(["location", "target"])["id"].count()

In [None]:
def location_mean(x):
    """Return the share of rows with location ``x`` whose target is 0.

    Reads the module-level ``locations_target`` Series (row counts indexed by
    location and target).

    NOTE(review): this is the proportion of target 0, i.e. the complement of
    the usual mean-of-target encoding, assuming target only takes 0 and 1.

    Parameters
    ----------
    x : str
        Location to encode.

    Returns
    -------
    float or int
        0.5 when ``x`` is not present in ``locations_target``; 0 when no row
        with that location has target 0; otherwise count(target 0) / total.
    """
    try:
        targets = locations_target[x]
    except KeyError:
        # Location never seen: fall back to a neutral value.
        return 0.5
    if 0 not in targets.index:
        return 0
    return targets[0] / targets.sum()

In [21]:
# Encode every row's location with location_mean, one value at a time.
df["location_mean"] = df["location"].map(location_mean)

## Features a partir del tweet 

##### Feature que indica la longitud del tweet:

In [23]:
# Tweet length in characters.
df["long"] = df["text"].str.len()

Versión normalizada:

In [24]:
# Length relative to the longest tweet in df, so no value exceeds 1.
df["long_norm"] = df["long"]/df["long"].max()

##### Feature que indica la cantidad de términos en el tweet:

In [25]:
# Number of whitespace-separated terms in the tweet.
df["nro_term"] = df["text"].str.split().str.len()

Versión normalizada:

In [26]:
# Term count relative to the tweet with the most terms in df, so no value exceeds 1.
df["nro_term_norm"] = df["nro_term"]/df["nro_term"].max()

##### Feature que indica si el tweet contiene una URL:

In [27]:
def hay_url(x):
    """Return 1 if ``x`` contains an ``http://`` or ``https://`` URL, else 0.

    Parameters
    ----------
    x : str
        Tweet text.

    Returns
    -------
    int
        1 when the scheme prefix is found anywhere in the text, 0 otherwise.
    """
    # Raw string: the previous non-raw literal relied on the invalid escape
    # sequences "\/" and "\S", which Python flags with a warning.
    return 1 if re.search(r'https?://\S*', x) is not None else 0
    
# Flag, per tweet, whether it contains a URL.
df["hay_url"] = df["text"].map(hay_url)

##### Feature que indica si en el tweet hay algún número o no:

In [28]:
def hay_nros(x):
    """Return 1 if any whitespace-separated token of ``x`` is a number, else 0.

    Commas are removed from each token first, so "1,000" counts as a number.

    Parameters
    ----------
    x : str
        Tweet text.

    Returns
    -------
    int
        1 when at least one token parses as a number, 0 otherwise.
    """
    for token in x.split():
        candidato = token.replace(',', '')
        # float() also accepts the words "nan", "inf" and "infinity"; require
        # at least one digit so ordinary words are not reported as numbers.
        if not any(caracter.isdigit() for caracter in candidato):
            continue
        try:
            float(candidato)
        except ValueError:
            continue
        return 1
    return 0

# Flag, per tweet, whether it contains a numeric token.
df["hay_nros"] = df["text"].map(hay_nros)

##### Feature que indica si el tweet contiene una mención:

In [29]:
def hay_mencion(x):
    """Return 1 if some whitespace-separated token of ``x`` starts with '@', else 0."""
    tiene_arroba = any(token.startswith('@') for token in x.split())
    return 1 if tiene_arroba else 0

# Flag, per tweet, whether it contains a mention.
df["hay_mencion"] = df["text"].map(hay_mencion)

##### Feature que indica si el tweet contiene un hashtag:

In [30]:
def hay_hashtag(x):
    """Return 1 if some whitespace-separated token of ``x`` starts with '#', else 0."""
    # First character of every token; split() never yields empty tokens.
    iniciales = {palabra[0] for palabra in x.split()}
    return int('#' in iniciales)

# Flag, per tweet, whether it contains a hashtag.
df["hay_hashtag"] = df["text"].map(hay_hashtag)

#### Modelo BOW

In [31]:
# Function that cleans the text of the messages.
def clean_text(text):
    """Normalise a tweet before it is tokenised.

    Steps, in order: lowercase; drop '#' characters; drop every word that
    contains a digit; turn newlines into spaces; drop '@user' mentions;
    replace URLs with a space; strip ASCII punctuation; collapse runs of
    spaces into one.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text (not stripped at the ends).
    """
    # Lowercase the text.
    text = text.lower()
    # Remove the '#' characters.
    text = re.sub('#', '', text)
    # Remove every word that contains a digit.
    text = re.sub(r'\w*\d\w*', '', text)
    # Replace line breaks with spaces.
    text = re.sub('\n', ' ', text)
    # Remove user references ('@user').
    text = re.sub(r'@\S*', '', text)
    # Remove URLs.
    text = re.sub(r'https?://\S*', ' ', text)
    # Remove punctuation marks.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Collapse runs of spaces into one. This runs last and without an upper
    # bound: removing mentions or punctuation can itself leave consecutive
    # spaces, and a capped quantifier leaves long runs only partly collapsed.
    text = re.sub(r' {2,}', ' ', text)
    return text

In [32]:
# Bag-of-words counter. clean_text is used as the preprocessing step, English
# stop words are dropped, and the vocabulary keeps only terms found in at least
# 5 documents (min_df=5) and in at most half of them (max_df=0.5).
c_vect = CountVectorizer(stop_words='english', preprocessor=clean_text, max_df=0.5, min_df=5)

In [33]:
# Fit the vocabulary on the tweets and store the densified term-count matrix as
# a DataFrame. Its columns are integer positions (0, 1, ...), not the terms.
bow_cols = pd.DataFrame(c_vect.fit_transform(df["text"]).toarray())

In [34]:
# Rename the positional columns to "bow_<position>". The names are built in a
# single pass: growing a Series with Series.append inside a loop is quadratic
# and Series.append no longer exists in pandas >= 2.0.
nuevas_col = ["bow_" + str(col) for col in bow_cols.columns]
bow_cols.columns = nuevas_col

In [35]:
# Append the bag-of-words columns to df side by side (axis=1); rows are aligned on the index.
df = pd.concat(objs=[df, bow_cols], axis=1)

#### Feature Hashing de 101 columnas para el tweet con modelo BOW 

In [36]:
# Hashing vectorizer that maps the cleaned, stop-word-filtered tokens onto 101
# columns; norm=None leaves the hashed values unnormalised.
h_vect = HashingVectorizer(stop_words='english', preprocessor=clean_text, n_features=101, norm=None)

In [37]:
# Hash the tweets and store the densified matrix as a DataFrame with integer column labels.
fh_cols = pd.DataFrame(h_vect.fit_transform(df["text"]).toarray())

In [38]:
# Rename the positional columns to "fh101_<position>". The names are built in a
# single pass: growing a Series with Series.append inside a loop is quadratic
# and Series.append no longer exists in pandas >= 2.0.
nuevas_col = ["fh101_" + str(col) for col in fh_cols.columns]
fh_cols.columns = nuevas_col

In [39]:
# Append the 101 feature-hashing columns to df side by side (axis=1); rows are aligned on the index.
df = pd.concat(objs=[df, fh_cols], axis=1)

#### Modelo TF-IDF

In [40]:
# TF-IDF vectorizer. clean_text is used as the preprocessing step, English stop
# words are dropped, and the vocabulary keeps only terms found in at least
# 5 documents (min_df=5) and in at most half of them (max_df=0.5).
tfidf_vect = TfidfVectorizer(stop_words='english', preprocessor=clean_text, max_df=0.5, min_df=5)

In [41]:
# Fit on the tweets and store the densified TF-IDF matrix as a DataFrame.
# Its columns are integer positions (0, 1, ...), not the terms.
tfidf_cols = pd.DataFrame(tfidf_vect.fit_transform(df["text"]).toarray())

In [42]:
# Rename the positional columns to "tfidf_<position>". The names are built in a
# single pass: growing a Series with Series.append inside a loop is quadratic
# and Series.append no longer exists in pandas >= 2.0.
nuevas_col = ["tfidf_" + str(col) for col in tfidf_cols.columns]
tfidf_cols.columns = nuevas_col

In [43]:
# Append the TF-IDF columns to df side by side (axis=1); rows are aligned on the index.
df = pd.concat(objs=[df, tfidf_cols], axis=1)

#### Feature Hashing de 1001 columnas para el tweet con modelo BOW con trigramas

In [44]:
# Hashing vectorizer over unigrams, bigrams and trigrams (ngram_range=(1,3))
# of the cleaned, stop-word-filtered tokens, mapped onto 1001 columns;
# norm=None leaves the hashed values unnormalised.
h_vect_3g = HashingVectorizer(stop_words='english', preprocessor=clean_text, ngram_range=(1,3), n_features=1001, norm=None)

In [45]:
# Hash the tweets and store the densified matrix as a DataFrame with integer column labels.
fh3g_cols = pd.DataFrame(h_vect_3g.fit_transform(df["text"]).toarray())

In [46]:
# Rename the positional columns to "fh3g1001_<position>". The names are built
# in a single pass: growing a Series with Series.append inside a loop is
# quadratic and Series.append no longer exists in pandas >= 2.0.
nuevas_col = ["fh3g1001_" + str(col) for col in fh3g_cols.columns]
fh3g_cols.columns = nuevas_col

In [47]:
# Append the 1001 n-gram feature-hashing columns to df side by side (axis=1); rows are aligned on the index.
df = pd.concat(objs=[df, fh3g_cols], axis=1)

In [48]:
#df.to_csv("train_proc.csv", index=False)