En el siguiente notebook se probarán distintas combinaciones de features con los modelos que dieron mejores resultados con sus mejores hiperparámetros.

In [76]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import NuSVC, SVC
from sklearn.linear_model import LogisticRegression 
from itertools import combinations

In [61]:
from google.colab import files

In [102]:
file = files.upload()
df = pd.read_csv("train.csv")

Saving train.csv to train (3).csv


In [103]:
df['id'] = df['id'].astype(np.uint16)
df['target'] = df['target'].astype(np.uint8)
df = df.fillna("")

# Features

In [104]:
keywords_target = df.groupby(["keyword", "target"]).count()["id"]

In [105]:
def keyword_mean(x):
  try:
      targets = keywords_target[x]
  except KeyError:
      return 0.5
  if len(targets) == 2:
      return targets[0]/(targets[0]+targets[1])
  else:
      try:
          return targets[0]/targets[0]
      except KeyError:
          return 0

In [106]:
df["keyword_mean"] = df["keyword"].transform(keyword_mean)

In [107]:
df["keyword_en_tweet"] = df.agg(lambda x: 1 if x["keyword"].lower() in x["text"].lower().split() else 0, axis=1)

In [108]:
df["long"] = df["text"].transform(lambda x: len(x))
long_max = df["long"].max()
df["long"] = df["long"]/long_max

In [109]:
df["nro_term"] = df["text"].transform(lambda x: len(x.split()))
nro_term_max = df["nro_term"].max()
df["nro_term"] = df["nro_term"]/nro_term_max

In [110]:
def hay_url(x):
    if re.search('https{0,1}:\/\/\S*', x) is not None:
        return 1
    else:
        return 0
    
df["hay_url"] = df["text"].transform(hay_url)

In [111]:
def hay_nros(x):
    x = x.split()
    for i in x:
        i = i.replace(',','')
        try:
            float(i)
            return 1
        except ValueError:
            continue
    return 0 

df["hay_nros"] = df["text"].transform(hay_nros)

In [112]:
def hay_mencion(x):
    for i in x.split():
        if i[0]=='@':
            return 1
    return 0

df["hay_mencion"] = df["text"].transform(hay_mencion)

In [113]:
def hay_hashtag(x):
    for i in x.split():
        if i[0]=='#':
            return 1
    return 0

df["hay_hashtag"] = df["text"].transform(hay_hashtag)

In [114]:
# Función para limpiar el texto de los mensajes.
def clean_text(text):
    # Se convierte el texto a minúsculas.
    text = text.lower()
    # Se quitan los '#'.
    text = re.sub('#', '', text)
    # Se quitan los números.
    text = re.sub('\w*\d\w*', '', text)
    # Se quitan los saltos de línea.
    text = re.sub('\n', ' ', text)
    # Se eliminan las referencias a usuarios '@user'.
    text = re.sub('@\S*', '', text)
    # Se quitan vínculos URL.
    text = re.sub('https{0,1}:\/\/\S*', ' ', text)
    # Se simplifican múltiples espacios a uno solo.
    text = re.sub('(\ ){2,7}', ' ',text)
    # Se quitan los signos de puntuación.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    return text

In [79]:
c_vect = CountVectorizer(stop_words='english', preprocessor=clean_text, max_df=0.5, min_df=5)

bow_cols = pd.DataFrame(c_vect.fit_transform(df["text"]).toarray())

nuevas_col = pd.Series(dtype=object)
for col in bow_cols.columns:
    nuevas_col = nuevas_col.append(pd.Series("bow_" + str(col), dtype=object))
bow_cols.columns = nuevas_col

df = pd.concat(objs=[df, bow_cols], axis=1)

In [98]:
h_vect = HashingVectorizer(stop_words='english', preprocessor=clean_text, n_features=101, norm=None)

fh_cols = pd.DataFrame(h_vect.fit_transform(df["text"]).toarray())

nuevas_col = pd.Series(dtype=object)
for col in fh_cols.columns:
    nuevas_col = nuevas_col.append(pd.Series("fh101_" + str(col), dtype=object))
fh_cols.columns = nuevas_col

df = pd.concat(objs=[df, fh_cols], axis=1)

In [115]:
tfidf_vect = TfidfVectorizer(stop_words='english', preprocessor=clean_text, max_df=0.5, min_df=5)

tfidf_cols = pd.DataFrame(tfidf_vect.fit_transform(df["text"]).toarray())

nuevas_col = pd.Series(dtype=object)
for col in tfidf_cols.columns:
    nuevas_col = nuevas_col.append(pd.Series("tfidf_" + str(col), dtype=object))
tfidf_cols.columns = nuevas_col

df = pd.concat(objs=[df, tfidf_cols], axis=1)

In [None]:
h_vect_3g = HashingVectorizer(stop_words='english', preprocessor=clean_text, ngram_range=(1,3), n_features=1001, norm=None)

fh3g_cols = pd.DataFrame(h_vect_3g.fit_transform(df["text"]).toarray())

nuevas_col = pd.Series(dtype=object)
for col in fh3g_cols.columns:
    nuevas_col = nuevas_col.append(pd.Series("fh3g1001_" + str(col), dtype=object))
fh3g_cols.columns = nuevas_col

df = pd.concat(objs=[df, fh3g_cols], axis=1)

In [116]:
df_f = df.drop(labels=["id", "text", "keyword", "location", "target"], axis=1)

# KNN

In [16]:
knn = KNeighborsClassifier(n_neighbors=41, n_jobs=-1)

## Con dos features

In [121]:
score = cross_val_score(knn, df_f[["keyword_mean", "nro_term"]], y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.6640419947506562, MAX:0.8160315374507228, MEAN:0.7357207155938622


In [122]:
best_score = score.mean()
best_comb = ["keyword_mean","nro_term"]
for comb in list(combinations(df_f.iloc[:, :8].columns, 2)):
  new_scores = cross_val_score(knn, df_f[[i for i in comb]], y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
  if new_scores.mean() > best_score.mean():
    best_score = new_scores
    best_comb = comb
print("Best_comb: {}, MIN: {}, MAX:{}, MEAN:{}".format(best_comb, best_score.min(), best_score.max(), best_score.mean()))

Best_comb: ['keyword_mean', 'nro_term'], MIN: 0.7357207155938622, MAX:0.7357207155938622, MEAN:0.7357207155938622


## Con tres features

In [23]:
score = cross_val_score(knn, df_f[["keyword_mean", "long", "nro_term"]], y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.6863517060367454, MAX:0.8149606299212598, MEAN:0.7459653860613022


In [57]:
best_score = score.mean()
best_comb = ["keyword_mean","long","nro_term"]
for comb in list(combinations(df_f.iloc[:, :8].columns, 3)):
  new_scores = cross_val_score(knn, df_f[[i for i in comb]], y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
  if new_scores.mean() > best_score.mean():
    best_score = new_scores
    best_comb = comb
print("Best_comb: {}, MIN: {}, MAX:{}, MEAN:{}".format(best_comb, best_score.min(), best_score.max(), best_score.mean()))

Best_comb: ('keyword_mean', 'long', 'nro_term'), MIN: 0.6863517060367454, MAX:0.8149606299212598, MEAN:0.7459653860613022


## Con cuatro features

In [119]:
score = cross_val_score(knn, df_f[["keyword_mean", "long", "nro_term", "keyword_en_tweet"]], y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.6846254927726675, MAX:0.8107752956636005, MEAN:0.7478012768114893


In [120]:
best_score = score.mean()
best_comb = ["keyword_mean","long","nro_term","keyword_en_tweet"]
for comb in list(combinations(df_f.iloc[:, :8].columns, 4)):
  new_scores = cross_val_score(knn, df_f[[i for i in comb]], y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
  if new_scores.mean() > best_score.mean():
    best_score = new_scores
    best_comb = comb
print("Best_comb: {}, MIN: {}, MAX:{}, MEAN:{}".format(best_comb, best_score.min(), best_score.max(), best_score.mean()))

Best_comb: ('keyword_mean', 'keyword_en_tweet', 'long', 'hay_url'), MIN: 0.7034120734908137, MAX:0.8097112860892388, MEAN:0.7509543320882524


## Con cinco features

In [123]:
score = cross_val_score(knn, df_f[["keyword_mean", "long", "nro_term", "keyword_en_tweet", "hay_url"]], y=df["target"], \
                        scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.6902887139107612, MAX:0.8097112860892388, MEAN:0.7485904028750677


In [124]:
best_score = score.mean()
best_comb = ["keyword_mean","long","nro_term","keyword_en_tweet","hay_url"]
for comb in list(combinations(df_f.iloc[:, :8].columns, 5)):
  new_scores = cross_val_score(knn, df_f[[i for i in comb]], y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
  if new_scores.mean() > best_score.mean():
    best_score = new_scores
    best_comb = comb
print("Best_comb: {}, MIN: {}, MAX:{}, MEAN:{}".format(best_comb, best_score.min(), best_score.max(), best_score.mean()))

Best_comb: ('keyword_mean', 'keyword_en_tweet', 'long', 'hay_url', 'hay_nros'), MIN: 0.7069645203679369, MAX:0.8202099737532809, MEAN:0.7546319768504627


# NuSVC

In [58]:
nusvc = NuSVC(nu=0.7)

## Con BOW

In [59]:
score = cross_val_score(nusvc, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.7020997375328084, MAX:0.8160315374507228, MEAN:0.7597606064682125


## Con Feature Hashing de BOW de 101 columnas

In [101]:
score = cross_val_score(nusvc, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.6846254927726675, MAX:0.8147174770039421, MEAN:0.7428133654778042


## Con BOW más Feature Hashing de BOW de 101 columnas 

In [84]:
score = cross_val_score(nusvc, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

MIN: 0.7034120734908137, MAX:0.8160315374507228, MEAN:0.7491182688891878


## Con TF-TDF

In [118]:
score = cross_val_score(nusvc, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))



MIN: 0.699475065616798, MAX:0.8278580814717477, MEAN:0.7560798576262067


## Con Feature Hashing de BOW de unigramas hasta trigramas de 1001 columnas

In [None]:
score = cross_val_score(nusvc, df_f, y=df["target"], scoring="accuracy", cv=10, n_jobs=-1)
print("MIN: {}, MAX:{}, MEAN:{}".format(score.min(), score.max(), score.mean()))

# SVC

In [None]:
svc = SVC(C=0.15) 