In [1]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [2]:
from google.colab import files

In [3]:
file = files.upload()
df = pd.read_csv("train.csv")

Saving train.csv to train.csv


In [4]:
df['id'] = df['id'].astype(np.uint16)
df['target'] = df['target'].astype(np.uint8)
df = df.fillna("")

In [5]:
keywords_target = df.groupby(["keyword", "target"]).count()["id"]

In [6]:
df["keyword_mean"] = df.agg(lambda x: keywords_target[x["keyword"]][x["target"]], axis=1)

In [None]:
df["keyword_mean_norm"] = df.agg(lambda x: x["keyword_mean"]/df[df["target"]==x["target"]]["keyword_mean"].max(), axis=1)

In [7]:
locations_target = df.groupby(["location", "target"]).count()["id"]

In [8]:
df["location_mean"] = df.agg(lambda x: locations_target[x["location"]][x["target"]], axis=1)

In [9]:
df["long"] = df["text"].transform(lambda x: len(x))

In [None]:
df["long_norm"] = df["long"]/df["long"].max()

In [10]:
df["nro_term"] = df["text"].transform(lambda x: len(x.split()))

In [None]:
df["nro_term_norm"] = df["nro_term"]/df["nro_term"].max()

In [11]:
def hay_url(x):
    if re.search('https{0,1}:\/\/\S*', x) is not None:
        return 1
    else:
        return 0
    
df["hay_url"] = df["text"].transform(hay_url)

In [12]:
def hay_nros(x):
    x = x.split()
    for i in x:
        i = i.replace(',','')
        try:
            float(i)
            return 1
        except ValueError:
            continue
    return 0 

df["hay_nros"] = df["text"].transform(hay_nros)

In [13]:
def hay_mencion(x):
    for i in x.split():
        if i[0]=='@':
            return 1
    return 0

df["hay_mencion"] = df["text"].transform(hay_mencion)

In [14]:
def hay_hashtag(x):
    for i in x.split():
        if i[0]=='#':
            return 1
    return 0

df["hay_hashtag"] = df["text"].transform(hay_hashtag)

In [15]:
# Función para limpiar el texto de los mensajes.
def clean_text(text):
    # Se convierte el texto a minúsculas.
    text = text.lower()
    # Se quitan los '#'.
    text = re.sub('#', '', text)
    # Se quitan los números.
    text = re.sub('\w*\d\w*', '', text)
    # Se quitan los saltos de línea.
    text = re.sub('\n', ' ', text)
    # Se eliminan las referencias a usuarios '@user'.
    text = re.sub('@\S*', '', text)
    # Se quitan vínculos URL.
    text = re.sub('https{0,1}:\/\/\S*', ' ', text)
    # Se simplifican múltiples espacios a uno solo.
    text = re.sub('(\ ){2,7}', ' ',text)
    # Se quitan los signos de puntuación.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    return text

In [18]:
c_vect = CountVectorizer(stop_words='english', preprocessor=clean_text, max_df=0.5, min_df=5)

bow_cols = pd.DataFrame(c_vect.fit_transform(df["text"]).toarray())

nuevas_col = pd.Series(dtype=object)
for col in bow_cols.columns:
    nuevas_col = nuevas_col.append(pd.Series("bow_" + str(col), dtype=object))
bow_cols.columns = nuevas_col

df = pd.concat(objs=[df, bow_cols], axis=1)

# KNN

In [None]:
knn = KNeighborsClassifier(n_jobs=-1)

In [None]:
df_f = df[["keyword_mean","long", "nro_term"]]

In [None]:
cross_val_score(knn, df_f, y=df["target"], n_jobs=-1, scoring="accuracy")

array([0.59947472, 0.59553513, 0.60472751, 0.63140604, 0.64586071])

In [None]:
gridcv = GridSearchCV(knn, {"n_neighbors":[5,9,13,17,21], "weights":["uniform", "distance"], "p":[0.5,1,2]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=-1,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [5, 9, 13, 17, 21], 'p': [0.5, 1, 2],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
gridcv.best_params_

{'n_neighbors': 21, 'p': 1, 'weights': 'distance'}

In [None]:
knn = KNeighborsClassifier(weights="distance", n_jobs=-1)

In [None]:
gridcv = GridSearchCV(knn, {"n_neighbors":[21, 27, 31, 37], "p":[0.9,1,1.1]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=-1,
                                            n_neighbors=5, p=2,
                                            weights='distance'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [21, 27, 31, 37], 'p': [0.9, 1, 1.1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
gridcv.best_params_

{'n_neighbors': 21, 'p': 1}

In [None]:
gridcv.best_score_

0.6207827762309502

In [None]:
knn = KNeighborsClassifier(n_neighbors=21, weights="distance", p=1, n_jobs=-1)

In [None]:
cross_val_score(knn, df_f, y=df["target"], n_jobs=-1, scoring="accuracy")

array([0.62902167, 0.61391989, 0.60013132, 0.6346912 , 0.6261498 ])

Ahora pruebo con estos tres features pero normalizados.

In [None]:
df_f = df[["keyword_mean_norm", "long_norm", "nro_term_norm"]]

In [None]:
cross_val_score(knn, df_f, y=df["target"], n_jobs=-1, scoring="accuracy")

array([0.69008536, 0.67892318, 0.6651346 , 0.72273325, 0.66622865])

Probar KNN con los mejores tres features que se encuentren.

# Para hacer un submit:

In [None]:
knn.fit(df, df["target"])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=21, p=1,
                     weights='distance')

In [None]:
file = files.upload()
df = pd.read_csv("test.csv")

Saving test.csv to test.csv


In [None]:
df_f = df[["kmn", "ln", "ntn"]]

In [None]:
res = knn.predict(df_f)

In [None]:
df_f["target"] = pd.Series(res)

In [None]:
df_f[["id", "target"]].to_csv("prueba.csv", index=False)

In [None]:
files.download("prueba.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Decision Tree

In [19]:
dt = DecisionTreeClassifier()

In [22]:
df_f = df.drop(labels=["text", "keyword", "location", "target"], axis=1)

In [23]:
cross_val_score(dt, df_f, y=df["target"], n_jobs=-1, scoring="accuracy")

array([0.71569271, 0.70059094, 0.7196323 , 0.73784494, 0.73981603])

In [24]:
gridcv = GridSearchCV(dt, {"criterion":["gini", "entropy"], "min_samples_split":[2,5,7,9], "ccp_alpha":[0,0.5,1]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'ccp_alpha': [0, 0.5, 1],
                         '

In [25]:
gridcv.best_params_

{'ccp_alpha': 0, 'criterion': 'entropy', 'min_samples_split': 2}

In [26]:
gridcv.best_score_

0.7261370332949959

In [27]:
dt = DecisionTreeClassifier(criterion="entropy")

# ExtraTree

In [28]:
et = ExtraTreeClassifier()

In [29]:
cross_val_score(et, df_f, y=df["target"], n_jobs=-1, scoring="accuracy")

array([0.68286277, 0.65331582, 0.58502955, 0.68659658, 0.70630749])

In [34]:
gridcv = GridSearchCV(et, {"criterion":["gini", "entropy"], "min_samples_split":[2,5,7,9], "ccp_alpha":[0,0.5,1]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=ExtraTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                           criterion='entropy', max_depth=None,
                                           max_features='auto',
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                           min_samples_split=7,
                                           min_weight_fraction_leaf=0.0,
                                           random_state=None,
                                           splitter='random'),
             iid='deprecated', n_jobs=-1,
             param_grid={'ccp_alpha': [0, 0.5, 1],
                         'criterion': ['gini', 'entropy'],
                         'min_samples_split': [2, 5, 7, 9]},


In [35]:
gridcv.best_params_

{'ccp_alpha': 0, 'criterion': 'entropy', 'min_samples_split': 9}

In [36]:
gridcv.best_score_

0.6917122733935978

In [37]:
et = ExtraTreeClassifier(criterion="entropy")

In [38]:
gridcv = GridSearchCV(et, {"min_samples_split":[9,11,13,15]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=ExtraTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                           criterion='entropy', max_depth=None,
                                           max_features='auto',
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                           min_samples_split=2,
                                           min_weight_fraction_leaf=0.0,
                                           random_state=None,
                                           splitter='random'),
             iid='deprecated', n_jobs=-1,
             param_grid={'min_samples_split': [9, 11, 13, 15]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy'

In [39]:
gridcv.best_params_

{'min_samples_split': 13}

In [40]:
gridcv.best_score_

0.6588812108337943

In [41]:
et = ExtraTreeClassifier(criterion="entropy", min_samples_split=13)

In [42]:
gridcv = GridSearchCV(et, {"min_samples_leaf":[1,2,5,7,9,11]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=ExtraTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                           criterion='entropy', max_depth=None,
                                           max_features='auto',
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                           min_samples_split=13,
                                           min_weight_fraction_leaf=0.0,
                                           random_state=None,
                                           splitter='random'),
             iid='deprecated', n_jobs=-1,
             param_grid={'min_samples_leaf': [1, 2, 5, 7, 9, 11]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accur

In [43]:
gridcv.best_params_

{'min_samples_leaf': 1}

In [44]:
gridcv.best_score_

0.6781893575771589

# Random Forest

In [47]:
rf = RandomForestClassifier(oob_score=True, n_jobs=-1)

In [48]:
cross_val_score(rf, df_f, y=df["target"], n_jobs=-1, scoring="accuracy")

array([0.63821405, 0.7209455 , 0.73342088, 0.77463863, 0.64454665])

In [49]:
gridcv = GridSearchCV(rf, {"n_estimators":[80,100,120], "criterion":["gini", "entropy"], "min_samples_split":[2,5,7,9], "ccp_alpha":[0,0.5,1]}, \
                      scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              oob_score=True, random_state=None,
                                

In [50]:
gridcv.best_params_

{'ccp_alpha': 0,
 'criterion': 'gini',
 'min_samples_split': 2,
 'n_estimators': 120}

In [51]:
gridcv.best_score_

0.7295491901228901

In [53]:
gridcv = GridSearchCV(rf, {"n_estimators":[120,140,160]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])



GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              oob_score=True, random_state=None,
                                

In [54]:
gridcv.best_params_

{'n_estimators': 160}

In [55]:
gridcv.best_score_

0.7168052196586203