In [8]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [4]:
from google.colab import files

In [7]:
# Open the Colab upload dialog (the saved output shows train.csv being uploaded),
# then read that file from the working directory into `df`.
# The value returned by files.upload() is bound to `file` but is not used by read_csv.
file = files.upload()
df = pd.read_csv("train.csv")

Saving train.csv to train.csv


In [None]:
# Downcast the integer columns and replace every missing value with "" in a
# single chained expression.
# NOTE(review): uint16 only holds ids up to 65535 -- confirm against the data.
df = df.astype({"id": np.uint16, "target": np.uint8}).fillna("")

In [None]:
# Count of non-null ids for every (keyword, target) pair, returned as a Series
# indexed by that pair. Selecting "id" before counting avoids counting every
# other column only to discard the result.
keywords_target = df.groupby(["keyword", "target"])["id"].count()

In [None]:
# For each row: how many training rows share both its keyword and its target.
# groupby(...).transform("count") yields the same numbers as looking every row
# up in a (keyword, target) count table, but in one vectorized pass instead of
# one Python call per row.
# NOTE(review): the value depends on the row's own target, so this feature
# leaks the label and cannot be computed for unlabeled data.
df["keyword_mean"] = df.groupby(["keyword", "target"])["id"].transform("count")

In [None]:
# Length of each text in characters, via the vectorized string accessor
# instead of a per-element lambda.
df["long"] = df["text"].str.len()

In [None]:
# Number of whitespace-separated terms in each text: split on whitespace, then
# take the length of the resulting list (an empty text gives 0).
df["nro_term"] = df["text"].str.split().str.len()

In [None]:
# Divide keyword_mean by the largest keyword_mean found among rows with the
# same target. The per-target maximum is computed once per group and broadcast
# back to the rows, rather than re-filtering the whole frame for every row
# (which is quadratic in the number of rows).
# NOTE(review): the divisor is selected by the row's target, so this feature
# carries label information.
df["keyword_mean_norm"] = df["keyword_mean"] / df.groupby("target")["keyword_mean"].transform("max")

In [None]:
# Text length expressed as a fraction of the longest training text.
df["long_norm"] = df["long"].div(df["long"].max())

In [None]:
# Term count expressed as a fraction of the largest training term count.
df["nro_term_norm"] = df["nro_term"].div(df["nro_term"].max())

# KNN

In [None]:
# Baseline k-nearest-neighbors classifier: every hyperparameter except n_jobs
# is left at its scikit-learn default.
knn = KNeighborsClassifier(n_jobs=-1)

In [None]:
# Feature matrix built from the three un-normalized features.
df_f = df.loc[:, ["keyword_mean", "long", "nro_term"]]

In [None]:
# Cross-validated accuracy of the baseline KNN on the raw features; the value
# of this expression (one score per fold) is displayed as the cell output.
cross_val_score(knn, df_f, y=df["target"], n_jobs=-1, scoring="accuracy")

array([0.59947472, 0.59553513, 0.60472751, 0.63140604, 0.64586071])

In [None]:
# Coarse grid search over neighbor count, vote weighting and Minkowski power p,
# selecting by accuracy on the raw features.
# NOTE(review): p=0.5 is below 1, where the Minkowski formula is not a true
# distance metric -- check how the installed scikit-learn version treats it.
gridcv = GridSearchCV(knn, {"n_neighbors":[5,9,13,17,21], "weights":["uniform", "distance"], "p":[0.5,1,2]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=-1,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [5, 9, 13, 17, 21], 'p': [0.5, 1, 2],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
# Show the hyperparameter combination selected by the most recent grid search.
gridcv.best_params_

{'n_neighbors': 21, 'p': 1, 'weights': 'distance'}

In [None]:
# Rebuild the estimator with weights="distance" fixed (the weighting reported
# as best by the first grid search) so the next search can vary the remaining
# parameters only.
knn = KNeighborsClassifier(weights="distance", n_jobs=-1)

In [None]:
# Second, narrower grid search: larger neighbor counts and p values close to 1,
# still scored by accuracy on the raw features.
# NOTE(review): p=0.9 is below 1, where the Minkowski formula is not a true
# distance metric -- check how the installed scikit-learn version treats it.
gridcv = GridSearchCV(knn, {"n_neighbors":[21, 27, 31, 37], "p":[0.9,1,1.1]}, scoring="accuracy", n_jobs=-1)
gridcv.fit(df_f, y=df["target"])

GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=-1,
                                            n_neighbors=5, p=2,
                                            weights='distance'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [21, 27, 31, 37], 'p': [0.9, 1, 1.1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
# Show the hyperparameter combination selected by the most recent grid search.
gridcv.best_params_

{'n_neighbors': 21, 'p': 1}

In [None]:
# Show the mean cross-validated accuracy of the best combination found.
gridcv.best_score_

0.6207827762309502

In [None]:
# Final KNN configuration; the values are copied by hand from the grid-search
# results shown in the saved outputs (n_neighbors=21, p=1, weights="distance").
knn = KNeighborsClassifier(n_neighbors=21, weights="distance", p=1, n_jobs=-1)

In [None]:
# Cross-validated accuracy of the tuned KNN on whatever feature matrix df_f
# currently holds; the per-fold scores are displayed as the cell output.
cross_val_score(knn, df_f, y=df["target"], n_jobs=-1, scoring="accuracy")

array([0.62902167, 0.61391989, 0.60013132, 0.6346912 , 0.6261498 ])

Ahora pruebo con estos tres features pero normalizados.

In [None]:
# Rebind df_f to the normalized versions of the same three features.
df_f = df.loc[:, ["keyword_mean_norm", "long_norm", "nro_term_norm"]]

In [None]:
# Cross-validated accuracy of the tuned KNN on whatever feature matrix df_f
# currently holds; the per-fold scores are displayed as the cell output.
cross_val_score(knn, df_f, y=df["target"], n_jobs=-1, scoring="accuracy")

array([0.69008536, 0.67892318, 0.6651346 , 0.72273325, 0.66622865])

# Experimental

In [None]:
# For each row, the largest keyword_mean among rows that share its target.
# Computed once per target group and broadcast back, rather than re-filtering
# the whole frame for every row (quadratic in the number of rows).
# NOTE(review): this column takes a single value per target class, i.e. it is
# a pure function of the label.
df["keyword_mean_norm_?"] = df.groupby("target")["keyword_mean"].transform("max")

In [None]:
# Training matrix for the experiment: the experimental keyword column plus the
# two normalized length features.
df_x = df.loc[:, ["keyword_mean_norm_?", "long_norm", "nro_term_norm"]]

In [None]:
# Fit the tuned KNN on the experimental feature matrix using the training labels.
knn.fit(df_x, df["target"])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=21, p=1,
                     weights='distance')

In [None]:
# Open the Colab upload dialog (the saved output shows test.csv being uploaded),
# then read that file from the working directory into `test`.
# The value returned by files.upload() is bound to `file` but is not used by read_csv.
file = files.upload()
test = pd.read_csv("test.csv")

Saving test.csv to test.csv


In [None]:
# Replace every missing value in the test frame with "", the same fill value
# applied to the training frame.
test = test.fillna("")

In [None]:
# Length of each test text in characters, via the vectorized string accessor
# instead of a per-element lambda.
test["long"] = test["text"].str.len()

In [None]:
# Number of whitespace-separated terms in each test text: split on whitespace,
# then take the length of the resulting list (an empty text gives 0).
test["nro_term"] = test["text"].str.split().str.len()

In [None]:
# Give each test row the "keyword_mean_norm_?" value of the FIRST training row
# that has the same keyword. A lookup table built once (drop_duplicates keeps
# the first occurrence of each keyword) plus Series.map replaces a full scan of
# the training frame for every test row.
kmn_by_keyword = df.drop_duplicates("keyword").set_index("keyword")["keyword_mean_norm_?"]
test["kmn"] = test["keyword"].map(kmn_by_keyword)

# A keyword that never occurs in the training data has no value to borrow;
# report it explicitly (a row-by-row .iloc[0] lookup would fail on it with a
# bare IndexError, and a NaN feature would break the later predict call).
unseen_keywords = test.loc[test["kmn"].isna(), "keyword"].unique()
if len(unseen_keywords) > 0:
    raise ValueError(f"test keywords missing from the training data: {list(unseen_keywords)}")

In [None]:
# Scale test text lengths by the TRAINING maximum: the model was fit on
# long_norm = long / df["long"].max(), so the test feature must use the same
# divisor to sit on the same scale. Dividing by the test set's own maximum
# would shift the scale whenever the two maxima differ.
test["ln"] = test["long"] / df["long"].max()

In [None]:
# Scale test term counts by the TRAINING maximum: the model was fit on
# nro_term_norm = nro_term / df["nro_term"].max(), so the test feature must use
# the same divisor to sit on the same scale.
test["ntn"] = test["nro_term"] / df["nro_term"].max()

In [None]:
# Test feature matrix, columns in the same order as the training matrix
# (keyword feature, normalized length, normalized term count).
test_x = test.loc[:, ["kmn", "ln", "ntn"]]

In [None]:
# Predict a target for every test row with the fitted KNN.
res = knn.predict(test_x)

In [None]:
# Attach the predictions positionally. Assigning the array directly avoids the
# index alignment that wrapping it in pd.Series would trigger, which would
# produce NaN for any test row whose index label is not in 0..len(res)-1.
test["target"] = res

In [None]:
# Write the id/target pairs to prueba.csv without the DataFrame index column.
test[["id", "target"]].to_csv("prueba.csv", index=False)

In [None]:
# Ask Colab to download the generated prueba.csv through the browser.
files.download("prueba.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Fin de experimento

Probar KNN con los mejores tres features que se encuentren.

# Decision Tree