In [108]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import matplotlib.pylab as plt

%matplotlib inline

Creación de los sets de test y features

In [109]:
# Labelled and unlabelled rows, both indexed by their `id` column.
df_train = pd.read_csv('../dataset/train.csv', index_col='id')
df_test = pd.read_csv('../dataset/test.csv', index_col='id')

# Two precomputed feature tables, combined side by side into one frame.
# NOTE(review): both files are read without index_col, so this join (and the
# later joins against the id-indexed train/test frames) aligns on the default
# positional index -- TODO confirm the feature files are ordered so that row
# position equals the row's id.
df_palabras = pd.read_csv('../features/palabras_clave.csv')
df_word2vec = pd.read_csv('../features/word2vec.csv')
df_features = df_palabras.join(df_word2vec, how='inner')

In [110]:
# Names of the feature columns; used to select the model inputs here and
# again when building the prediction frame.
features_cols = df_features.columns.tolist()

# Attach the features to the labelled rows (inner join: rows without features
# are dropped). Any feature columns left over from a previous run of this cell
# are removed first, so re-running the cell does not fail on overlapping
# column names.
df_train = (
    df_train
    .drop(columns=features_cols, errors='ignore')
    .join(df_features, how='inner')
)

X = df_train[features_cols]
Y = df_train['target']

# Hold out 20% of the labelled rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=17)

In [111]:
# Hyper-parameter values explored by the grid search (4 x 3 x 3 combinations).
param_grid = dict(
    n_estimators=list(range(100, 500, 100)),
    max_depth=[3, 5, 10],
    colsample_bytree=[0.5, 0.7, 0.9],
)

In [112]:
# Base model whose remaining hyper-parameters are filled in by the grid search.
xgb_classifier = xgb.XGBClassifier(learning_rate=0.05, random_state=17)

# 5-fold cross-validated search over param_grid, scored by F1, on 4 workers.
CV_xgb = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=4,
)

# Left as the last expression so the fitted search object is displayed.
CV_xgb.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.05, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=17, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=4,
             param_grid={'colsample_bytree': [0.5, 0.7, 0.9],
                         'max_depth': [3, 5, 10],
                         'n_estimators': [100, 200, 300,

In [113]:
# Show the hyper-parameter combination selected by the F1-scored grid search.
CV_xgb.best_params_

{'colsample_bytree': 0.9, 'max_depth': 10, 'n_estimators': 400}

In [114]:
def try_classifier(classifier, X=None, y=None):
    """Print the confusion-matrix counts and F1 score of a fitted binary classifier.

    Parameters
    ----------
    classifier
        Fitted estimator exposing ``predict``.
    X
        Features to predict on. Defaults to the notebook-level ``X_test``.
    y
        True labels matching ``X``. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    None
        The counts and the score are printed, not returned.
    """
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    predictions = classifier.predict(X)

    # labels=[0, 1] keeps the matrix 2x2 even if only one class appears in
    # `y`/`predictions`; otherwise the four-way unpacking below would fail.
    # Assumes the target is encoded as 0/1.
    tn, fp, fn, tp = confusion_matrix(y, predictions, labels=[0, 1]).ravel()
    print(f'Verdaderos Negativos: {tn}')
    print(f'Falsos Negativos: {fn}')
    print(f'Verdaderos Positivos: {tp}')
    print(f'Falsos Positivos: {fp}')
    print()
    print(f'f1 score: {f1_score(y,  predictions)}')

In [115]:
# Print confusion counts and F1 for the tuned model on the held-out split (X_test / y_test).
try_classifier(CV_xgb)

Verdaderos Negativos: 621
Falsos Negativos: 458
Verdaderos Positivos: 209
Falsos Positivos: 235

f1 score: 0.37623762376237624


In [116]:
# Feature matrix for the unlabelled rows: attach the features to the test
# frame (inner join, so rows without features are dropped) and keep only the
# feature columns.
df_submit = (
    df_test
    .join(df_features, how='inner')
    [features_cols]
)

# Row/column count of the prediction frame.
df_submit.shape

(3263, 10)

In [117]:
# Predict using exactly the feature columns, so the model input stays the same
# if this cell is re-run after the `target` column has been added to df_submit.
kaggle_pred = CV_xgb.predict(df_submit[features_cols])

In [118]:
# Attach the predictions by building a new frame instead of assigning in place:
# df_submit was created by selecting a subset of columns, and writing into such
# a selection can trigger pandas' SettingWithCopyWarning.
df_submit = df_submit.assign(target=kaggle_pred)

In [119]:
# Turn the index into a regular column, rename it from 'index' to 'id', and
# keep only the two columns that are written to the submission file.
df_submit = (
    df_submit
    .reset_index()
    .rename(columns={'index': 'id'})
    [['id', 'target']]
)
df_submit

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
5,12,0
6,21,1
7,22,0
8,27,1
9,29,1


In [120]:
# index=False stops pandas from writing the row index as an extra unnamed
# first column, so the file contains exactly the `id` and `target` columns.
df_submit[['id', 'target']].to_csv('../submits/submit_word2vec.csv', index=False)

In [121]:
# Load a submission file and list its columns.
# NOTE(review): this reads xgboost_tfidf.csv, not the submit_word2vec.csv
# written by this notebook -- confirm which file was meant to be checked.
df =  pd.read_csv('../submits/xgboost_tfidf.csv')
df.columns

Index(['id', 'target'], dtype='object')