In [96]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import matplotlib.pylab as plt

%matplotlib inline

In [97]:
# Read the feature table and the train/test tables; each one uses its `id`
# column as the DataFrame index so they can later be joined on it.
df_features, df_train, df_test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '../features/features_basicas_texto.csv',
        '../dataset/train.csv',
        '../dataset/test.csv',
    )
)

In [98]:
features_cols = df_features.columns.tolist()

# Drop any feature columns left over from a previous run of this cell before
# joining: `DataFrame.join` raises on overlapping column names, so without this
# the cell fails the second time it is executed in the same kernel.
df_train = (
    df_train
    .drop(columns=features_cols, errors='ignore')
    .join(df_features, how='inner')  # inner: keep only ids present in both tables
)

# Model inputs are exactly the feature-table columns; the label is `target`.
X = df_train[features_cols]
Y = df_train['target']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=17)

In [99]:
# Hyper-parameter values explored by the grid search (4 x 3 x 3 = 36 combinations).
param_grid = dict(
    n_estimators=list(range(100, 500, 100)),  # 100, 200, 300, 400
    max_depth=[3, 5, 10],
    colsample_bytree=[0.5, 0.7, 0.9],
)

In [100]:
# Base XGBoost classifier; the hyper-parameters listed in `param_grid` are
# filled in by the grid search for each candidate.
xgb_classifier = xgb.XGBClassifier(
    learning_rate=0.05,
    random_state=17,
)

# Try every combination in `param_grid`, scoring each with F1 over 5
# cross-validation folds, using 4 parallel jobs.
CV_xgb = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=4,
)
CV_xgb.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.05, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constrai...
                                     objective='binary:logistic',
                                     random_state=17, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     

In [101]:
# Show the hyper-parameter combination selected by the grid search.
CV_xgb.best_params_

{'colsample_bytree': 0.7, 'max_depth': 5, 'n_estimators': 400}

In [102]:
def try_classifier(classifier, X_eval=None, y_eval=None):
    """Print confusion-matrix counts and the F1 score of a fitted classifier.

    Parameters
    ----------
    classifier
        Fitted estimator exposing ``predict``.
    X_eval, y_eval
        Optional evaluation features and labels. When omitted, the
        notebook-level hold-out split ``X_test`` / ``y_test`` is used, so the
        one-argument call ``try_classifier(model)`` keeps working.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    predictions = classifier.predict(X_eval)

    # Unpacking four values expects a 2x2 confusion matrix (two classes).
    tn, fp, fn, tp = confusion_matrix(y_eval, predictions).ravel()
    print(f'Verdaderos Negativos: {tn}')
    print(f'Falsos Negativos: {fn}')
    print(f'Verdaderos Positivos: {tp}')
    print(f'Falsos Positivos: {fp}')
    print()
    print(f'f1 score: {f1_score(y_eval,  predictions)}')

In [103]:
# Report hold-out metrics for the grid-searched model.
try_classifier(CV_xgb)

Verdaderos Negativos: 639
Falsos Negativos: 310
Verdaderos Positivos: 357
Falsos Positivos: 217

f1 score: 0.5753424657534246


In [106]:
# Attach the feature columns to the test rows (inner join on the index) and
# keep only the columns the model was trained on; then show the resulting shape.
df_submit = (
    df_test
    .join(df_features, how='inner')
    [features_cols]
)
df_submit.shape

(3263, 12)

In [107]:
# Predict on the feature columns only. `df_submit` later gains a `target`
# column, so selecting `features_cols` explicitly keeps this cell re-runnable
# instead of feeding the model an extra column on a second execution.
kaggle_pred = CV_xgb.predict(df_submit[features_cols])

In [110]:
# Store the predictions as a new `target` column (mutates `df_submit` in place).
df_submit['target'] = kaggle_pred

In [112]:
# Write the submission file: only the `target` column is exported, alongside
# the DataFrame index (written by default).
df_submit.to_csv('../submits/submit_xgboost_basic.csv', columns=['target'])