In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import matplotlib.pylab as plt

%matplotlib inline

In [2]:
# Read the three CSV inputs in one pass; each is indexed by its `id` column.
df_features, df_train, df_test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '../features/features_basicas_texto.csv',
        '../dataset/train.csv',
        '../dataset/test.csv',
    )
)

In [3]:
# The model inputs are exactly the columns provided by the features table.
features_cols = df_features.columns.tolist()

# Join into a NEW frame rather than rebinding df_train: joining an
# already-joined df_train again would fail on the overlapping column names,
# so the original cell could not be re-run without reloading the data.
df_train_features = df_train.join(df_features, how='inner')

X = df_train_features[features_cols]
Y = df_train_features['target']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=17)

In [4]:
# Hyperparameter values to combine in the grid search.
param_grid = dict(
    n_estimators=list(range(100, 500, 100)),
    max_depth=[3, 5, 10],
    colsample_bytree=[0.5, 0.7, 0.9],
)

In [5]:
# Base estimator: only the learning rate and seed are fixed here; the grid
# search supplies the remaining hyperparameters from param_grid.
xgb_classifier = xgb.XGBClassifier(learning_rate=0.05, random_state=17)

# 5-fold cross-validated search scored by F1, running 4 jobs in parallel.
CV_xgb = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=4,
)
CV_xgb.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.05, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=17, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=4,
             param_grid={'colsample_bytree': [0.5, 0.7, 0.9],
                         'max_depth': [3, 5, 10],
                         'n_estimators': [100, 200, 300,

In [6]:
# Display the hyperparameter combination selected by the grid search.
CV_xgb.best_params_

{'colsample_bytree': 0.7, 'max_depth': 5, 'n_estimators': 100}

In [7]:
def try_classifier(classifier, features=None, labels=None):
    """Print confusion-matrix counts and the F1 score for a fitted classifier.

    Parameters
    ----------
    classifier
        Fitted model exposing ``predict``.
    features
        Rows to predict on. Defaults to the notebook-level ``X_test``.
    labels
        True labels matching ``features``. Defaults to the notebook-level
        ``y_test``.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    # Fall back to the held-out split so existing one-argument calls still work.
    if features is None:
        features = X_test
    if labels is None:
        labels = y_test

    predictions = classifier.predict(features)

    # Unpacking into four counts requires a 2x2 (binary) confusion matrix.
    tn, fp, fn, tp = confusion_matrix(labels, predictions).ravel()
    print(f'Verdaderos Negativos: {tn}')
    print(f'Falsos Negativos: {fn}')
    print(f'Verdaderos Positivos: {tp}')
    print(f'Falsos Positivos: {fp}')
    print()
    print(f'f1 score: {f1_score(labels,  predictions)}')

In [8]:
# Report confusion-matrix counts and F1 for the grid-searched model on the held-out split.
try_classifier(CV_xgb)

Verdaderos Negativos: 648
Falsos Negativos: 316
Verdaderos Positivos: 351
Falsos Positivos: 208

f1 score: 0.5725938009787929


In [9]:
# Build the submission feature matrix with the same columns used for training.
df_submit = df_test.join(df_features, how='inner')[features_cols]

# An inner join silently drops test ids that have no feature row; fail loudly
# rather than produce predictions for only part of the test set.
assert len(df_submit) == len(df_test), 'row count changed when joining df_test with df_features'

df_submit.shape

(3263, 12)

In [10]:
# Select the feature columns explicitly: a later cell adds a `target` column to
# df_submit, and re-running this cell afterwards would otherwise hand the model
# an extra column it was not trained on.
kaggle_pred = CV_xgb.predict(df_submit[features_cols])

In [11]:
# Attach the predictions as a new `target` column. This mutates df_submit in
# place, so from here on it holds one column more than features_cols.
df_submit['target'] = kaggle_pred

In [12]:
# Preview only the first rows instead of rendering the whole frame.
df_submit.head(10)

Unnamed: 0_level_0,word_count,unique_words_count,word_count_above_mean,unique_words_count_above_mean,char_count,char_count_above_mean,avg_word_length,subjectivity,polarity,neg,neu,pos,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,4,4,0,0,24,0,6.000000,1.000000,-0.7003,1,0,0,0
2,7,7,0,0,46,0,6.571429,0.550000,0.4404,0,0,1,0
3,10,10,1,1,52,1,5.200000,0.000000,-0.6159,1,0,0,1
9,4,4,0,0,34,0,8.500000,0.000000,0.0000,0,1,0,0
11,6,6,0,0,33,0,5.500000,0.000000,-0.5423,1,0,0,0
12,3,3,0,0,24,0,8.000000,0.000000,0.0000,0,1,0,0
21,9,8,0,0,46,0,5.111111,0.000000,0.0000,0,1,0,1
22,2,2,0,0,6,0,3.000000,0.000000,0.0000,0,1,0,0
27,2,2,0,0,7,0,3.500000,1.000000,0.4215,0,0,1,0
29,2,2,0,0,7,0,3.500000,0.600000,-0.5423,1,0,0,0


In [112]:
# Write the index plus only the `target` column to the submission file.
df_submit.to_csv('../submits/submit_xgboost_basic.csv', columns=['target'])