In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb

In [10]:
train = pd.read_csv('data/train_pre_processing_3.csv')
test = pd.read_csv('data/test_pre_processing_3.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 47 columns):
keyword                              7552 non-null object
location                             5080 non-null object
text                                 7613 non-null object
keyword_grouped                      7552 non-null object
text_contain_keyword                 7613 non-null bool
total_words                          7613 non-null int64
total_upper_chars                    7613 non-null int64
total_numbers_chars                  7613 non-null int64
total_special_chars                  7613 non-null int64
contain_question                     7613 non-null bool
contain_link                         7613 non-null bool
contain_hashtag                      7613 non-null bool
contain_upper_words                  7613 non-null bool
total_3_words                        7613 non-null int64
total_5_words                        7613 non-null int64
total_6_words                        76

In [19]:
def prepare_df_for_fit(df):
    columns_str = ['keyword', 'location', 'text', 'keyword_grouped']

    # Encode with LabelEncoder
    encoded_cols = df[columns_str]
    encoded_cols = encoded_cols.astype('str')
    encoded_cols = encoded_cols.apply(LabelEncoder().fit_transform)
    encoded_drop = df.drop(columns_str, axis = 1)
    encoded_df = pd.concat([encoded_drop, encoded_cols], axis = 1)
    # Drop Target column
    if 'target' in encoded_df.columns:
        encoded_df.drop(axis=1, labels=['target'], inplace=True)

    return encoded_df

In [21]:
train_X = prepare_df_for_fit(train)
test_X = prepare_df_for_fit(test)
train_Y = train['target']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(train_X, train_Y, test_size = 0.25, random_state = 10)

In [24]:
xgb_model = xgb.XGBClassifier(objective="reg:linear", random_state=10)

xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=10,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [25]:
preds = xgb_model.predict(X_test)

In [28]:
roc_auc_score(y_test, preds)

0.7669233182963819

In [29]:
acc = accuracy_score(preds,y_test)
print("ACC: %f" % (acc))

ACC: 0.779937


## GridSearchCV (Objective)

In [36]:
parameters = [{'objective': ['reg:linear']},
              {'objective': ['binary:logistic']}]
grid_search = GridSearchCV(estimator = xgb_model,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)

In [39]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.7940094587493431
{'objective': 'reg:linear'}


## GridSearchCV (max_depth & min_child_weight)

In [43]:
parameters = [{
 'max_depth':range(3,10,1),
 'min_child_weight':range(1,6,1)
}]

grid_search = GridSearchCV(estimator = xgb_model,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)

In [44]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.7954107549483272
{'max_depth': 5, 'min_child_weight': 1}


## GridSearchCV (min_child_weight)

In [72]:
# Probando con parametros en el rango de los valores tipicos 
# para lograr un min_child_weight adaptable
parameters = [{
    'learning_rate': [0.04], 
    'max_depth': [5], 
    'n_estimators': [300], 
    'min_child_weight': range(1,6,1), 
    'gamma': [0.3], 
    'subsample': [0.9], 
    'colsample_bytree': [0.5], 
    'reg_alpha': [0.1], 
    'seed': [123]
}]

grid_search = GridSearchCV(estimator = xgb_model,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)

In [73]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.7996146435452793
{'colsample_bytree': 0.5, 'gamma': 0.3, 'learning_rate': 0.04, 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 300, 'reg_alpha': 0.1, 'seed': 123, 'subsample': 0.9}


## GridSearchCV (gamma)

In [75]:
parameters = [{
    'gamma': np.arange(0,1,0.05),
    'max_depth': [grid_search.best_params_.get('max_depth')],
    'min_child_weight': [grid_search.best_params_.get('min_child_weight')],
    'learning_rate': [0.04],
    'n_estimators': [300],
    'subsample': [0.9], 
    'colsample_bytree': [0.5], 
    'reg_alpha': [0.1], 
    'seed': [123]
}]

grid_search = GridSearchCV(estimator = xgb_model,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)

In [76]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.8003152916447714
{'colsample_bytree': 0.5, 'gamma': 0.1, 'learning_rate': 0.04, 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 300, 'reg_alpha': 0.1, 'seed': 123, 'subsample': 0.9}


## GridSearchCV (n_estimators)

In [77]:
parameters = [{
    'gamma': [grid_search.best_params_.get('gamma')],
    'max_depth': [grid_search.best_params_.get('max_depth')],
    'min_child_weight': [grid_search.best_params_.get('min_child_weight')],
    'learning_rate': [0.04],
    'n_estimators': range(100,1000,50),
    'subsample': [0.9], 
    'colsample_bytree': [0.5], 
    'reg_alpha': [0.1], 
    'seed': [123]
}]

grid_search = GridSearchCV(estimator = xgb_model,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)

In [78]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.8003152916447714
{'colsample_bytree': 0.5, 'gamma': 0.1, 'learning_rate': 0.04, 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 300, 'reg_alpha': 0.1, 'seed': 123, 'subsample': 0.9}


## GridSearchCV (colsample_bytree + subsample)

In [79]:
parameters = [{
    'colsample_bytree': np.arange(0.5, 1, 0.1),
    'subsample': np.arange(0.5, 1, 0.1),
    'learning_rate': [0.04],
    'n_stimators': [grid_search.best_params_.get('n_estimators')],
    'gamma': [grid_search.best_params_.get('gamma')],
    'max_depth': [grid_search.best_params_.get('max_depth')],
    'min_child_weight': [grid_search.best_params_.get('min_child_weight')],
    'reg_alpha': [0.1],
    'seed': [123]
}]

grid_search = GridSearchCV(estimator = xgb_model,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)

In [80]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.7968120511473112
{'colsample_bytree': 0.5, 'gamma': 0.1, 'learning_rate': 0.04, 'max_depth': 5, 'min_child_weight': 2, 'n_stimators': 300, 'reg_alpha': 0.1, 'seed': 123, 'subsample': 0.8999999999999999}


## GridSearchCV (learning_rate)

In [85]:
parameters = [{
    'colsample_bytree': [grid_search.best_params_.get('colsample_bytree')],
    'subsample': [grid_search.best_params_.get('subsample')],
    'learning_rate': np.arange(0,0.1,0.01),
    'n_stimators': [grid_search.best_params_.get('n_estimators')],
    'gamma': [grid_search.best_params_.get('gamma')],
    'max_depth': [grid_search.best_params_.get('max_depth')],
    'min_child_weight': [grid_search.best_params_.get('min_child_weight')],
    'reg_alpha': [0.1],
    'seed': [123]
}]

grid_search = GridSearchCV(estimator = xgb_model,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)

In [86]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.7973375372219302
{'colsample_bytree': 0.5, 'gamma': 0.1, 'learning_rate': 0.06, 'max_depth': 5, 'min_child_weight': 2, 'n_stimators': None, 'reg_alpha': 0.1, 'seed': 123, 'subsample': 0.8999999999999999}


## Multiple GridSearchCV

In [None]:
parameters = [{
    'colsample_bytree': np.arange(0.5, 1, 0.1),
    'subsample': np.arange(0.5, 1, 0.1),
    'learning_rate': np.arange(0,0.1,0.01),
    'n_stimators': range(100,700,50),
    'gamma': np.arange(0,1,0.05),
    'max_depth': [grid_search.best_params_.get('max_depth')],
    'min_child_weight': [grid_search.best_params_.get('min_child_weight')],
    'reg_alpha': [0.1],
    'seed': [123]
}]

grid_search = GridSearchCV(estimator = xgb_model,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)

In [None]:
print(grid_search.best_score_)
print(grid_search.best_params_)