In [1]:
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

warnings.simplefilter('ignore')

# Train and Validation

In [2]:
# Baseline: fit a default XGBRegressor on a 70/30 hold-out split of the
# training file and report the validation RMSE.

df = pd.read_csv("SUPCOM_Train.csv")

# `thresh` is the minimum number of non-NA values a column must have to be
# kept, so this keeps columns that are at least 30% populated (i.e. it drops
# columns with more than 70% missing values).
# `how` is deliberately not passed: it is mutually exclusive with `thresh`
# and pandas >= 2.0 raises a TypeError when both are given.
df = df.dropna(thresh=df.shape[0] * 0.3, axis=1)

# `columns=` instead of a positional axis (positional axis was removed in pandas 2.0).
X = df.drop(columns=['target', 'id'])
y = df['target']

# Only int64/float64 columns are used as features.
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]
my_cols = numerical_cols
X = X[my_cols]

Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit the imputer on the training split only, then apply it to both splits,
# so no statistics from the validation rows leak into preprocessing.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(Xtrain)

# Keep the original row index so the imputed frames stay aligned with
# ytrain / yval (the transform returns a bare array).
Xtrain_pp = pd.DataFrame(imputer.transform(Xtrain), columns=Xtrain.columns, index=Xtrain.index)
Xval_pp = pd.DataFrame(imputer.transform(Xval), columns=Xval.columns, index=Xval.index)

model = xgb.XGBRegressor()
model.fit(Xtrain_pp, ytrain)

p = model.predict(Xval_pp)

print(f'RMSE: {np.sqrt(mean_squared_error(yval, p))}')

RMSE: 6.100930565028387


# Test and Submission

In [3]:
# Refit the default model on the full training set and write the submission.
# Relies on X, y and my_cols from the train/validation cell.

# Read the test file once: `teste` keeps every column (the `id` column is
# needed for the submission), `teste_features` is the model input.
teste = pd.read_csv("SUPCOM_Test.csv")
teste_features = teste[my_cols]

# Imputer is refit on all training rows because the final model uses all of them.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X)

# Preserve the row index so X_pp stays aligned with y.
X_pp = pd.DataFrame(imputer.transform(X), columns=X.columns, index=X.index)
teste_pp = pd.DataFrame(
    imputer.transform(teste_features),
    columns=teste_features.columns,
    index=teste_features.index,
)

model = xgb.XGBRegressor()
model.fit(X_pp, y)

p = model.predict(teste_pp)

predictions = pd.DataFrame({'id': teste['id'], 'target': p})
predictions.to_csv("submission.csv", index=False)

Zindi Result: 5.800272506813535

# RandomizedSearchCV to Select the Best Hyperparameters

In [6]:
# Candidate values for each XGBoost hyperparameter explored by the
# randomized search. Each list is sampled independently, so the full space
# has 6 * 8 * 4 * 5 * 4 = 3840 combinations.
learning_rate = [0.05, 0.10, 0.15, 0.20, 0.25, 0.30]
max_depth = [3, 4, 5, 6, 8, 10, 12, 15]
min_child_weight = [1, 3, 5, 7]
gamma = [0.0, 0.1, 0.2, 0.3, 0.4]
colsample_bytree = [0.3, 0.4, 0.5, 0.7]

# Create the random grid: keys are XGBRegressor keyword-argument names.
random_grid = {'learning_rate': learning_rate,
               'max_depth': max_depth,
               'min_child_weight': min_child_weight,
               'gamma': gamma,
               'colsample_bytree': colsample_bytree}

In [7]:
# Rebuild the training split and run a randomized hyperparameter search
# over `random_grid` on the imputed training rows.

df = pd.read_csv("SUPCOM_Train.csv")

# `thresh` is the minimum number of non-NA values a column must have to be
# kept, so this keeps columns that are at least 30% populated (i.e. it drops
# columns with more than 70% missing values).
# `how` is deliberately not passed: it is mutually exclusive with `thresh`
# and pandas >= 2.0 raises a TypeError when both are given.
df = df.dropna(thresh=df.shape[0] * 0.3, axis=1)

# `columns=` instead of a positional axis (positional axis was removed in pandas 2.0).
X = df.drop(columns=['target', 'id'])
y = df['target']

# Only int64/float64 columns are used as features.
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]
my_cols = numerical_cols
X = X[my_cols]

Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.3, random_state=0)

# Imputer statistics come from the training split only.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(Xtrain)

# Keep the original row index so the imputed frames stay aligned with
# ytrain / yval (the transform returns a bare array).
Xtrain_pp = pd.DataFrame(imputer.transform(Xtrain), columns=Xtrain.columns, index=Xtrain.index)
Xval_pp = pd.DataFrame(imputer.transform(Xval), columns=Xval.columns, index=Xval.index)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
xbg_model = xgb.XGBRegressor()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores.
# No `scoring` is passed, so candidates are ranked by the estimator's
# default scorer rather than by the RMSE printed in the validation cell.
xbg_random = RandomizedSearchCV(estimator=xbg_model, param_distributions=random_grid,
                                n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
# Fit the random search model
xbg_random.fit(Xtrain_pp, ytrain)

xbg_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


{'min_child_weight': 1,
 'max_depth': 15,
 'learning_rate': 0.05,
 'gamma': 0.3,
 'colsample_bytree': 0.7}

In [8]:
# Refit on the full training set with the tuned hyperparameters and write
# the submission. Relies on X, y and my_cols from the search cell.

# Read the test file once: `teste` keeps every column (the `id` column is
# needed for the submission), `teste_features` is the model input.
teste = pd.read_csv("SUPCOM_Test.csv")
teste_features = teste[my_cols]

# Imputer is refit on all training rows because the final model uses all of them.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X)

# Preserve the row index so X_pp stays aligned with y.
X_pp = pd.DataFrame(imputer.transform(X), columns=X.columns, index=X.index)
teste_pp = pd.DataFrame(
    imputer.transform(teste_features),
    columns=teste_features.columns,
    index=teste_features.index,
)

# Values copied from the `best_params_` output of the randomized search.
best_params = {
    'min_child_weight': 1,
    'max_depth': 15,
    'learning_rate': 0.05,
    'gamma': 0.3,
    'colsample_bytree': 0.7,
}
model = xgb.XGBRegressor(**best_params)
model.fit(X_pp, y)

p = model.predict(teste_pp)

predictions = pd.DataFrame({'id': teste['id'], 'target': p})
predictions.to_csv("submission.csv", index=False)

Zindi Result: 5.578911719930432