In [1]:
import pandas as pd 
import numpy as np 

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.impute import SimpleImputer

import warnings
warnings.simplefilter('ignore')

# Train and Validation

In [2]:
# Baseline: load the training data, keep numeric features, impute, fit a random
# forest on a 70/30 split and report the validation RMSE.
df = pd.read_csv("SUPCOM_Train.csv")
# Keep only columns with at least 30% non-NA values (i.e. drop columns that are
# more than 70% NA). `how` is not passed: it cannot be combined with `thresh`
# (pandas >= 2.0 raises TypeError when both are given).
df = df.dropna(thresh=df.shape[0] * 0.3, axis=1)

# Separate the feature columns from the target (the `id` column is dropped too).
# `columns=` replaces the positional axis argument removed in pandas 2.0.
X = df.drop(columns=['target', 'id'])
y = df['target']

# Only int64/float64 columns are used as features.
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]

my_cols = numerical_cols

X = X[my_cols]

Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit the imputer on the training split only, then apply it to both splits.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(Xtrain)

# Preserve the original row index so the imputed frames stay aligned with
# ytrain / yval (the estimator itself only looks at the values).
Xtrain_pp = pd.DataFrame(imputer.transform(Xtrain), columns=Xtrain.columns, index=Xtrain.index)
Xval_pp = pd.DataFrame(imputer.transform(Xval), columns=Xval.columns, index=Xval.index)

# Seed the forest so the reported RMSE is reproducible across runs.
model = RandomForestRegressor(random_state=0)
model.fit(Xtrain_pp, ytrain)

p = model.predict(Xval_pp)

print(f'RMSE: {np.sqrt(mean_squared_error(yval, p))}')

  X = df.drop(['target', 'id'], 1)


RMSE: 5.938993135209518


# Test and Submission

In [4]:
# Retrain the baseline model on the full training set and write the submission.
# Relies on X, y and my_cols defined by the training cell.

# Read the test set once; the full frame is kept so `id` is still available
# when the submission is assembled.
teste = pd.read_csv("SUPCOM_Test.csv")

# Refit the imputer on all training features before transforming both sets.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X)

X_pp = pd.DataFrame(imputer.transform(X), columns=X.columns)

# Use exactly the training feature columns, in the same order.
teste_pp = pd.DataFrame(imputer.transform(teste[my_cols]), columns=my_cols)

# Seed the forest so the submission is reproducible across runs.
model = RandomForestRegressor(random_state=0)
model.fit(X_pp, y)

p = model.predict(teste_pp)

d = {'id': teste['id'], 'target': p}

predictions = pd.DataFrame(d)
predictions.to_csv("submission.csv", index=False)

Zindi Result: 5.640048260938344

# RandomizedSearchCV to Select the Best Hyperparameters

In [2]:
from sklearn.model_selection import RandomizedSearchCV
# Hyperparameter search space for the random forest, sampled by RandomizedSearchCV.

# Number of trees in the forest: 10 evenly spaced values from 200 to 2000.
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 1.0 (all features) replaces the string 'auto', which selected all features for
# regressors but was deprecated in scikit-learn 1.1 and removed in 1.3.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (unlimited depth).
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample (True) or the whole dataset (False)
bootstrap = [True, False]

# Create the random grid: keys are RandomForestRegressor constructor arguments.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [3]:
# Rebuild the train/validation split and run a randomized hyperparameter search.
# Relies on random_grid and RandomizedSearchCV from the preceding cell.
df = pd.read_csv("SUPCOM_Train.csv")
# Keep only columns with at least 30% non-NA values (i.e. drop columns that are
# more than 70% NA). `how` is not passed: it cannot be combined with `thresh`
# (pandas >= 2.0 raises TypeError when both are given).
df = df.dropna(thresh=df.shape[0] * 0.3, axis=1)

# Separate the feature columns from the target (the `id` column is dropped too).
# `columns=` replaces the positional axis argument removed in pandas 2.0.
X = df.drop(columns=['target', 'id'])
y = df['target']

# Only int64/float64 columns are used as features.
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]

my_cols = numerical_cols

X = X[my_cols]

Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit the imputer on the training split only, then apply it to both splits.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(Xtrain)

# Preserve the original row index so the imputed frames stay aligned with
# ytrain / yval (the estimators themselves only look at the values).
Xtrain_pp = pd.DataFrame(imputer.transform(Xtrain), columns=Xtrain.columns, index=Xtrain.index)
Xval_pp = pd.DataFrame(imputer.transform(Xval), columns=Xval.columns, index=Xval.index)

# Use the random grid to search for best hyperparameters
# First create the base model to tune (seeded so the search is reproducible)
rf = RandomForestRegressor(random_state=0)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model
rf_random.fit(Xtrain_pp, ytrain)

rf_random.best_params_

  X = df.drop(['target', 'id'], 1)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

In [4]:
# Retrain on the full training set with the tuned hyperparameters and write the
# submission. Relies on X, y and my_cols defined by the search cell.

# Read the test set once; the full frame is kept so `id` is still available
# when the submission is assembled.
teste = pd.read_csv("SUPCOM_Test.csv")

# Refit the imputer on all training features before transforming both sets.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X)

X_pp = pd.DataFrame(imputer.transform(X), columns=X.columns)

# Use exactly the training feature columns, in the same order.
teste_pp = pd.DataFrame(imputer.transform(teste[my_cols]), columns=my_cols)

# Hyperparameters copied from the best_params_ reported by the randomized search;
# the forest is seeded so the submission is reproducible across runs.
model = RandomForestRegressor(n_estimators=400, min_samples_split=2, min_samples_leaf=1,
                              max_features='sqrt', max_depth=None, bootstrap=False,
                              random_state=0)
model.fit(X_pp, y)

p = model.predict(teste_pp)

d = {'id': teste['id'], 'target': p}

predictions = pd.DataFrame(d)
predictions.to_csv("submission.csv", index=False)

Zindi Result: 5.607478488705707