In [1]:
import pandas as pd 
import numpy as np 

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.impute import SimpleImputer

import warnings
# Silences every warning for the rest of the session. This also hides
# deprecation notices raised by the libraries used below.
warnings.simplefilter('ignore')

# Train and Validation

In [2]:
# Baseline: load the training data, keep numeric features, impute missing
# values and score a default KNeighborsRegressor on a 30% hold-out split.
df = pd.read_csv("SUPCOM_Train.csv")

# `thresh` is the minimum number of non-NA values a column needs to be kept,
# so this drops columns with fewer than 30% observed values (more than 70% NA).
# `how` must not be passed together with `thresh` (TypeError on pandas >= 1.5).
min_non_na = int(np.ceil(df.shape[0] * 0.3))
df = df.dropna(thresh=min_non_na, axis=1)

# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
X = df.drop(columns=['target', 'id'])
y = df['target']

# Only int64/float64 columns are used as features.
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]

my_cols = numerical_cols

X = X[my_cols]

Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit the imputer on the training split only so no validation statistics leak in.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(Xtrain)

# Keep the original row index so the imputed frames stay aligned with ytrain/yval.
Xtrain_pp = pd.DataFrame(imputer.transform(Xtrain), columns=Xtrain.columns, index=Xtrain.index)
Xval_pp = pd.DataFrame(imputer.transform(Xval), columns=Xval.columns, index=Xval.index)

model = KNeighborsRegressor()
model.fit(Xtrain_pp, ytrain)

p = model.predict(Xval_pp)

print(f'RMSE: {np.sqrt(mean_squared_error(yval, p))}')

RMSE: 7.06908912918437


# Test and Submission

In [4]:
# Refit the default model on the full training set and write the submission.
# Relies on `X`, `y` and `my_cols` produced by the training cell.
# The test file is read once; `teste` keeps every column (including `id`)
# while `teste_features` holds only the model inputs.
teste = pd.read_csv("SUPCOM_Test.csv")
teste_features = teste[my_cols]

# Fit the imputer on all training rows and reuse its fill values for the test set.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X)

X_pp = pd.DataFrame(imputer.transform(X), columns=X.columns, index=X.index)
teste_pp = pd.DataFrame(
    imputer.transform(teste_features),
    columns=teste_features.columns,
    index=teste_features.index,
)

model = KNeighborsRegressor()
model.fit(X_pp, y)

p = model.predict(teste_pp)

# One row per test id with its predicted target.
d = {'id': teste['id'], 'target': p}

predictions = pd.DataFrame(d)
predictions.to_csv("submission.csv", index=False)

Zindi Result: 5.774962714978422

# RandomizedSearchCV to Select the Best Hyperparameters

In [12]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each hyperparameter, one list per parameter.
n_neighbors = [14, 17, 20, 24, 27, 30, 33, 36]
weights = ['uniform', 'distance']
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']

# Search space: parameter name -> list of candidate values.
random_grid = dict(
    n_neighbors=n_neighbors,
    weights=weights,
    algorithm=algorithm,
)


In [13]:
# Rebuild the train/validation split and search the KNN hyperparameter grid.
df = pd.read_csv("SUPCOM_Train.csv")

# `thresh` is the minimum number of non-NA values a column needs to be kept,
# so this drops columns with fewer than 30% observed values (more than 70% NA).
# `how` must not be passed together with `thresh` (TypeError on pandas >= 1.5).
min_non_na = int(np.ceil(df.shape[0] * 0.3))
df = df.dropna(thresh=min_non_na, axis=1)

# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
X = df.drop(columns=['target', 'id'])
y = df['target']

# Only int64/float64 columns are used as features.
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]

my_cols = numerical_cols

X = X[my_cols]

Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit the imputer on the training split only so no validation statistics leak in.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(Xtrain)

# Keep the original row index so the imputed frames stay aligned with ytrain/yval.
Xtrain_pp = pd.DataFrame(imputer.transform(Xtrain), columns=Xtrain.columns, index=Xtrain.index)
Xval_pp = pd.DataFrame(imputer.transform(Xval), columns=Xval.columns, index=Xval.index)

# Base model whose hyperparameters are tuned.
knn_model = KNeighborsRegressor()

# The grid holds only finite lists, so the number of distinct candidates is the
# product of the list lengths. Asking for more iterations than that only works
# through scikit-learn's fallback (and a warning), so cap the request explicitly.
n_candidates = int(np.prod([len(values) for values in random_grid.values()]))

# 3-fold cross-validated search over at most 100 sampled combinations,
# using all available cores. No `scoring` is passed, so the estimator's
# default score is used to rank candidates.
knn_random = RandomizedSearchCV(estimator=knn_model, param_distributions=random_grid,
                                n_iter=min(100, n_candidates), cv=3, verbose=2,
                                random_state=42, n_jobs=-1)
# Fit the random search model
knn_random.fit(Xtrain_pp, ytrain)

knn_random.best_params_

Fitting 3 folds for each of 64 candidates, totalling 192 fits


{'weights': 'distance', 'n_neighbors': 33, 'algorithm': 'auto'}

In [17]:
# Refit the tuned model on the full training set and write the submission.
# Relies on `X`, `y` and `my_cols` produced by the search cell.
# The test file is read once; `teste` keeps every column (including `id`)
# while `teste_features` holds only the model inputs.
teste = pd.read_csv("SUPCOM_Test.csv")
teste_features = teste[my_cols]

# Fit the imputer on all training rows and reuse its fill values for the test set.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X)

X_pp = pd.DataFrame(imputer.transform(X), columns=X.columns, index=X.index)
teste_pp = pd.DataFrame(
    imputer.transform(teste_features),
    columns=teste_features.columns,
    index=teste_features.index,
)

# Hyperparameters are hard-coded rather than read from a fitted search object.
model = KNeighborsRegressor(weights='distance', n_neighbors=33, algorithm='auto')
model.fit(X_pp, y)

p = model.predict(teste_pp)

# One row per test id with its predicted target.
d = {'id': teste['id'], 'target': p}

predictions = pd.DataFrame(d)
predictions.to_csv("submission.csv", index=False)

Zindi Result: 5.651883353585753