Best result with Lasso(alpha = 0.0001). RMSE: 6.99776981908721

In [11]:
import pandas as pd 
import numpy as np 

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.impute import SimpleImputer

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation warnings (e.g. from pandas
# or scikit-learn), so API changes affecting the cells below will not be
# announced before they turn into errors.
warnings.simplefilter('ignore')

# Simple Linear Regression

## Train and Validation

In [4]:
# Load the training data.
df = pd.read_csv("SUPCOM_Train.csv")

# `thresh` is the minimum number of non-NA values a column needs in order to
# be kept, so this keeps columns that are at least 30% populated (i.e. it
# drops columns with more than 70% missing values).
# `how` is intentionally not passed: recent pandas versions raise a TypeError
# when `how` and `thresh` are given together.
df = df.dropna(thresh=df.shape[0] * 0.3, axis=1)

# Separate predictors from the target; `id` is an identifier, not a feature.
# The `columns=` keyword replaces the positional axis argument, which pandas 2
# no longer accepts.
X = df.drop(columns=['target', 'id'])
y = df['target']

# Only the int64/float64 columns are used as features.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

my_cols = numerical_cols

X = X[my_cols]

# 70/30 hold-out split with a fixed seed so the validation score is repeatable.
Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit the imputer on the training split only, then apply it to both splits so
# that no statistics from the validation rows are used during fitting.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(Xtrain)

# Keep the original row labels so the imputed frames stay aligned with
# ytrain / yval.
Xtrain_pp = pd.DataFrame(imputer.transform(Xtrain), columns=Xtrain.columns, index=Xtrain.index)
Xval_pp = pd.DataFrame(imputer.transform(Xval), columns=Xval.columns, index=Xval.index)

model = LinearRegression()
model.fit(Xtrain_pp, ytrain)

p = model.predict(Xval_pp)

print(f'RMSE: {np.sqrt(mean_squared_error(yval, p))}')

RMSE: 7.908460999882217


## Test and Submission

In [4]:
# Read the test set once. The full frame is kept in `teste` because its `id`
# column is needed for the submission file; the model only sees the feature
# subset selected by `my_cols` (defined in the train/validation cell).
teste = pd.read_csv("SUPCOM_Test.csv")
teste_features = teste[my_cols]

# Refit the imputer on the complete training features (`X`) before producing
# the final predictions.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X)

X_pp = pd.DataFrame(imputer.transform(X), columns=X.columns, index=X.index)
teste_pp = pd.DataFrame(
    imputer.transform(teste_features),
    columns=teste_features.columns,
    index=teste_features.index,
)

# Train on all labelled rows, then predict the test rows.
model = LinearRegression()
model.fit(X_pp, y)

p = model.predict(teste_pp)

# NOTE: every submission cell in this notebook writes to the same file name,
# so the file on disk always holds the predictions of the last cell that ran.
predictions = pd.DataFrame({'id': teste['id'], 'target': p})
predictions.to_csv("submission.csv", index=False)

Zindi Result: 7.004399048622439

# Regularization Tests

In [21]:
print('Ridge Regularization')

# Load the training data.
df = pd.read_csv("SUPCOM_Train.csv")

# `thresh` is the minimum number of non-NA values a column needs in order to
# be kept, so this keeps columns that are at least 30% populated (i.e. it
# drops columns with more than 70% missing values).
# `how` is intentionally not passed: recent pandas versions raise a TypeError
# when `how` and `thresh` are given together.
df = df.dropna(thresh=df.shape[0] * 0.3, axis=1)

# Separate predictors from the target; `id` is an identifier, not a feature.
# The `columns=` keyword replaces the positional axis argument, which pandas 2
# no longer accepts.
X = df.drop(columns=['target', 'id'])
y = df['target']

# Only the int64/float64 columns are used as features.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

my_cols = numerical_cols

X = X[my_cols]

# 70/30 hold-out split with a fixed seed so the validation score is repeatable.
Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit the imputer on the training split only, then apply it to both splits.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(Xtrain)

# Keep the original row labels so the imputed frames stay aligned with
# ytrain / yval.
Xtrain_pp = pd.DataFrame(imputer.transform(Xtrain), columns=Xtrain.columns, index=Xtrain.index)
Xval_pp = pd.DataFrame(imputer.transform(Xval), columns=Xval.columns, index=Xval.index)

# Sweep the L2 penalty strength and report the validation RMSE for each value.
# NOTE(review): the recorded output shows the RMSE exploding for alpha <= 3.
# That may point to unscaled or near-collinear features making the fit
# numerically unstable -- TODO confirm, e.g. by standardising the features
# before fitting.
alphas = [0.1, 0.5, 1, 3, 5, 10, 20]
for a in alphas:
    model = Ridge(alpha=a)
    model.fit(Xtrain_pp, ytrain)

    p = model.predict(Xval_pp)

    print(f'Alpha: {a} \n RMSE: {np.sqrt(mean_squared_error(yval, p))}')

Ridge Regularization
Alpha: 0.1 
 RMSE: 58507.62115039074
Alpha: 0.5 
 RMSE: 11699.37320016342
Alpha: 1 
 RMSE: 5848.365329695088
Alpha: 3 
 RMSE: 1947.8054231714427
Alpha: 5 
 RMSE: 7.651683599375543
Alpha: 10 
 RMSE: 7.649919713276496
Alpha: 20 
 RMSE: 7.650293617537982


In [22]:
# Read the test set once. The full frame is kept in `teste` because its `id`
# column is needed for the submission file; the model only sees the feature
# subset selected by `my_cols` (defined in the Ridge validation cell).
teste = pd.read_csv("SUPCOM_Test.csv")
teste_features = teste[my_cols]

# Refit the imputer on the complete training features (`X`) before producing
# the final predictions.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X)

X_pp = pd.DataFrame(imputer.transform(X), columns=X.columns, index=X.index)
teste_pp = pd.DataFrame(
    imputer.transform(teste_features),
    columns=teste_features.columns,
    index=teste_features.index,
)

# alpha = 10 gave the lowest validation RMSE in the sweep above.
model = Ridge(alpha=10)
model.fit(X_pp, y)

p = model.predict(teste_pp)

# NOTE: every submission cell in this notebook writes to the same file name,
# so the file on disk always holds the predictions of the last cell that ran.
predictions = pd.DataFrame({'id': teste['id'], 'target': p})
predictions.to_csv("submission.csv", index=False)

Zindi Result: 829.72325963182

In [20]:
print('Lasso Regularization')

# Load the training data.
df = pd.read_csv("SUPCOM_Train.csv")

# `thresh` is the minimum number of non-NA values a column needs in order to
# be kept, so this keeps columns that are at least 30% populated (i.e. it
# drops columns with more than 70% missing values).
# `how` is intentionally not passed: recent pandas versions raise a TypeError
# when `how` and `thresh` are given together.
df = df.dropna(thresh=df.shape[0] * 0.3, axis=1)

# Separate predictors from the target; `id` is an identifier, not a feature.
# The `columns=` keyword replaces the positional axis argument, which pandas 2
# no longer accepts.
X = df.drop(columns=['target', 'id'])
y = df['target']

# Only the int64/float64 columns are used as features.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

my_cols = numerical_cols

X = X[my_cols]

# 70/30 hold-out split with a fixed seed so the validation score is repeatable.
Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit the imputer on the training split only, then apply it to both splits.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(Xtrain)

# Keep the original row labels so the imputed frames stay aligned with
# ytrain / yval.
Xtrain_pp = pd.DataFrame(imputer.transform(Xtrain), columns=Xtrain.columns, index=Xtrain.index)
Xval_pp = pd.DataFrame(imputer.transform(Xval), columns=Xval.columns, index=Xval.index)

# Sweep the L1 penalty strength and report the validation RMSE for each value.
alphas = [0.0001, 0.001, 0.01, 0.1]
for a in alphas:
    model = Lasso(alpha=a)
    model.fit(Xtrain_pp, ytrain)

    p = model.predict(Xval_pp)

    print(f'Alpha: {a} \n RMSE: {np.sqrt(mean_squared_error(yval, p))}')

Lasso Regularization
Alpha: 0.0001 
 RMSE: 7.465747371894358
Alpha: 0.001 
 RMSE: 7.468711408787674
Alpha: 0.01 
 RMSE: 7.4902725022842
Alpha: 0.1 
 RMSE: 7.527529878998623


In [23]:
# Read the test set once. The full frame is kept in `teste` because its `id`
# column is needed for the submission file; the model only sees the feature
# subset selected by `my_cols` (defined in the Lasso validation cell).
teste = pd.read_csv("SUPCOM_Test.csv")
teste_features = teste[my_cols]

# Refit the imputer on the complete training features (`X`) before producing
# the final predictions.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X)

X_pp = pd.DataFrame(imputer.transform(X), columns=X.columns, index=X.index)
teste_pp = pd.DataFrame(
    imputer.transform(teste_features),
    columns=teste_features.columns,
    index=teste_features.index,
)

# alpha = 0.0001 gave the lowest validation RMSE in the sweep above.
model = Lasso(alpha=0.0001)
model.fit(X_pp, y)

p = model.predict(teste_pp)

# NOTE: every submission cell in this notebook writes to the same file name,
# so the file on disk always holds the predictions of the last cell that ran.
predictions = pd.DataFrame({'id': teste['id'], 'target': p})
predictions.to_csv("submission.csv", index=False)

Zindi Result: 6.99776981908721