In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd.variable import Variable
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn import metrics
from skorch import NeuralNetClassifier
import warnings
import random
warnings.filterwarnings(action='ignore')

In [25]:
# Read the training table and the test table from the working directory.
df, df_test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Add additional data

In [26]:
# Extra supervision: test rows for which an assumed true score is available
# are relabelled (score -> "time") and appended to the training frame.
correct = pd.read_csv("new_true_scores_compared_to_old_with_int_index.csv")
df_temp = (
    df_test.set_index('id')
    # Left join on id: test rows without a score get NaN in "assumed_true_score".
    .join(correct[["id", "assumed_true_score"]].set_index('id'))
    # Drops every row containing a NaN in any column, not only the score column.
    .dropna()
    # Bring "id" back as an ordinary column.
    .reset_index(level=0)
    .rename(columns={"assumed_true_score": "time"})
)
df = pd.concat([df, df_temp], axis=0)

# Feature engineering

In [27]:
# Columns handled as categories rather than as numeric magnitudes.
categorical_features = [
    "penalty",
    "n_jobs",
    "alpha",
    "n_classes",
    "n_informative",
    "n_clusters_per_class",
]

# Separate the regression target from the training frame.
labels = df["time"].values
df = df.drop(columns=["time"])

# Every float64/int64 column that is not listed as categorical is continuous.
continuous_features = [
    col_name
    for col_name, col_dtype in df.dtypes.items()
    if col_dtype in ['float64', 'int64'] and col_name not in categorical_features
]

# Stack training and test rows so both go through identical transformations;
# the first df_len rows of temp are the training rows.
df_len = len(df)
temp = pd.concat([df, df_test])

# Rewrite some values: the author found that changing them makes no
# difference to the final result.
value_rewrites = (
    ("n_jobs", -1, 16),
    ("penalty", "elasticnet", "l1"),
    ("penalty", "l2", "none"),
)
for col_name, old_value, new_value in value_rewrites:
    temp.loc[temp[col_name] == old_value, col_name] = new_value

# Derived feature that emphasises the columns it is built from:
# n_samples * n_features / n_jobs.
new_col = (temp["n_samples"] * temp["n_features"] / temp["n_jobs"]).rename(
    "samp_times_features_div_njobs"
)
temp = pd.concat([temp, new_col], axis=1)
continuous_features.append(new_col.name)

temp[categorical_features] = temp[categorical_features].astype("category")
temp[continuous_features] = temp[continuous_features].astype("float32")

# Columns removed because they are not related to the time.
to_drop = [
    "id",
    "random_state",
    "l1_ratio",
    "alpha",
    "flip_y",
    "scale",
    "n_clusters_per_class",
    "n_informative",
]
categorical_features = [name for name in categorical_features if name not in to_drop]
continuous_features = [name for name in continuous_features if name not in to_drop]

temp_cat = temp[categorical_features]
temp = temp.drop(columns=to_drop)

# Scaling continuous variables and one-hot encoding categorical

In [28]:
# Scale the continuous columns and one-hot encode the categorical ones.
#
# StandardScaler expects real booleans for with_mean / with_std; recent
# scikit-learn releases validate the parameter types and reject the ints 0 / 1.
# False / True reproduces what 0 / 1 did: divide by the standard deviation
# without centering.
# NOTE(review): with_mean=False means the features are NOT centered. If full
# standardisation (zero mean, unit variance) was intended, set with_mean=True.
scaler = preprocessing.StandardScaler(with_mean=False, with_std=True)

# Fit on the training rows only (the first df_len rows), then transform
# training and test rows together.
scaler.fit(temp[continuous_features].iloc[:df_len].values)
scaled_temp = scaler.transform(temp[continuous_features].values)

temp_continuous = pd.DataFrame(
    scaled_temp,
    index=temp[continuous_features].index,
    columns=temp[continuous_features].columns,
)

# Request float32 dummies explicitly. With pandas >= 2.0 the default dummy
# dtype is bool, and a frame mixing float32 and bool columns yields an
# object-dtype `.values` array that torch.tensor() cannot convert.
one_hot = pd.get_dummies(temp[categorical_features], dtype="float32")

temp = pd.concat([temp_continuous, one_hot], axis=1)

In [29]:
# Undo the earlier stacking: the first df_len rows are the training rows,
# everything after them is the test set.
df, df_test = temp.iloc[:df_len], temp.iloc[df_len:]

# pytorch

In [30]:
# Accumulates one array of test-set predictions per accepted training run;
# the arrays are averaged element-wise further down.
predic_list = list()

In [31]:
class Model(nn.Module):
    """Fully connected regression network with two ReLU hidden layers.

    Args:
        n_features: Width of the input vector. Defaults to 20, the value the
            notebook was written with; pass ``df.shape[1]`` if the number of
            preprocessed columns changes.
        hidden_sizes: Sizes of the two hidden layers, ``(first, second)``.
            Defaults to ``(100, 10)``.
    """

    def __init__(self, n_features=20, hidden_sizes=(100, 10)):
        super(Model, self).__init__()
        first_hidden, second_hidden = hidden_sizes
        # Layers are created in the same order as before so that parameter
        # initialisation consumes the RNG in the same sequence.
        self.l1 = nn.Linear(n_features, first_hidden)
        self.l2 = nn.Linear(first_hidden, second_hidden)
        self.l3 = nn.Linear(second_hidden, 1)

    def forward(self, x, **kwargs):
        """Map a ``(batch, n_features)`` tensor to ``(batch, 1)`` predictions.

        Extra keyword arguments are accepted and ignored.
        """
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        # No activation on the output layer: this is a regression head.
        x = self.l3(x)
        return x

# My implementation of cross validation for pytorch

In [32]:
# Build 10 shuffled cross-validation folds as (train indices, test indices) pairs.
np.random.seed(42)
indx = np.random.permutation(len(labels))
size = int(len(labels)*0.10)  # nominal fold size; kept for reference only

# np.array_split spreads any remainder over the first folds, so every sample
# lands in exactly one validation fold. Slicing with a fixed `size` left the
# last len(labels) % 10 samples out of every validation fold. When the length
# is divisible by 10 the folds are identical to the fixed-size slices.
test_index = np.array_split(indx, 10)
# setdiff1d returns the sorted indices that are not in the held-out fold.
train_index = [np.setdiff1d(indx, held_out) for held_out in test_index]
lis = list(zip(train_index, test_index))

In [35]:
# Train a fresh network on each cross-validation fold and print its hold-out MSE.
# After the loop, `model` is the network trained on the last fold; the
# prediction cell below uses it.
# NOTE: torch is deliberately left unseeded here. Re-running this cell gives a
# differently initialised model, which the prediction-averaging workflow
# further down relies on.
N_EPOCHS = 15000
LEARNING_RATE = 0.0005

loss_fn = torch.nn.MSELoss()
# Loop variables use their own names so the `train_index` / `test_index`
# fold lists built earlier are not overwritten.
for fold_train_idx, fold_test_idx in lis:
    # torch.autograd.Variable is a deprecated no-op wrapper; plain tensors suffice.
    X_train = torch.tensor(df.values[fold_train_idx]).float()
    y_train = torch.tensor(labels[fold_train_idx].reshape(-1, 1)).float()
    X_test = torch.tensor(df.values[fold_test_idx]).float()

    model = Model()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    # Full-batch gradient descent: one optimizer step per pass over the fold.
    for i in range(N_EPOCHS):
        y_pred = model(X_train)
        loss = loss_fn(y_pred, y_train)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluation needs no autograd graph.
    with torch.no_grad():
        preds = model(X_test)
    # sklearn's signature is (y_true, y_pred); both are passed as 1-D arrays.
    fold_mse = metrics.mean_squared_error(labels[fold_test_idx], preds.squeeze(1).numpy())
    print(fold_mse)

1.2043107595976739
1.4475563122736557
0.263923613536452
0.38620975670180735
2.0293261800499516
0.6599051708157685
1.2308950604428193
0.7988324035058908
0.29729041094352976
1.2128143540262823


In [36]:
# Predict the test set with the most recently trained `model` and display the
# predictions rounded to two decimals.
torch_test = torch.tensor(df_test.values).float()
preds = model(torch_test)
preds.squeeze(1).detach().numpy().round(2)

array([ 1.610e+00,  1.023e+01,  2.200e-01,  1.710e+00,  3.430e+00,
        9.220e+00,  2.510e+00,  6.100e-01,  1.493e+01,  4.900e-01,
        5.290e+00,  1.345e+01,  1.010e+00,  3.266e+01,  2.600e-01,
        6.400e-01,  6.100e-01,  6.740e+00,  3.250e+00,  4.200e+00,
        2.200e-01,  5.200e-01,  6.300e-01,  8.100e-01,  1.150e+00,
        2.420e+00,  1.320e+00,  3.520e+00,  2.110e+00,  1.740e+00,
        2.049e+01,  3.120e+00,  3.900e-01,  1.234e+01,  6.840e+00,
        1.060e+00,  3.850e+00,  2.750e+00, -3.000e-02,  2.160e+00,
        5.280e+00,  1.186e+01,  7.600e-01,  4.700e+00,  6.500e-01,
        1.470e+00,  3.740e+00,  5.100e-01,  9.020e+00,  7.500e-01,
        2.200e-01,  1.964e+01,  2.200e-01,  7.050e+00,  9.760e+00,
        2.200e-01,  1.010e+00,  1.040e+00,  1.120e+00,  3.600e-01,
        1.810e+00,  6.490e+00,  7.700e-01,  7.040e+00,  3.710e+00,
        9.140e+00,  1.220e+00,  7.300e-01,  8.220e+00,  1.250e+00,
        2.200e-01,  1.110e+00,  1.111e+01,  2.200e-01,  5.990e

#### If the results contain no negative values and look reasonable, append them to the list

In [37]:
# Every execution of this cell appends the current predictions to predic_list,
# so run it once per accepted training run.
predic_list.append(preds.squeeze(1).detach().numpy())
# Element-wise average over all stored runs: zip(*...) groups the predictions
# that belong to the same test row.
my_pred = list(map(np.mean, zip(*predic_list)))

In [305]:
# Write the averaged predictions to the submission file; the frame's default
# integer index is written as the "id" column.
res_csv = pd.DataFrame(my_pred, columns=["time"])
res_csv.to_csv(
    "submission1.csv",
    header=True,
    index=True,
    index_label="id",
)