In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import importlib

from itertools import product

All models assume the response variable is valued in {0, 1}.

In [19]:
from utils import get_numeric_data, get_text_data, compute_accuracy, prepare_numeric_data
from spectrum_kernel import prepare_kspectrum_data
from ridge_regression import RidgeRegression
from logistic_regression import LogisticRegression
from gaussian_ridge_regression import GaussianRidgeRegression
from gaussian_logistic_regression import GaussianLogisticRegression
from gaussian_svm import GaussianSVM


# # Uncomment this part if you modify any of the *.py files. This will allow the notebook to reload the files.
# import utils
# import spectrum_kernel
# import ridge_regression
# import logistic_regression
# import gaussian_ridge_regression
# import gaussian_logistic_regression
# import gaussian_svm

# importlib.reload(utils)
# importlib.reload(ridge_regression)
# importlib.reload(logistic_regression)
# importlib.reload(gaussian_ridge_regression)
# importlib.reload(gaussian_svm)
# importlib.reload(spectrum_kernel)


# print('Reloaded Modules.')


# 1) Ridge Regression

## Dataset 0

In [None]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=0,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = RidgeRegression(lam=28.828).fit(X_train, y_train)

in_sample_acc = compute_accuracy(model, X_train, y_train)
out_sample_acc = compute_accuracy(model, X_val, y_val)


print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))


## Dataset 1

In [None]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=1,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = RidgeRegression(lam=23).fit(X_train, y_train)

in_sample_acc = compute_accuracy(model, X_train, y_train)
out_sample_acc = compute_accuracy(model, X_val, y_val)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))

## Dataset 2

In [None]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=2,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = RidgeRegression(lam=2.3).fit(X_train, y_train)

in_sample_acc = compute_accuracy(model, X_train, y_train)
out_sample_acc = compute_accuracy(model, X_val, y_val)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))

# 2) Logistic Regression

# Dataset 0

In [None]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=0,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = LogisticRegression(beta=0.9, noisy=False).fit(X_train, y_train)


in_sample_acc = compute_accuracy(model, X_train, y_train)
out_sample_acc = compute_accuracy(model, X_val, y_val)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))

# Dataset 1

In [None]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=1,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = LogisticRegression(beta=0.9, noisy=False).fit(X_train, y_train)


in_sample_acc = compute_accuracy(model, X_train, y_train)

out_sample_acc = compute_accuracy(model, X_val, y_val)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))

# Dataset 2

In [None]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=2,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = LogisticRegression(beta=0.9, noisy=False).fit(X_train, y_train)


in_sample_acc = compute_accuracy(model, X_train, y_train)

out_sample_acc = compute_accuracy(model, X_val, y_val)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))

# 3) Gaussian Ridge Regression

## Dataset 0

In [None]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=0,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = GaussianRidgeRegression(lam=28, l=0.0095).fit(X_train, y_train)

in_sample_acc = compute_accuracy(model, X_train, y_train)
out_sample_acc = compute_accuracy(model, X_val, y_val)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))


## Dataset 1

In [None]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=1,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = GaussianRidgeRegression(lam=23, l=0.15).fit(X_train, y_train)

in_sample_acc = compute_accuracy(model, X_train, y_train)
out_sample_acc = compute_accuracy(model, X_val, y_val)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))

## Dataset 2

In [None]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=2,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = GaussianRidgeRegression(lam=2.3, l=0.016).fit(X_train, y_train)

in_sample_acc = compute_accuracy(model, X_train, y_train)
out_sample_acc = compute_accuracy(model, X_val, y_val)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))

# 4) Gaussian Logistic Regression

## Dataset 0

In [None]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=0,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = GaussianLogisticRegression(lam=10, l=0.45).fit(X_train, y_train)

in_sample_acc = compute_accuracy(model, X_train, y_train)
out_sample_acc = compute_accuracy(model, X_val, y_val)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))

## Dataset 1

In [None]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=1,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = GaussianLogisticRegression(lam=10, l=1.2).fit(X_train, y_train)

in_sample_acc = compute_accuracy(model, X_train, y_train)
out_sample_acc = compute_accuracy(model, X_val, y_val)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))

## Dataset 2

In [None]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=2,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = GaussianLogisticRegression(lam=10, l=0.35).fit(X_train, y_train)

in_sample_acc = compute_accuracy(model, X_train, y_train)
out_sample_acc = compute_accuracy(model, X_val, y_val)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))


# 5) Gaussian SVM

## Dataset 0

In [68]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=0,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = GaussianSVM(C=1, l=0.55).fit(X_train, y_train)

in_sample_acc = compute_accuracy(model, X_train, y_train)
out_sample_acc = compute_accuracy(model, X_val, y_val)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))

In  Sample: 1.0
Out Sample: 0.628


## Dataset 1

In [69]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=1,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = GaussianSVM(C=1, l=1.15).fit(X_train, y_train)

in_sample_acc = compute_accuracy(model, X_train, y_train)
out_sample_acc = compute_accuracy(model, X_val, y_val)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))

In  Sample: 0.887
Out Sample: 0.606


## Dataset 2

In [70]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=2,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = GaussianSVM(C=1.1, l=1.17).fit(X_train, y_train)

in_sample_acc = compute_accuracy(model, X_train, y_train)
out_sample_acc = compute_accuracy(model, X_val, y_val)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))

In  Sample: 0.914
Out Sample: 0.706


# 6) k-Spectrum kernels

## Dataset 0

In [27]:
X_train, y_train, X_val, y_val, X_te = prepare_kspectrum_data(
    dataset=0,
    path='data',
    k=6,
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = GaussianRidgeRegression(lam=0.12, l='auto').fit(X_train, y_train)

in_sample_acc = compute_accuracy(model, X_train, y_train)
out_sample_acc = compute_accuracy(model, X_val, y_val)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))

In  Sample: 1.0
Out Sample: 0.522


In [21]:
X_train, y_train, X_val, y_val, X_te = prepare_kspectrum_data(
    dataset=0,
    path='data',
    k=3,
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = LogisticRegression().fit(X_train, y_train)

in_sample_acc = compute_accuracy(model, X_train, y_train)
out_sample_acc = compute_accuracy(model, X_val, y_val)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))

In  Sample: 0.642
Out Sample: 0.562


In [22]:
X_train, y_train, X_val, y_val, X_te = prepare_kspectrum_data(
    dataset=0,
    path='data',
    k=3,
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = GaussianLogisticRegression(lam=10, l=0.001).fit(X_train, y_train)

in_sample_acc = compute_accuracy(model, X_train, y_train)
out_sample_acc = compute_accuracy(model, X_val, y_val)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))

In  Sample: 1.0
Out Sample: 0.516


# Submissions

## Submission #1

In [None]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=0,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = GaussianSVM(C=1, l=0.55).fit(X_train, y_train)
prediction_0 = model.predict_class(X_te)

In [None]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=1,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = GaussianSVM(C=1, l=1.15).fit(X_train, y_train)
prediction_1 = model.predict_class(X_te)

In [None]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=2,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = GaussianSVM(C=1.1, l=1.17).fit(X_train, y_train)
prediction_2 = model.predict_class(X_te)

In [None]:
prediction_arr = np.concatenate(list(map(lambda x: x.reshape(-1), [prediction_0, prediction_1, prediction_2])))
prediction_df = pd.DataFrame(prediction_arr)
prediction_df.rename(columns={0: 'Bound'}, inplace=True)
prediction_df

## Submission 2

In [None]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=0,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = GaussianLogisticRegression(lam=10, l=0.45).fit(X_train, y_train)
prediction_0 = model.predict_class(X_te)

In [None]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=1,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = GaussianSVM(C=1, l=1.15).fit(X_train, y_train)
prediction_1 = model.predict_class(X_te)

In [None]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=2,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = GaussianRidgeRegression(lam=2.3, l=0.016).fit(X_train, y_train)
prediction_2 = model.predict_class(X_te)

In [None]:
prediction_arr = np.concatenate(list(map(lambda x: x.reshape(-1), [prediction_0, prediction_1, prediction_2])))
prediction_df = pd.DataFrame(prediction_arr)
prediction_df.rename(columns={0: 'Bound'}, inplace=True)
prediction_df

# Submission 3

In [62]:
class VotingClassifier:
    def __init__(self, models):
        self.models = models
    
    def fit(self, X, y):
        self.models = [model.fit(X, y) for model in self.models]
        return self

    def predict_class(self, X):
        predictions = np.array([model.predict_class(X) for model in self.models])
        return (predictions.mean(axis=0) > 0.5).astype(int)


class AverageClassifier:
    def __init__(self, models):
        self.models = models
    
    def fit(self, X, y):
        self.models = [model.fit(X, y) for model in self.models]
        return self
    
    def predict(self, X):
        return np.array([model.predict(X) for model in self.models]).mean(axis=0)
    
    def predict_class(self, X):
        return ((1 + np.sign(self.predict(X))) / 2).astype(int)


## Dataset 0

In [84]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=0,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = AverageClassifier(models=[
    GaussianSVM(C=1, l=0.55),
    GaussianLogisticRegression(lam=10, l=0.45),
]).fit(X_train, y_train)

in_sample_acc = compute_accuracy(model, X_train, y_train)
out_sample_acc = compute_accuracy(model, X_val, y_val)

prediction_0 = model.predict_class(X_te)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))

In  Sample: 1.0
Out Sample: 0.628


## Dataset 1

In [85]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=1,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = AverageClassifier(models=[
    GaussianSVM(C=1, l=1.15),
    GaussianRidgeRegression(lam=23, l=0.15),   
]).fit(X_train, y_train)

in_sample_acc = compute_accuracy(model, X_train, y_train)
out_sample_acc = compute_accuracy(model, X_val, y_val)

prediction_1 = model.predict_class(X_te)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))

In  Sample: 0.886
Out Sample: 0.602


## Dataset 2

In [86]:
X_train, y_train, X_val, y_val, X_te = prepare_numeric_data(
    dataset=2,
    path='data',
    scale_method='minmax',
    with_intercept=False,
    num_train=1500,
)

model = AverageClassifier(models=[
    GaussianSVM(C=1.1, l=1.17),
    GaussianRidgeRegression(lam=2.3, l=0.016),   
]).fit(X_train, y_train)

in_sample_acc = compute_accuracy(model, X_train, y_train)
out_sample_acc = compute_accuracy(model, X_val, y_val)

prediction_2 = model.predict_class(X_te)

print('In  Sample:', round(in_sample_acc, 3))
print('Out Sample:', round(out_sample_acc, 3))

In  Sample: 0.949
Out Sample: 0.708


In [87]:
prediction_arr = np.concatenate(list(map(lambda x: x.reshape(-1), [prediction_0, prediction_1, prediction_2])))
prediction_df = pd.DataFrame(prediction_arr)
prediction_df.rename(columns={0: 'Bound'}, inplace=True)
prediction_df

Unnamed: 0,Bound
0,1
1,1
2,1
3,1
4,0
...,...
2995,0
2996,0
2997,0
2998,0


# FOR ANY SUBMISSION COMPARE AGAINST THE ALREADY SUBMITTED FILES

In [91]:
a1 = pd.read_csv('submission1.csv', index_col='Id')
a2 = pd.read_csv('submission2.csv', index_col='Id')
a3 = pd.read_csv('submission3.csv', index_col='Id')

(a1 != a3).sum()


a3[a3 != a1]

Unnamed: 0_level_0,Bound
Id,Unnamed: 1_level_1
0,
1,
2,
3,
4,
...,...
2995,
2996,
2997,
2998,


In [107]:
(a3['Bound'] != a1['Bound']).mean()


0.006666666666666667