In [13]:
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import cross_val_predict
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler

In [14]:
# Load the test file: space-delimited text, one sample per row.
input_matrix = np.genfromtxt('handout_test.txt', delimiter=' ')

# Extract the y and x: column 0 is the label, every other column is a feature.
x = input_matrix[:, 1:]
y_raw = input_matrix[:, 0]
# Column-vector view of the same labels. Using -1 lets NumPy infer the number
# of rows from the file instead of assuming exactly 20000 samples.
y = y_raw.reshape(-1, 1)

In [15]:
# Do the transformation: standardize -> PCA (38 components) -> degree-2
# polynomial expansion of the projected features.
scaler = StandardScaler()
# Pin the seed: with the default svd_solver='auto', PCA may pick a randomized
# SVD depending on the data shape and scikit-learn version, which would make
# the components (and every metric computed from them) vary between runs.
selection = PCA(n_components=38, random_state=0)
poly = PolynomialFeatures(degree=2)

# These three transformers are unsupervised and ignore the labels, so only the
# feature matrix is passed.
x_scale = scaler.fit_transform(x)
x_trans = selection.fit_transform(x_scale)
x_poly = poly.fit_transform(x_trans)
x_transform = x_poly

# NOTE(review): the scaler and PCA are fitted on every row here. If x_transform
# is then cross-validated, each fold's held-out rows have already influenced
# the transformation; consider wrapping the steps in a sklearn Pipeline so they
# are re-fitted per fold.

In [16]:
# Ordinary least-squares model on the transformed features. cross_val_predict
# returns, for every sample, the prediction made by the fold model that did
# not see that sample during fitting (10 folds).
regr = linear_model.LinearRegression()
predicted = cross_val_predict(estimator=regr, X=x_transform, y=y_raw, cv=10)

In [17]:
# Evaluate the score
#
# The regression output is thresholded at zero with np.sign and compared with
# the +1 / -1 labels to build a confusion matrix. Counting is vectorized and
# the sample count is taken from the labels themselves rather than from an
# unrelated feature matrix.

labels = np.ravel(y)              # flatten the (n, 1) label column to (n,)
prediction = np.sign(predicted)   # > 0 -> +1, < 0 -> -1, exactly 0 -> 0

is_pos = labels == 1
is_neg = labels == -1
hit = prediction == labels

TP = int(np.count_nonzero(hit & is_pos))
TN = int(np.count_nonzero(hit & is_neg))
# A prediction of 0 (or NaN) never equals a label, so it is counted as a miss
# for whichever class the sample belongs to.
FN = int(np.count_nonzero(~hit & is_pos))
FP = int(np.count_nonzero(~hit & is_neg))

# Labels outside {+1, -1} fall into none of the four buckets; report each
# offending sample individually (the single label, not the whole label array).
for i in np.flatnonzero(~(is_pos | is_neg)):
    print("Something went wrong:", prediction[i], labels[i])

print("TP:", TP)
print("TN:", TN)
print("FP:", FP)
print("FN:", FN)
print("accuracy:", (TP + TN) / labels.shape[0])

TP: 7867
TN: 8246
FP: 1775
FN: 2112
accuracy: 0.80565


In [9]:
# Dump the full PCA loading matrix. Raising the summarization threshold to
# infinity stops NumPy from eliding the middle of large arrays with "...".
# This setting is global and stays in effect for later prints in the kernel.
pca_components = selection.components_
np.set_printoptions(threshold=np.inf)
print(pca_components)

[[  4.48931539e-02  -5.14290385e-02  -6.83148307e-03  -5.85443501e-02
    9.49969446e-02  -2.31178907e-02  -4.34230949e-02  -2.66806655e-03
    1.10571752e-02   9.84489532e-02  -6.38420890e-02  -5.36347549e-02
   -2.37709237e-02  -7.03185425e-02  -1.21163304e-02   1.72275627e-03
    5.72176295e-02   1.06210881e-01  -3.45958094e-02   3.45607228e-02
   -1.21310003e-02   8.10405125e-02  -6.52597047e-02  -7.90724720e-02
   -6.46438052e-02  -1.63047098e-02   2.16027493e-03  -5.57672065e-02
   -5.82646162e-03   4.60351048e-02   8.49409643e-02  -1.66388022e-02
    7.93342959e-03   2.20907207e-02   1.03625751e-01   1.02371887e-01
   -5.51833576e-02  -4.26740191e-02  -3.06718189e-02  -7.52864473e-04
   -1.66082019e-02  -7.25997847e-02   8.30896460e-04  -4.37502878e-02
    6.26493140e-02   6.46476556e-02  -3.30761217e-02  -4.03627267e-02
    2.36023713e-05   1.36506085e-03  -6.45668514e-02  -4.13036164e-02
   -5.06935677e-02  -7.21452047e-02   1.01128089e-01   8.82942491e-04
   -4.09982723e-02  

BASELINE
=====

MSE: 0.620598332416
TP: 7931
TN: 7870
FP: 2151
FN: 2048
accuracy: 0.79005

KBEST 200
=====