In [1]:
from sklearn import linear_model
import numpy as np
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import pylab as pl

In [2]:
def file_to_numpy(filename):
    """
    Read an input file and convert it to numpy
    """
    df = pd.read_csv(filename)
    return df.to_numpy()

file = pd.read_csv("train.csv")
y = file.iloc[:, 1:2]
x = pd.read_csv("x_transformed.csv")
x = x.iloc[:, 1:]

xHatNames = x.head(0)
xHatNames = xHatNames.T
x = x.to_numpy()
y = y.to_numpy()
y = y.T[0]
# print(y)
# print(len(x[0]))
# print(xHatNames)

xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.3, random_state=40)

In [5]:
# the best parameters
clf = GridSearchCV(
        SVR(), 
        [{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
          'degree': [2, 3, 4], 'C': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 
          'epsilon': [0.1, 0.15, 0.2, 0.25, 0.3]}], cv=3, scoring='r2')
clf.fit(xTrain, yTrain)
means = clf.cv_results_['mean_test_score']
for mean, params in zip(means, clf.cv_results_['params']):
    print("%0.3f for %r" % (mean, params))
print("Best parameters set found on development set:")
print()
print(clf.best_params_)

0.550 for {'C': 0.5, 'degree': 2, 'epsilon': 0.1, 'kernel': 'linear'}
0.500 for {'C': 0.5, 'degree': 2, 'epsilon': 0.1, 'kernel': 'poly'}
0.462 for {'C': 0.5, 'degree': 2, 'epsilon': 0.1, 'kernel': 'rbf'}
0.430 for {'C': 0.5, 'degree': 2, 'epsilon': 0.1, 'kernel': 'sigmoid'}
0.550 for {'C': 0.5, 'degree': 2, 'epsilon': 0.15, 'kernel': 'linear'}
0.500 for {'C': 0.5, 'degree': 2, 'epsilon': 0.15, 'kernel': 'poly'}
0.462 for {'C': 0.5, 'degree': 2, 'epsilon': 0.15, 'kernel': 'rbf'}
0.431 for {'C': 0.5, 'degree': 2, 'epsilon': 0.15, 'kernel': 'sigmoid'}
0.550 for {'C': 0.5, 'degree': 2, 'epsilon': 0.2, 'kernel': 'linear'}
0.500 for {'C': 0.5, 'degree': 2, 'epsilon': 0.2, 'kernel': 'poly'}
0.462 for {'C': 0.5, 'degree': 2, 'epsilon': 0.2, 'kernel': 'rbf'}
0.431 for {'C': 0.5, 'degree': 2, 'epsilon': 0.2, 'kernel': 'sigmoid'}
0.550 for {'C': 0.5, 'degree': 2, 'epsilon': 0.25, 'kernel': 'linear'}
0.500 for {'C': 0.5, 'degree': 2, 'epsilon': 0.25, 'kernel': 'poly'}
0.462 for {'C': 0.5, 'degree

In [11]:
# R-squaer and mean square error
clf=SVR(C=0.6, degree=2, epsilon=0.3, kernel="linear")
clf.fit(xTrain, yTrain)
yHat = clf.predict(xTest)
print("r2_score:")
print(r2_score(yTest, yHat))
print("mean square error:")
print(mean_squared_error(yTest, yHat, squared=False))

r2_score:
0.47510730161827097
mean square error:
9.532687769370472


In [15]:
# model analysis
# We will choose 50 features with the largest constants in linear regression model
# They are the 50 most important features in the model
print("feature coefficients of the model")
print(clf.coef_)
print()
sort_index = np.argsort(np.abs(clf.coef_))
print("index of the best 30 features")
print(sort_index[:,:30])

feature coefficients of the model
[[ 0.00000000e+00  1.20000000e+00  6.00000000e-01  0.00000000e+00
   1.23508278e+00  8.90826761e-01 -5.41945894e-01 -2.23655335e+00
   1.32317351e+00  5.05828013e-01 -1.57908552e+00 -6.00000000e-01
   1.00750128e+00 -1.20000000e+00  4.93301547e-01  5.45239700e-01
   1.88653804e+00  1.47718187e+00  1.57022800e+00 -2.54688881e+00
  -1.45340464e+00  2.28795376e-01  7.51975106e-01 -7.30992729e-01
   1.79014408e-01 -2.61321283e+00  2.07779057e+00 -1.59234349e+00
   4.27647005e-01 -1.20070026e+00  4.37432568e-01  1.60536608e+00
  -1.20000000e+00 -3.13731123e+00 -1.14305772e-01  1.00052674e+00
  -1.94265434e+00 -1.56515425e-01  1.33870041e+00  1.51262789e+00
  -1.45579767e+00 -1.83782133e+00 -1.35221784e+00  2.43511993e+00
   1.97435376e+00  1.61844145e+00 -8.30941677e-01  1.43227358e-01
  -3.15912624e-02 -5.97678301e-01 -4.86772984e-01  6.49046006e-02
   0.00000000e+00 -6.39686133e-01  1.10318290e+00 -4.70214105e-01
   2.87112810e+00 -1.53554032e-01  4.50186