In [1]:
import csv
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn import metrics
from sklearn import ensemble
from sklearn import linear_model

In [2]:
def write_result(result, path='testlabel.csv'):
    """Write predicted labels to a CSV file, one integer label per row.

    Parameters
    ----------
    result : array-like
        Predicted labels with one entry per sample (shape ``(n,)`` or
        ``(n, 1)``). Entries must be convertible to float -- numbers or
        numeric strings such as ``'1'`` or ``'1.0'`` -- and are then
        truncated to int.
    path : str, optional
        Destination file. Defaults to ``'testlabel.csv'``, the file this
        function has always written. The file is opened in ``'w'`` mode, so
        any existing content is replaced.
    """
    # np.atleast_1d lets plain lists (and scalars) through, not only ndarrays.
    labels = np.atleast_1d(np.asarray(result))
    labels = labels.reshape((labels.shape[0], 1))
    # Convert through float first so string labels such as '1.0' parse
    # before the cast to int.
    labels = labels.astype(float).astype(int)
    # newline='' leaves line-ending handling to the csv module.
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f, delimiter=',')
        # tolist() yields one single-element list of Python ints per row.
        writer.writerows(labels.tolist())

In [3]:
# read data
def _read_csv_rows(path):
    """Return every row of the CSV file at `path` as a list of string fields."""
    # newline='' is the mode the csv module documents for file objects it reads.
    with open(path, 'r', newline='') as f:
        return list(csv.reader(f, delimiter=','))


# Features are parsed to float here instead of being handed to scikit-learn as
# strings, so the scaler/estimators receive a proper numeric array.
train_X = np.array(_read_csv_rows('traindata.csv'), dtype=float)

# Labels keep their on-disk (string) form and are flattened from one
# single-field row per sample to a 1-D array.
train_Y = np.array(_read_csv_rows('trainlabel.csv'))
train_Y = train_Y.reshape((train_Y.shape[0],))

test_X = np.array(_read_csv_rows('testdata.csv'), dtype=float)

In [4]:
# normalize data
# The scaler's mean/variance are learned from the training features only.
scaler = StandardScaler()
train_X_normed = scaler.fit_transform(train_X)
# Reuse the training statistics for the test set. Calling fit_transform here
# would refit the scaler on the test data and put train and test features on
# different scales than the ones the models were trained with.
test_X_normed = scaler.transform(test_X)

In [5]:
# Hold out 10% of the normalized training data as a validation set;
# the fixed random_state keeps the split repeatable.
trainX, validX, trainY, validY = train_test_split(
    train_X_normed,
    train_Y,
    test_size=0.1,
    random_state=17,
)

In [10]:
# SVM
# 10-fold grid search over 13 log-spaced values of C between 1e-3 and 1e3
# for a linear-kernel SVC.
paramgrid = {'C': np.logspace(-3, 3, 13)}
svmcv = GridSearchCV(
    estimator=svm.SVC(kernel='linear', random_state=17),
    param_grid=paramgrid,
    cv=10,
    verbose=1,
)
svmcv.fit(trainX, trainY)

Fitting 10 folds for each of 13 candidates, totalling 130 fits


[Parallel(n_jobs=1)]: Done 130 out of 130 | elapsed: 25.2min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=17, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-03,   3.16228e-03,   1.00000e-02,   3.16228e-02,
         1.00000e-01,   3.16228e-01,   1.00000e+00,   3.16228e+00,
         1.00000e+01,   3.16228e+01,   1.00000e+02,   3.16228e+02,
         1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [11]:
# Display the best score found by the SVM grid search (scikit-learn documents
# best_score_ as the mean cross-validated score of the best parameter setting).
svmcv.best_score_

0.92719116632160115

In [12]:
# Score the grid-searched SVM on the held-out validation split.
predY = svmcv.predict(validX)
acc = metrics.accuracy_score(y_true=validY, y_pred=predY)
print("test accuracy = ", acc, sep="")

test accuracy = 0.937888198758


In [6]:
# Random Forest
# 10-fold grid search over the number of trees in the forest.
paramgrid = {
    'n_estimators': np.array([1, 2, 3, 5, 7, 10, 13, 15, 20, 27, 35, 50, 80, 100]),
}
rfcv = GridSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=71),
    param_grid=paramgrid,
    cv=10,
)
rfcv.fit(trainX, trainY)
# Predict the validation split with the best forest found by the search.
best_forest = rfcv.best_estimator_
predY = best_forest.predict(validX)

In [7]:
# Accuracy of the current predY against the validation labels.
acc = metrics.accuracy_score(y_true=validY, y_pred=predY)
print('Accuracy on test-set: ', acc, sep='')


Accuracy on test-set: 0.950310559006


In [73]:
# adaboost
# 5-fold grid search over the number of boosting rounds.
paramgrid = {
    'n_estimators': np.array([1, 3, 5, 7, 10, 13, 15, 20, 33, 50, 80, 100]),
}
adacv = GridSearchCV(
    estimator=ensemble.AdaBoostClassifier(random_state=7),
    param_grid=paramgrid,
    cv=5,
)
adacv.fit(trainX, trainY)
# Evaluate the best boosted model on the validation split.
best_booster = adacv.best_estimator_
predY = best_booster.predict(validX)
acc = metrics.accuracy_score(y_true=validY, y_pred=predY)
print('Accuracy on test-set: ', acc, sep='')

Accuracy on test-set: 0.94099378882


In [9]:
# Predict the normalized test set with the best random forest from the grid
# search and save the labels via write_result.
test_predictions = rfcv.best_estimator_.predict(test_X_normed)
write_result(test_predictions)