## Import Tools

In [59]:
import csv
import lisData
import datetime
import numpy as np
import sklearn.linear_model as sklin
import sklearn.metrics as skmet
import sklearn.cross_validation as skcv
import sklearn.grid_search as skgs
import sklearn.preprocessing as skpr
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

## Data Fetching

#### Alternative 1

In [5]:
# Load the training CSV through the project-local lisData helper.
# NOTE(review): 15 and 1 presumably are the input/output column counts
# (they match nInput/nOutput in "Alternative 2") — confirm against lisData.
data = lisData.LisData('train.csv',15,1)
# Sanity check: show the feature vector `.x` of record 34.
print(data.data[34].x)

[ 1.89526027  2.37496407 -0.33221165 -0.96862469  0.33994368  1.17581895
  1.49869027  2.04797216  1.28427967  2.74266878  0.77739432  0.08692078
 -0.68121822  0.11102864  1.24211796]


#### Alternative 2

In [60]:
trainx = []
trainy = []
testx = []
nOutput = 1
nInput = 15

# train.csv row layout, as implied by the slices below:
#   column 0 = row Id, next nOutput columns = target(s), next nInput columns = features.
with open('train.csv', 'r') as inputFile:
    leReader = csv.reader(inputFile)
    next(leReader)  # skip the header row
    for row in leReader:
        if not row:
            # csv.reader yields [] for blank lines (e.g. a trailing newline); skip them.
            continue
        # The Id column is not needed for fitting, so it is not parsed.
        # Builtin `float` replaces the deprecated `np.float` alias (same dtype: float64).
        trainy.append(np.array(row[1:nOutput + 1]).astype(float))
        trainx.append(np.array(row[nOutput + 1:nInput + nOutput + 1]).astype(float))

# test.csv row layout: column 0 = row Id, next nInput columns = features (no target).
with open('test.csv', 'r') as inputFile:
    leReader = csv.reader(inputFile)
    next(leReader)  # skip the header row
    for row in leReader:
        if not row:
            continue
        testx.append(np.array(row[1:nInput + 1]).astype(float))

# Sanity check: show one parsed feature vector from each set.
print(trainx[34])
print(testx[34])

[ 1.89526027  2.37496407 -0.33221165 -0.96862469  0.33994368  1.17581895
  1.49869027  2.04797216  1.28427967  2.74266878  0.77739432  0.08692078
 -0.68121822  0.11102864  1.24211796]
[ 0.62222615  0.24094433  0.98483006  0.11788056  1.59518869  0.91472915
  1.97502096 -1.02646307  1.32399956 -0.54682658  0.39639841  2.27331498
  0.48291622  2.38862851  3.01288612]


## Actual Fitting

In [22]:
# Ordinary least-squares baseline fitted on the raw feature vectors.
clf = sklin.LinearRegression().fit(trainx, trainy)
clf.coef_  # bare expression mid-cell: evaluated, but nothing is displayed
testy = clf.predict(testx)

print(testy)

[[  7.29022687]
 [-34.4230165 ]
 [ 82.29394959]
 ..., 
 [ 98.66763217]
 [ 90.16475902]
 [ -7.44481286]]


In [51]:
# Ridge regression (alpha=0.1) fitted on the raw feature vectors.
clf = sklin.Ridge(alpha=0.1)
clf.fit(trainx, trainy)
predicty = clf.predict(trainx)
testy = clf.predict(testx)
# Training-set RMSE. Computed inline from mean_squared_error because no
# function named `score` is defined in this notebook (NameError on a fresh kernel).
print(mean_squared_error(trainy, predicty)**0.5)
print(testy)

37.1268828115
[[  7.29382659]
 [-34.41587605]
 [ 82.28696405]
 ..., 
 [ 98.66067416]
 [ 90.15827212]
 [ -7.44084178]]


In [53]:
def RMSE(y, ypred):
    """Return the root-mean-squared error between targets `y` and predictions `ypred`.

    Computed as the square root of sklearn's mean_squared_error(y, ypred).
    """
    return mean_squared_error(y, ypred)**0.5


# Wrap the RMSE function defined above; the previous `score` name is not
# defined anywhere in this notebook and fails on a fresh kernel.
scorefun = skmet.make_scorer(RMSE)
# NOTE(review): cv=900 presumably equals the number of training rows, i.e.
# leave-one-out cross-validation — confirm against the size of train.csv.
scores = skcv.cross_val_score(clf, trainx, trainy, scoring=scorefun, cv=900)
print(scores)
print(np.mean(scores))
print(np.std(scores))

[  1.45628118e+01   1.22188199e+01   7.86744733e+00   2.42048995e+01
   5.53057344e+01   3.37015797e+01   4.44316996e+01   7.26875050e+01
   6.49188908e+00   3.18318323e+01   7.02584958e+01   5.55750910e+00
   2.96824700e+01   4.27474935e+01   6.56764950e+00   3.46319843e+00
   1.68061931e+01   2.68032785e+00   3.57901860e+01   2.88953244e+01
   3.79654008e+01   3.68094829e+00   1.35549348e+01   9.09982342e+00
   2.90848167e+00   1.26391051e+01   6.91654592e+00   1.87977310e+01
   7.69955809e+00   2.75486491e+01   2.22287183e+01   6.67164794e+01
   1.83188370e+01   6.26821490e-01   2.49811764e+01   4.09973377e+01
   5.58809981e+00   2.46581535e+01   3.59964007e+01   1.48104481e+01
   4.71501857e+00   1.40384052e+01   2.73164796e+01   1.81040370e+01
   2.08341741e+01   9.59268824e+00   4.65391523e+01   9.48459321e+00
   3.70639438e+01   1.09475356e+01   1.60639183e+01   6.90799040e+00
   1.04439708e+01   1.74542305e+01   3.18898460e+01   2.49045554e+01
   5.39832594e+00   1.26398377e+01

In [54]:
# Smallest per-fold score in `scores` (the latest cross_val_score result).
min(scores)

0.0026762826191362166

### Finding best degree K

In [66]:
def rmse(y, ypred):
    """Return the root-mean-squared error between targets `y` and predictions `ypred`."""
    return mean_squared_error(y, ypred)**0.5


# RMSE is an error, so lower is better. make_scorer defaults to
# greater_is_better=True, which would make RidgeCV pick the alpha with the
# LARGEST error; greater_is_better=False makes the scorer return -rmse instead.
RMSE = skmet.make_scorer(rmse, greater_is_better=False)

# One [mean RMSE, std RMSE] pair per polynomial degree K.
Kscores = []
for K in range(6):
    # Expand the raw features into all polynomial terms up to degree K.
    poly = skpr.PolynomialFeatures(K)
    xtraining = poly.fit_transform(trainx)
    clf = sklin.RidgeCV(scoring=RMSE)

    # Evaluate the models using crossvalidation.
    # The scorer yields -rmse per fold; negate to report positive RMSE values.
    scores = -skcv.cross_val_score(clf, xtraining, trainy, scoring=RMSE, cv=900)

    print(K)
    print(np.mean(scores))
    print(scores)
    print(np.std(scores))
    Kscores.append([np.mean(scores), np.std(scores)])
    


0
39.7556358793
[  2.33970937e+01   3.52559858e+01   7.30227787e+01   1.80866221e+01
   2.39281945e+01   2.81368812e+01   7.97642212e+01   1.71646214e+02
   1.68398648e+01   1.00439687e+02   7.47572934e+01   4.41709796e+01
   5.99647958e+01   5.08987652e+01   1.01090623e+01   4.01128247e+01
   7.11327613e+00   1.07637596e+01   1.14482948e+00   3.79118567e+01
   1.05358525e+00   5.04800810e+01   5.13185778e+01   3.13782792e+01
   1.12024594e+01   1.44406174e+01   3.79304990e+01   6.12568600e+01
   1.96612958e+01   1.85245957e+00   1.05691295e+01   4.80382981e+01
   2.51629321e+01   6.51091889e+00   2.74516507e+01   7.22346852e+01
   3.43517574e+00   9.89106724e-01   1.82595401e+01   3.37886694e+01
   2.07917723e+01   2.59482259e+01   9.20062010e+01   2.54952977e+01
   1.07294696e+01   7.49907379e+01   5.53107266e+01   1.99244924e+01
   6.70462357e+00   6.04878250e+01   2.69185759e+01   3.55542276e+01
   5.05185256e+01   3.35222904e+01   3.98515474e+01   1.30404075e+01
   4.55178011e+01 

KeyboardInterrupt: 

### Working with degree K=2

In [82]:
def rmse(y, ypred):
    """Return the root-mean-squared error between targets `y` and predictions `ypred`."""
    return mean_squared_error(y, ypred)**0.5


# Lower RMSE is better, so the scorer is built with greater_is_better=False
# (it returns -rmse); this matters if the RidgeCV line below is re-enabled.
RMSE = skmet.make_scorer(rmse, greater_is_better=False)
poly = skpr.PolynomialFeatures(2)
xtraining = poly.fit_transform(trainx)

#clf = sklin.RidgeCV(alphas=(0.00001,0.0001,100.00001),scoring=RMSE)
clf = sklin.LinearRegression()
clf.fit(xtraining, trainy)
# Reuse the transformer fitted on the training data; never refit on test data.
xtesting = poly.transform(testx)
testy = clf.predict(xtesting)
# Training-set RMSE of the degree-2 model.
print(rmse(trainy, clf.predict(xtraining)))

19.4746256795


## Writing Results

In [78]:
# Id written for the first prediction; later rows count up from here.
# NOTE(review): presumably the test Ids continue after 900 training rows —
# confirm against the Id column of test.csv.
FIRST_TEST_ID = 900

now = datetime.datetime.now()
# The file name carries the current date and hour, e.g. output_2015031014.csv.
with open('output_{0}.csv'.format(now.strftime('%Y%m%d%H')), 'w') as outFile:
    outFile.write('Id,y\n')
    # predict() returned a single-column 2-D array here; flatten it so every
    # element is a scalar instead of formatting 1-element arrays with %f.
    for idx, testyelem in enumerate(np.ravel(testy)):
        outFile.write("%d,%.100f\n" % (idx + FIRST_TEST_ID, testyelem))