In [1]:
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder

In [13]:
import h5py
from collections import Counter

In [3]:
# Load the train/test feature matrices and label arrays from the HDF5 file.
# `dataset[()]` reads the full dataset into a NumPy array; it replaces the
# `.value` property, which is deprecated and was removed in h5py 3.0.
with h5py.File('data/all_data.hdf5', "r", libver='latest') as f:
    Xtrain = f['X_train'][()]
    ytrain = f['y_train'][()]
    Xtest = f['X_test'][()]
    ytest = f['y_test'][()]

In [4]:
# Display the training feature matrix as loaded (NumPy abbreviates the repr).
Xtrain

array([[  5,  11,  13, ..., 201, 278, 294],
       [  5,  11,  13, ..., 205, 232, 279],
       [  5,  11,  13, ..., 209, 231, 279],
       ..., 
       [  2,  11,  13, ..., 201, 226, 279],
       [  2,  11,  13, ..., 201, 224, 279],
       [  2,  11,  13, ..., 223, 224, 279]], dtype=int32)

In [5]:
# Display the training label array as loaded; the output shows two columns per row.
ytrain

array([[9, 1],
       [8, 1],
       [5, 1],
       ..., 
       [1, 1],
       [1, 1],
       [1, 1]], dtype=int32)

In [6]:
# Multinomial logistic regression, optimised with L-BFGS and a very tight
# stopping tolerance.
logistic = linear_model.LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    tol=1e-10,
)

In [7]:
# Restrict both feature matrices to their first ten columns (indices 0-9).
Xtrain, Xtest = (features[:, range(0, 10)] for features in (Xtrain, Xtest))

In [8]:
# Fit the one-hot encoder on the training features only, then encode both
# splits with that same fitted encoder.
encoder = OneHotEncoder()
encoder.fit(Xtrain)
Xtrainsparse = encoder.transform(Xtrain)
Xtestsparse = encoder.transform(Xtest)

In [9]:
# Keep only the first label column as the classification target.
# NOTE: this rebinds ytrain/ytest to 1-D arrays, so the cell cannot be run twice.
ytrain, ytest = ytrain[:, 0], ytest[:, 0]

In [10]:
# Fit the logistic regression on the one-hot encoded training features.
logistic.fit(Xtrainsparse, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          penalty='l2', random_state=None, solver='lbfgs', tol=1e-10,
          verbose=0)

In [17]:
# Mean accuracy on the training split.
logistic.score(Xtrainsparse, ytrain)

0.39589026697498425

In [11]:
# Mean accuracy on the held-out test split.
logistic.score(Xtestsparse, ytest)

0.3489971346704871

In [14]:
# How often each class is predicted on the test split.
Counter(logistic.predict(Xtestsparse))

Counter({1: 610,
         2: 53,
         3: 457,
         4: 72,
         5: 103,
         6: 4,
         7: 1,
         8: 81,
         9: 163,
         10: 50,
         11: 76,
         13: 26,
         14: 22,
         15: 4,
         16: 1,
         18: 17,
         21: 1,
         22: 1,
         24: 2,
         33: 1})

#### What if you predicted the most frequent class every time?

In [15]:
# Fraction of test labels equal to class 1.
sum(1 for label in ytest if label == 1) / float(len(ytest))

0.21891117478510028

In [16]:
# Fraction of training labels equal to class 1.
sum(1 for label in ytrain if label == 1) / float(len(ytrain))

0.1877233550557074

So always guessing the I chord (class 1) would give about 19% accuracy on the training set and about 22% accuracy on the test set.

## Naive Bayes

In [19]:
from sklearn import naive_bayes

In [20]:
# Multinomial naive Bayes with smoothing alpha=0.1; class priors are learned
# from the data (fit_prior=True, no fixed class_prior).
nb = naive_bayes.MultinomialNB(
    alpha=0.1,
    fit_prior=True,
    class_prior=None,
)

In [21]:
# Fit naive Bayes on the same one-hot training features used for the logistic model.
nb.fit(Xtrainsparse, ytrain)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [22]:
# Mean accuracy of naive Bayes on the test split.
nb.score(Xtestsparse, ytest)

0.30487106017191978

In [23]:
# Mean accuracy of naive Bayes on the training split.
nb.score(Xtrainsparse, ytrain)

0.35563380281690143

In [220]:
# How often naive Bayes predicts each class on the test split.
Counter(nb.predict(Xtestsparse))

Counter({1: 431,
         2: 111,
         3: 356,
         4: 83,
         5: 173,
         6: 44,
         7: 17,
         8: 96,
         9: 138,
         10: 59,
         11: 67,
         12: 2,
         13: 30,
         14: 32,
         15: 5,
         16: 6,
         17: 3,
         18: 51,
         20: 3,
         21: 6,
         22: 17,
         24: 2,
         28: 1,
         30: 1,
         32: 4,
         33: 4,
         34: 1,
         36: 1,
         53: 1})

## Oracle experiment

In [221]:
# Reload fresh, untruncated copies of the data for the oracle experiment.
# `dataset[()]` reads the full dataset into a NumPy array; it replaces the
# `.value` property, which is deprecated and was removed in h5py 3.0.
with h5py.File('data/all_data.hdf5', "r", libver='latest') as f:
    Xtrainor = f['X_train'][()]
    ytrainor = f['y_train'][()]
    Xtestor = f['X_test'][()]
    ytestor = f['y_test'][()]

In [222]:
# Keep only the first label column as the classification target.
# NOTE: this rebinds the names to 1-D arrays, so the cell cannot be run twice.
ytrainor, ytestor = ytrainor[:, 0], ytestor[:, 0]

In [223]:
# Rebind `encoder` to a new one-hot encoder fitted on ALL training columns
# (no column restriction here), then encode both splits with it.
encoder = OneHotEncoder()
encoder.fit(Xtrainor)
Xtrainorsparse = encoder.transform(Xtrainor)
Xtestorsparse = encoder.transform(Xtestor)

In [224]:
# Rebind `logistic` to a fresh, unfitted model with the same settings as the
# earlier logistic regression.
logistic = linear_model.LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    tol=1e-10,
)

In [225]:
# Fit on the one-hot encoding of the full (unrestricted) training feature set.
logistic.fit(Xtrainorsparse, ytrainor)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          penalty='l2', random_state=None, solver='lbfgs', tol=1e-10,
          verbose=0)

In [226]:
# Mean test accuracy when every feature column is taken from the data file.
logistic.score(Xtestorsparse, ytestor)

0.430945558739255

In [235]:
# Smallest and largest values found in column 9 of the test features.
# NOTE(review): the evaluation loop further down overwrites column 10, not
# column 9, using prevharmmin as an offset -- confirm which column is intended.
# prevharmax is not used by any cell visible in this notebook.
prevharmmin, prevharmax = Xtestor[:, 9].min(), Xtestor[:, 9].max()

In [247]:
# Free-running evaluation: instead of the value stored in column 10, feed the
# model its own previous prediction (shifted by prevharmmin - 1) and measure
# the resulting test accuracy.
# NOTE(review): column 10 is presumably the previous-harmony feature, yet
# prevharmmin is derived from column 9 in an earlier cell -- confirm the offset.
correct = 0.
prev = None
for idx, row in enumerate(Xtestor):
    # Work on a copy so the test matrix itself is never modified.
    x = row.copy()
    # Only the very first row has no previous prediction to feed back
    # (the original `idx > 1` also skipped row 1 by mistake).
    if prev is not None:
        x[10] = prev + prevharmmin - 1
    # The encoder expects a 2-D (n_samples, n_features) array.
    inx = encoder.transform(x.reshape(1, -1))
    # Predict once and reuse the result for both scoring and feedback.
    pred = logistic.predict(inx)[0]
    if pred == ytestor[idx]:
        correct += 1
    prev = pred
# `correct` is a float, so this is true division on Python 2 as well.
print(correct / len(Xtestor))

0.327220630372
