In [114]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from sklearn import svm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from src.features.build_features import get_roast_classification_dataset, get_vocab

# Linear Algorithms

* Logistic Regression
* Naive Bayes
* Linear Discriminant Analysis

### Logistic Regression

In [82]:
# Load the roast-classification data: X holds the model inputs, y the roast labels.
# NOTE(review): X is later passed to a Keras Tokenizer, so it is presumably raw
# text — confirm in src.features.build_features.
X, y = get_roast_classification_dataset()

In [83]:
# Hold out 20% of the examples as a test set; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=23)
X_train, X_test, y_train, y_test = split_parts

In [84]:
# Generate the vocab list from the TRAINING split only, so that no test-set text
# influences the feature space.
# NOTE(review): min_count=5 presumably drops words seen fewer than 5 times —
# confirm against get_vocab.
vocab = get_vocab(X_train, min_count=5)

In [85]:
# Build a Keras tokenizer whose word index is learned from the training vocabulary only
t = Tokenizer(lower=True, split=' ')
t.fit_on_texts(vocab)
# Encode both splits as per-document word-count vectors.
# NOTE: X_train / X_test are rebound from texts to count matrices here, so this
# cell should not be re-run without re-running the split cell first.
X_train, X_test = [t.texts_to_matrix(texts, mode='count') for texts in (X_train, X_test)]

In [86]:
# Sanity check: (number of training documents, number of count-vector columns)
X_train.shape

(4492, 1584)

In [87]:
# Vocabulary size. The count matrix has one more column than this because the
# Keras Tokenizer reserves index 0 and never assigns it to a word.
len(vocab)

1583

In [88]:
# Baseline logistic regression with scikit-learn's default settings. Note that the
# default is an L2 penalty with C=1.0, so this baseline is still regularized; an L1
# penalty (a natural fit for sparse count features) is tried afterwards.
# class_weight='balanced' reweights the classes because this is an imbalanced
# classification problem.
baseline_params = dict(class_weight='balanced', random_state=27, n_jobs=-1)
lr_clf = LogisticRegression(**baseline_params)
lr_clf.fit(X_train, y_train)

LogisticRegression(class_weight='balanced', n_jobs=-1, random_state=27)

In [89]:
# Accuracy of the baseline model on the held-out test split
# (the same quantity a classifier's .score() reports)
accuracy_score(y_true=y_test, y_pred=lr_clf.predict(X_test))

0.4541406945681211

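About 40% of the training examples fall into the medium-light roast class, so this is an imbalanced classification problem: always predicting that class would already be right roughly 40% of the time.  At about 45% accuracy, the logistic regression does only a little better than that baseline.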

In [90]:
# Try with L1 regularization, sweeping the inverse regularization strength C
# (smaller C = stronger penalty).
# Each setting is also scored with 5-fold cross-validation on the TRAINING split:
# picking C from test-set accuracy alone would tune the model to the test set and
# make the reported test accuracy optimistic. Use the 'cv accuracy' column to
# choose C; the test accuracy is printed for reference only.
for c in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    lr_clf = LogisticRegression(penalty='l1', C=c, solver='liblinear', class_weight='balanced', random_state=27)
    # cross_val_score clones the estimator, so lr_clf itself is still unfitted afterwards
    cv_accuracy = cross_val_score(lr_clf, X_train, y_train, cv=5, scoring='accuracy').mean()
    lr_clf.fit(X_train, y_train)
    accuracy = lr_clf.score(X_test, y_test)
    print('accuracy: ', round(accuracy, 3), 'regularization parameter C: ', c,
          'cv accuracy: ', round(cv_accuracy, 3))

accuracy:  0.456 regularization parameter C:  0.1
accuracy:  0.476 regularization parameter C:  0.2
accuracy:  0.484 regularization parameter C:  0.3
accuracy:  0.489 regularization parameter C:  0.4
accuracy:  0.492 regularization parameter C:  0.5
accuracy:  0.501 regularization parameter C:  0.6
accuracy:  0.497 regularization parameter C:  0.7
accuracy:  0.492 regularization parameter C:  0.8
accuracy:  0.492 regularization parameter C:  0.9


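The best test-set accuracy is about 50%, at regularization parameter C = 0.6.  Because C was chosen by comparing test-set accuracies, this figure is an optimistic estimate of how the model would do on new data.
Which features are important?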

In [91]:
# Refit the L1-penalized model at the C value that scored best in the sweep
best_c = 0.6
lr_clf = LogisticRegression(
    penalty='l1',
    C=best_c,
    solver='liblinear',
    class_weight='balanced',
    random_state=27,
)
lr_clf.fit(X_train, y_train)
# Test-set accuracy of the refitted model
lr_clf.score(X_test, y_test)

0.5013357079252003

### Naive Bayes

In [66]:
# Rebuild the count-vector features from the raw dataset for this section
X, y = get_roast_classification_dataset()
# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=23
)
# The vocabulary and the tokenizer are derived from the TRAINING split only
vocab = get_vocab(X_train, min_count=5)
t = Tokenizer(lower=True, split=' ')
t.fit_on_texts(vocab)
# Turn every document in both splits into a vector of word counts
X_train, X_test = [t.texts_to_matrix(docs, mode='count') for docs in (X_train, X_test)]

In [68]:
# Fit a Gaussian Naive Bayes classifier on the count vectors and score it on the test split.
# NOTE(review): GaussianNB models each feature as a continuous Gaussian; for word
# counts a multinomial variant may be a better match — worth trying.
gnb = GaussianNB().fit(X_train, y_train)
y_pred = gnb.predict(X_test)
accuracy_score(y_test, y_pred)

0.23686553873552982

The Naive Bayes model does very poorly, at about 24% accuracy.  For comparison, random guessing among 6 classes (if they were equally weighted) would be about 17%.

### Linear Discriminant Analysis

In [93]:
# Recreate the count-vector features for the LDA experiments
X, y = get_roast_classification_dataset()
# Keep the raw texts under their own names so the matrices below do not shadow them
train_texts, test_texts, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)
# Vocabulary comes from the TRAINING texts only
vocab = get_vocab(train_texts, min_count=5)
# Tokenizer is fitted on that training-derived vocabulary
t = Tokenizer(lower=True, split=' ')
t.fit_on_texts(vocab)
# Per-document word-count vectors for each split
X_train = t.texts_to_matrix(train_texts, mode='count')
X_test = t.texts_to_matrix(test_texts, mode='count')

In [94]:
# LDA with default settings (no shrinkage) as a reference point
clf = LinearDiscriminantAnalysis().fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Fraction of test examples classified correctly
accuracy_score(y_test, y_pred)

0.4692787177203918

In [96]:
# Sweep the shrinkage strength for LDA (shrinkage requires a non-default solver;
# 'eigen' is used here).
# Each setting is also scored with 5-fold cross-validation on the TRAINING split:
# choosing the shrinkage from test-set accuracy alone would tune the model to the
# test set and make the reported test accuracy optimistic. Use the 'cv accuracy'
# column to choose the parameter; the test accuracy is printed for reference only.
for param in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    clf = LinearDiscriminantAnalysis(shrinkage=param, solver='eigen')
    # cross_val_score clones the estimator, so clf itself is still unfitted afterwards
    cv_accuracy = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy').mean()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print('accuracy: ', round(accuracy, 3), 'shrinkage regularization parameter: ', param,
          'cv accuracy: ', round(cv_accuracy, 3))

accuracy:  0.521 shrinkage regularization parameter:  0.1
accuracy:  0.522 shrinkage regularization parameter:  0.2
accuracy:  0.516 shrinkage regularization parameter:  0.3
accuracy:  0.507 shrinkage regularization parameter:  0.4
accuracy:  0.5 shrinkage regularization parameter:  0.5
accuracy:  0.491 shrinkage regularization parameter:  0.6
accuracy:  0.479 shrinkage regularization parameter:  0.7
accuracy:  0.458 shrinkage regularization parameter:  0.8
accuracy:  0.441 shrinkage regularization parameter:  0.9


The LDA without regularization performs about as well as a logistic regression without regularization.  Regularized LDA performs on par with a regularized logistic regression.

## Non-Linear Algorithms

* k-nearest neighbors
* support vector machine

### K-Nearest Neighbors

Surprisingly, this model performs pretty much on par with the LDA and the logistic regression.  Maybe a little bit worse.

In [104]:
# k-nearest-neighbors classifier with k=6; n_jobs=-1 parallelizes the neighbor search.
# Uses the count-vector X_train / y_train currently held in the kernel.
knn_settings = {'n_neighbors': 6, 'n_jobs': -1}
neigh = KNeighborsClassifier(**knn_settings)
neigh.fit(X_train, y_train)

KNeighborsClassifier(n_jobs=-1, n_neighbors=6)

In [105]:
# Predict a class label for every test document with the fitted k-NN model,
# then display the predictions as the cell output.
y_pred = neigh.predict(X_test)
y_pred

array([2, 2, 1, ..., 2, 2, 1], dtype=int8)

In [106]:
# Share of test documents whose k-NN prediction matches the true label
accuracy_score(y_test, y_pred)

0.46126447016918964

### Support Vector Machine

SVMs are supposed to be effective in high-dimensional spaces. That is promising since my data is in many dimensions.

I need to scale my inputs for the SVM so that each feature has zero mean and unit variance.  SVMs are not scale invariant.

In [112]:
# Peek at the first training row to see the raw (unscaled) count values
X_train[0]

array([0., 1., 1., ..., 0., 0., 0.])

In [119]:
# Standardize each feature column before fitting the SVM.
scaler = StandardScaler()
# Fit only on TRAINING DATA so that no statistics from the test split leak into
# the scaling; the same fitted scaler is then applied to both splits.
scaler.fit(X_train)

StandardScaler()

In [117]:
# Apply the training-fitted scaler to both splits
X_scaled_train, X_scaled_test = [scaler.transform(split) for split in (X_train, X_test)]

In [122]:
# Support vector classifier on the standardized features.
# class_weight='balanced' compensates for the unbalanced class distribution;
# C=1 keeps the default regularization strength.
svc_settings = {'class_weight': 'balanced', 'C': 1}
clf = SVC(**svc_settings)
clf.fit(X_scaled_train, y_train)

SVC(C=1, class_weight='balanced')

In [123]:
# Predict on the test split, using the same scaled representation the SVM was
# trained on, then display the predictions as the cell output.
y_pred = clf.predict(X_scaled_test)
y_pred

array([3, 2, 2, ..., 3, 0, 1], dtype=int8)

In [124]:
# Share of test documents the SVM classifies correctly
accuracy_score(y_test, y_pred)

0.4416740872662511