In [131]:
import pandas as pd
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pickle as pkl
from xgboost import XGBClassifier
import json
from svmclassifier import SVMClassifier
from nnclassifier import NNClassifier


## Load the one-hot encoded data
Use one-hot token method, since this performed better than category using the taxonomy  
`onehot_feature.json` is created using `onehot.py`

In [115]:
with open("../data/onehot_feature.json", 'r') as f:
    users = json.load(f)

### Remove "on target" class

In [130]:
data = [user for user in users if user["label"] != 2]

In [133]:
labels = [user["label"] for user in data]
X = [np.array(user["feature"]) for user in data]
le = LabelEncoder()
y = le.fit_transform(labels)

## SVM with one-hot encoding input

In [140]:
scoring = {'accuracy':'accuracy',
           'precision':make_scorer(precision_score,average='micro'),
           'recall':make_scorer(recall_score,average='micro')}

In [141]:
svc = SVC(kernel='linear')
svc_scores_one_hot = cross_validate(svc,X,y,scoring=scoring,cv=3)

In [142]:
for metric in ['test_accuracy','test_precision','test_recall']:
    print('\n' + metric)
    print('Average ' + metric)
    print(round(np.mean(svc_scores_one_hot[metric]),2))
    print('Std of ' + metric)
    print(round(np.std(svc_scores_one_hot[metric]),4))


test_accuracy
Average test_accuracy
0.6608165790049986
Std of test_accuracy
0.007556640457673412

test_precision
Average test_precision
0.6608165790049986
Std of test_precision
0.007556640457673412

test_recall
Average test_recall
0.6608165790049986
Std of test_recall
0.007556640457673412


## MLP with one-hot encoding input

In [None]:
mlp = MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
                             beta_2=0.999, early_stopping=False, epsilon=1e-08,
                             hidden_layer_sizes=(200), learning_rate='constant',
                             learning_rate_init=0.001, max_iter=200, momentum=0.9,
                             nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
                             solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
                             warm_start=False)

In [None]:
mlp_scores_one_hot = cross_validate(mlp,X,y,scoring=scoring,cv=3)

## Prepare vectorized data for classification

### Load the model input
This input is constructed using `construct_classifier_input_data.ipynb`

In [64]:
data = pd.read_csv('../data/food_vectors_w2v.csv',header=None)
data = data.rename(columns={0:'user',1:'date',102:'goal',103:'actual',104:'label'})

### Preprocessing and aggregation by user

Following the original paper, we discard days where less than 100 calories were logged

In [65]:
data = data[data['actual'] > 100]

### Remove "on target" class
Following the original paper, we discard the 'on_target' label, simplifying the problem to binary classification

In [66]:
data = data[data['label'] != 'on_target']

In [148]:
# data.head()

### Aggregate the data by user
We group the data by user, aggregate the vectors using the mean, and aggregate the users by taking the modal class.  
I.e. if most days that are recorded for a given user have been labeled "below", we label this user as "below"

In [68]:
groups = data.groupby('user')

Display the number of users with more than 30 days logged

In [None]:
len(groups['user'].count()[lambda x: x >= 30].index)

Construct a new dataframe of the aggregate data

In [73]:
grouped_y = groups['label'].agg(lambda x:x.value_counts().index[0])
grouped_x = groups.mean()
grouped = grouped_x.copy()
grouped['label'] = grouped_y
grouped_above_30 = grouped.loc[groups['user'].count()[lambda x: x >= 30].index,:]

### Check the number of users in each class

In [76]:
print("Above: " + str(sum(grouped_above_30['label'] == 'above')))
print("Below: " + str(sum(grouped_above_30['label'] == 'below')))

Above: 1303
Below: 3478


### Balance the classes
We should try classification with the classes balanced, as the original paper did

In [None]:
# balance the classes here

### Split the data into model input and output

In [145]:
# grouped.head()

In [78]:
y = grouped_above_30.loc[:,'label']
X = grouped_above_30.iloc[:,0:100]

In [146]:
# X.head()

In [147]:
# y.head()

## SVM using vector input

In [143]:
scoring = {'accuracy':'accuracy',
           'precision':make_scorer(precision_score,average='micro'),
           'recall':make_scorer(recall_score,average='micro')}

In [144]:
svc = SVC(kernel='linear')
svc_scores = cross_validate(svc,X,y,scoring=scoring,cv=5)

In [151]:
for metric in ['test_accuracy','test_precision','test_recall']:
    print('\n' + metric)
    print('Average ' + metric)
    print(round(np.mean(svc_scores[metric]),2))
    print('Std of ' + metric)
    print(round(np.std(svc_scores[metric]),4))


test_accuracy
Average test_accuracy
0.66
Std of test_accuracy
0.011

test_precision
Average test_precision
0.66
Std of test_precision
0.011

test_recall
Average test_recall
0.66
Std of test_recall
0.011


## FCN using vector input

In [109]:
mlp = MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
                             beta_2=0.999, early_stopping=False, epsilon=1e-08,
                             hidden_layer_sizes=(200), learning_rate='constant',
                             learning_rate_init=0.001, max_iter=200, momentum=0.9,
                             nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
                             solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
                             warm_start=False)
mlp_scores = cross_validate(mlp,X,y,scoring=scoring,cv=3,verbose=1)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   16.4s finished


In [110]:
for metric in ['test_accuracy','test_precision','test_recall']:
    print('\n' + metric)
    print('Average ' + metric)
    print(np.mean(mlp_scores[metric]))
    print('Std of ' + metric)
    print(np.std(mlp_scores[metric]))


test_accuracy
Average test_accuracy
0.7134519426356559
Std of test_accuracy
0.004883994427665733

test_precision
Average test_precision
0.6333429769555584
Std of test_precision
0.0038402151501620244

test_recall
Average test_recall
0.6261872793181188
Std of test_recall
0.0034801098281369482


# Save models and performance results to file

In [62]:
pkl.dump(svc,open('../predictive_models/svm_model.pkl','wb'))
pkl.dump(scores,open('../predictive_models/svm_5_fold_scores.pkl','wb'))

# Feature importances using vectors

In [153]:
def f_importances(coef, names):
    imp = coef
    imp,names = zip(*sorted(zip(imp,names)))
    plt.barh(range(len(names)), imp, align='center')
    plt.yticks(range(len(names)), names)
    plt.show()

In [155]:
f_importances(svc.coef_, X.columns[0:100])

AttributeError: 'SVC' object has no attribute 'dual_coef_'