In [1]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
import numpy as np
import pickle as pkl

# Replicate original paper results

# Use FastText vectors for classification

### Load the model input
This input is constructed using `construct_input_data.ipynb`

In [20]:
data = pd.read_csv('../data/food_vectors_context_only_mixed_1_full.csv',header=None)
data = data.rename(columns={0:'user',1:'date',102:'goal',103:'actual',104:'label'})

### Preprocessing and aggregation by user

We discard days where the goal or actual caloric intake was not between 1200 and 5000 calories, a reasonable range for most people

In [22]:
data = data[data['goal'] > 1200]
data = data[data['goal'] < 5000]
data = data[data['actual'] > 1200]
data = data[data['actual'] < 5000]

Following the original paper, we discard the 'on_target' label, simplifying the problem to binary classification

In [24]:
data = data[data['label'] != 'on_target']

#### Aggregate the data by user
We group the data by user, aggregate the vectors using the mean, and aggregate the users by taking the modal class. I.e. if most days that are recorded for a given user have been labeled "below", we label this user as "below"

In [25]:
groups = data.groupby('user')
grouped_y = groups['label'].agg(lambda x:x.value_counts().index[0])
grouped_x = groups.mean()
grouped = grouped_x.copy()
grouped['label'] = grouped_y

#### Display the number of users in each class
The original paper balances the number of above and below instances (approximately). We found that our results here are also approximately balanced, so we do not do anything further to correct for class imbalance

In [27]:
print("Above: " + str(sum(grouped['label'] == 'above')))
print("Below: " + str(sum(grouped['label'] == 'below')))

4175

Split the data into model input and output

In [28]:
y = grouped.loc[:,'label']
X = grouped.iloc[:,1:101]
# y = grouped.iloc[0:20,102]
# X = grouped.iloc[0:20,1:101]

#### Scoring measures for classification
Following the original paper, we score accuracy, precision, and recall

In [29]:
scoring = {'accuracy':'accuracy',
           'precision':make_scorer(precision_score,average='macro'),
           'recall':make_scorer(recall_score,average='macro')}

In [30]:
svc = SVC(kernel='linear')
scores = cross_validate(svc,X,y,scoring=scoring,cv=5)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [31]:
scores

{'fit_time': array([ 0.00200009,  0.00200009,  0.00099993,  0.00300002,  0.00199986]),
 'score_time': array([ 0.00300002,  0.00099993,  0.00300026,  0.00200009,  0.00300026]),
 'test_accuracy': array([ 0.75,  0.75,  0.75,  0.75,  0.75]),
 'test_precision': array([ 0.375,  0.75 ,  0.375,  0.375,  0.375]),
 'test_recall': array([ 0.5       ,  0.83333333,  0.5       ,  0.5       ,  0.5       ]),
 'train_accuracy': array([ 1.,  1.,  1.,  1.,  1.]),
 'train_precision': array([ 1.,  1.,  1.,  1.,  1.]),
 'train_recall': array([ 1.,  1.,  1.,  1.,  1.])}

In [32]:
for metric in ['test_accuracy','test_precision','test_recall']:
    print('\n' + metric)
    print('Average ' + metric)
    print(np.mean(scores[metric]))
    print('Std of ' + metric)
    print(np.std(scores[metric]))

0.75
0.0
0.45
0.15
0.566666666667
0.133333333333


Save the scores and model

In [62]:
pkl.dump(svc,open('../predictive_models/svm_model.pkl','wb'))
pkl.dump(scores,open('../predictive_models/svm_5_fold_scores.pkl','wb'))

## Feature importances

In [17]:
def f_importances(coef, names):
    imp = coef
    imp,names = zip(*sorted(zip(imp,names)))
    plt.barh(range(len(names)), imp, align='center')
    plt.yticks(range(len(names)), names)
    plt.show()

In [34]:
f_importances(svc.coef_, X.columns[1:101])

AttributeError: 'SVC' object has no attribute 'dual_coef_'