In [2]:
import numpy as np
import pandas as pd
import re

import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.svm import SVR, SVC
from sklearn.feature_selection import SelectPercentile, SelectKBest, VarianceThreshold
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize

In [3]:
raw_data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/epi_r.csv')

In [4]:
labels = []
for seq in  [re.split(r'\W|\s', label) for label in raw_data.columns]:
    labels.append('_'.join([s for s in seq if s.isalpha()]))
raw_data.columns = labels    
      

In [5]:
raw_data.head()

Unnamed: 0,title,rating,calories,protein,fat,sodium,cakeweek,wasteless,minute_meals,ingredient_recipes,...,yellow_squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack_week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Mahi-Mahi in Tomato Olive Sauce,5.0,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Count nulls 
null_count = raw_data.isnull().sum()
null_count[null_count>0]

calories    4117
protein     4162
fat         4183
sodium      4119
dtype: int64

In [7]:
target = raw_data.rating.unique()
target.sort()
target

array([0.   , 1.25 , 1.875, 2.5  , 3.125, 3.75 , 4.375, 5.   ])

In [10]:
raw_data.rating.describe()

count    20052.000000
mean         3.714467
std          1.340829
min          0.000000
25%          3.750000
50%          4.375000
75%          4.375000
max          5.000000
Name: rating, dtype: float64

#### Notes

there are 4117 rows where all four values [calories, protein, fat, sodium] are null



In [8]:
drops = ['title', 'rating', 'calories', 'protein', 'fat', 'sodium']
X = raw_data.drop(drops, axis=1)


In [9]:
raw_data.rating.unique()

array([2.5  , 4.375, 3.75 , 5.   , 3.125, 1.875, 0.   , 1.25 ])

In [11]:
Y = raw_data.rating.apply(lambda x : {k:v for (k,v) in list(zip(sorted(list(raw_data.rating.unique())), 'ABCDEFGH'))}[x])
Y_mcs = label_binarize(Y, list(range(8)))
Y_bny = np.where(raw_data.rating >= 3.5, 1,0 )

In [12]:
ftr_vth = VarianceThreshold(threshold=0.0)
ftr_skb = SelectKBest( k=10).fit(X, Y)
clf_svc = SVC(gamma = 'auto')

clf = Pipeline([('vth', ftr_vth), ('skb', ftr_skb), ('svc', clf_svc)])
ovr_clf = OneVsRestClassifier(clf)

In [16]:
X_trn_bny, X_tst_bny, y_trn_bny, y_tst_bny = train_test_split(X, Y_bny, test_size=0.2)
#X_trn_mcs, X_tst_mcs, y_trn_mcs, y_tst_mcs = train_test_split(X, Y_mcs, test_size=0.2)

In [17]:
clf.set_params(skb__k = 10).fit(X_trn_bny, y_trn_bny)
clf.score(X_tst_bny, y_tst_bny)

0.8070306656694092

In [15]:
ovr_clf.fit(X_trn_mcs, y_trn_mcs)
ovr_clf.score(X_tst_mcs, y_tst_mcs)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


1.0