In [None]:
import numpy as np
import pandas as pd
import re

import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


from sklearn.svm import SVR, SVC
from sklearn.feature_selection import SelectPercentile, SelectKBest, VarianceThreshold
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize

In [None]:
# Load the raw CSV straight from its hosted URL into a DataFrame.
data_url = 'https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/epi_r.csv'
raw_data = pd.read_csv(data_url)

In [None]:
# Normalise the column names into identifier-like labels.
#
# Each name is split on runs of non-word characters *and* underscores; only the
# purely alphabetic pieces are kept (pieces containing digits are discarded)
# and they are re-joined with '_'.
#
# Splitting on '_' as well is what makes this cell safe to re-run: with a
# pattern that leaves underscores alone, an already-cleaned name such as
# 'minute_meals' would be seen as a single non-alphabetic token and collapse
# to the empty string on the second execution.
labels = [
    '_'.join(part for part in re.split(r'[\W_]+', column) if part.isalpha())
    for column in raw_data.columns
]
raw_data.columns = labels
      

In [None]:
# Peek at the first five rows to sanity-check the renamed columns.
raw_data.head(n=5)

In [None]:
# Number of missing values per column; display only the columns that have any.
null_count = raw_data.isna().sum()
null_count.loc[null_count.gt(0)]

In [None]:
# Distinct rating values present in the data, in ascending order.
target = np.sort(raw_data['rating'].unique())
target

#### Notes

There are 4117 rows in which all four nutrition values (`calories`, `protein`, `fat`, `sodium`) are null.



In [None]:
# Columns excluded from the feature matrix: the recipe title, the rating
# (used to build the targets) and the four nutrition columns.
drops = ['title', 'rating', 'calories', 'protein', 'fat', 'sodium']
X = raw_data.drop(columns=drops)


In [None]:
# --- Targets -----------------------------------------------------------------
# Y     : ordinal letter grade per recipe. The distinct rating values are
#         sorted ascending and paired with 'A', 'B', ... so the lowest rating
#         becomes 'A'.
# Y_mcs : one-hot indicator matrix of Y with one column per letter.
# Y_bny : binary target, 1 when the rating is at least 3.5, otherwise 0.
grade_letters = list('ABCDEFGH')

# Build the rating -> letter lookup once instead of once per row.
rating_to_grade = dict(zip(sorted(raw_data.rating.unique()), grade_letters))
Y = raw_data.rating.map(rating_to_grade)

# Fail loudly if a rating has no letter (more distinct ratings than letters, or
# missing ratings) rather than silently carrying NaN labels forward.
if Y.isna().any():
    raise KeyError('some rating values could not be mapped to a letter grade')

# The classes must be the same labels that Y actually contains (letters, not
# integer positions); otherwise no label matches and every row is all zeros.
# `classes` is passed by keyword, as required by current scikit-learn releases.
Y_mcs = label_binarize(Y, classes=grade_letters)

Y_bny = np.where(raw_data.rating >= 3.5, 1, 0)

In [None]:
# Feature-selection + classifier pipeline shared by both experiments.
#
# The steps are left unfitted here on purpose: fitting the pipeline fits every
# step on the training data it is given, so fitting the selector up front on
# the full, unsplit X would be redundant work, would look at rows that later
# end up in the test split, and would bypass the constant-column filter that
# precedes it in the pipeline.
ftr_vth = VarianceThreshold(threshold=0.0)  # drop zero-variance columns
ftr_skb = SelectKBest(k=10)                 # keep the 10 highest-scoring features
clf_svc = SVC(gamma='auto')

clf = Pipeline([('vth', ftr_vth), ('skb', ftr_skb), ('svc', clf_svc)])

# One binary copy of the pipeline per column of a multi-column target.
ovr_clf = OneVsRestClassifier(clf)

In [None]:
# Hold out 20% of the rows for testing.
#
# A fixed seed makes the notebook reproducible under Restart & Run All, and
# using the same seed for both calls puts the same rows in the train/test
# partitions of the binary and the multiclass experiments.
split_seed = 42
X_trn_bny, X_tst_bny, y_trn_bny, y_tst_bny = train_test_split(
    X, Y_bny, test_size=0.2, random_state=split_seed
)
X_trn_mcs, X_tst_mcs, y_trn_mcs, y_tst_mcs = train_test_split(
    X, Y_mcs, test_size=0.2, random_state=split_seed
)

In [None]:
# Binary experiment: fit the pipeline on the binary training split and report
# its score on the held-out rows.
# The variable names must match those produced by the train/test split cell
# (`*_trn_*` / `*_tst_*`); the `*_train_*` / `*_test_*` spellings are undefined.
clf.set_params(skb__k=10).fit(X_trn_bny, y_trn_bny)
clf.score(X_tst_bny, y_tst_bny)

In [None]:
# Multiclass experiment: fit the one-vs-rest wrapper on the indicator-matrix
# training split, then report its score on the held-out rows.
# `fit` returns the fitted estimator, so the two calls can be chained.
ovr_clf.fit(X_trn_mcs, y_trn_mcs).score(X_tst_mcs, y_tst_mcs)