In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import svm
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.model_selection import cross_val_score

In [2]:
df = pd.read_csv('../../datasets/recipes/epi_r.csv')
df.head(3)

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
X = df[list(set(list(df.columns)) - set(['rating', 'title']))]
y = df['rating']

In [4]:
# y_class classifies high ratings (>4) and low raings (<4)
y_class = y.copy()
y_class[y_class < 4] = 0
y_class[y_class > 4] = 1
y_class.value_counts()

1.0    10738
0.0     9314
Name: rating, dtype: int64

In [11]:
# reduce feature space to 30
# drop nan columns
X_null = X.isnull().sum()
null_cols = list(X_null[X_null > 0].index)
print ("Dropping {} columns due to NaNs\n{}".format(len(null_cols), null_cols))
X_feat = X.drop(columns = null_cols)

Dropping 4 columns due to NaNs
['calories', 'sodium', 'fat', 'protein']


In [12]:
# drop heavily skewed binaries
low_means = []
for col in X_feat.columns:
    if X_feat[col].mean() < 0.1:
        low_means.append(col)
print ("Dropping {} columns due to insignificant mean".format(len(low_means)))
display(low_means)
X_feat = X_feat.drop(columns = low_means)

Dropping 648 columns due to insignificant mean


['bastille day',
 'cheese',
 'fat free',
 "valentine's day",
 'nebraska',
 'microwave',
 'kansas city',
 'colorado',
 'easter',
 'butterscotch/caramel',
 'paleo',
 'passion fruit',
 'soy',
 'quick and healthy',
 'jalapeño',
 'pizza',
 'ramadan',
 'lunar new year',
 'bran',
 'sangria',
 'aperitif',
 'oklahoma',
 'seattle',
 'potluck',
 'cranberry sauce',
 'bitters',
 'ireland',
 "new year's day",
 'waffle',
 'fish',
 'illinois',
 'cupcake',
 'sourdough',
 'beef tenderloin',
 'grill/barbecue',
 'celery',
 'parmesan',
 'frittata',
 'guava',
 'suzanne goin',
 'mushroom',
 'collard greens',
 'ground lamb',
 'lunch',
 'strawberry',
 'flaming hot summer',
 'jícama',
 'leek',
 'gin',
 'tequila',
 'salmon',
 'pumpkin',
 'fortified wine',
 'swiss cheese',
 'texas',
 'apricot',
 'advance prep required',
 'alaska',
 'houston',
 'japan',
 'pea',
 'pine nut',
 'gouda',
 'mustard greens',
 'condiment',
 'ricotta',
 'raspberry',
 'parade',
 'wisconsin',
 'broil',
 'cinco de mayo',
 'chestnut',
 'las v

In [13]:
# feature list
print ("Number of features: {}".format(len(X_feat.columns)))

Number of features: 26


# SVClassifier

In [14]:
# may take some time to run, try running once and performing other actions in other code blocks
start = time.time()
svc_model = svm.SVC()
fit = svc_model.fit(X_feat, y_class)
y_pred = svc_model.predict(X_feat)
print ("Runtime: %0.2f seconds" % (time.time() - start))

Runtime: 50.26 seconds


In [15]:
start = time.time()
svc_score = svc_model.score(X_feat, y_class)
print ("Runtime: %0.2f seconds" % (time.time() - start))
print ("Baseline Score: %0.3f" % (svc_score))

Runtime: 21.50 seconds
Baseline Score: 0.579


# Iterate and improve

In [16]:
start = time.time()
svc_iter_model = svm.SVC(C = 100)
print ("Runtime: %0.2f seconds" % (time.time() - start))

Runtime: 0.00 seconds


In [17]:
# add nutritional information, imputing nulls with median
X_nut = df[list(set(list(df.columns)) - set(['rating', 'title']))]
X_nut.drop(columns = low_means, inplace = True)
X_nut = X_nut.fillna(X_nut.median())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [18]:
start = time.time()
svc_iter_cross_val_scores = cross_val_score(svc_iter_model, X_nut, y_class, cv=5)
print ("Runtime: %0.2f seconds" % (time.time() - start))
print ("Mean Accuracy: %0.3f (+/- %0.3f)" % (svc_iter_cross_val_scores.mean(), svc_iter_cross_val_scores.std()))

Runtime: 338.94 seconds
Mean Accuracy: 0.603 (+/- 0.007)
