In [230]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

In [231]:
df = pd.read_csv('menu.csv')

In [232]:
df.head()

Unnamed: 0,Category,Item,Serving Size,Calories,Calories from Fat,Total Fat,Total Fat (% Daily Value),Saturated Fat,Saturated Fat (% Daily Value),Trans Fat,...,Carbohydrates,Carbohydrates (% Daily Value),Dietary Fiber,Dietary Fiber (% Daily Value),Sugars,Protein,Vitamin A (% Daily Value),Vitamin C (% Daily Value),Calcium (% Daily Value),Iron (% Daily Value)
0,Breakfast,Egg McMuffin,4.8 oz (136 g),300,120,13.0,20,5.0,25,0.0,...,31,10,4,17,3,17,10,0,25,15
1,Breakfast,Egg White Delight,4.8 oz (135 g),250,70,8.0,12,3.0,15,0.0,...,30,10,4,17,3,18,6,0,25,8
2,Breakfast,Sausage McMuffin,3.9 oz (111 g),370,200,23.0,35,8.0,42,0.0,...,29,10,4,17,2,14,8,0,25,10
3,Breakfast,Sausage McMuffin with Egg,5.7 oz (161 g),450,250,28.0,43,10.0,52,0.0,...,30,10,4,17,2,21,15,0,30,15
4,Breakfast,Sausage McMuffin with Egg Whites,5.7 oz (161 g),400,210,23.0,35,8.0,42,0.0,...,30,10,4,17,2,21,6,0,25,10


In [233]:
## Category will be the taget variable
df['Category'].unique()

array(['Breakfast', 'Beef & Pork', 'Chicken & Fish', 'Salads',
       'Snacks & Sides', 'Desserts', 'Beverages', 'Coffee & Tea',
       'Smoothies & Shakes'], dtype=object)

In [234]:
## Tokenize the Items

In [235]:
countv = CountVectorizer(stop_words='english')

In [236]:
tokenized_words = countv.fit_transform(df['Item'])

In [237]:
len(countv.get_feature_names()) # 143

143

In [238]:
tokenized_words = pd.DataFrame(tokenized_words.toarray())

In [239]:
len(df)

260

In [240]:
len(tokenized_words)

260

In [241]:
df = pd.concat([df, tokenized_words], axis=1)

In [242]:
df.head(100)

Unnamed: 0,Category,Item,Serving Size,Calories,Calories from Fat,Total Fat,Total Fat (% Daily Value),Saturated Fat,Saturated Fat (% Daily Value),Trans Fat,...,133,134,135,136,137,138,139,140,141,142
0,Breakfast,Egg McMuffin,4.8 oz (136 g),300,120,13.0,20,5.0,25,0.0,...,0,0,0,0,0,0,0,0,0,0
1,Breakfast,Egg White Delight,4.8 oz (135 g),250,70,8.0,12,3.0,15,0.0,...,0,0,0,0,0,0,1,0,0,0
2,Breakfast,Sausage McMuffin,3.9 oz (111 g),370,200,23.0,35,8.0,42,0.0,...,0,0,0,0,0,0,0,0,0,0
3,Breakfast,Sausage McMuffin with Egg,5.7 oz (161 g),450,250,28.0,43,10.0,52,0.0,...,0,0,0,0,0,0,0,0,0,0
4,Breakfast,Sausage McMuffin with Egg Whites,5.7 oz (161 g),400,210,23.0,35,8.0,42,0.0,...,0,0,0,0,0,0,0,1,0,0
5,Breakfast,Steak & Egg McMuffin,6.5 oz (185 g),430,210,23.0,36,9.0,46,1.0,...,0,0,0,0,0,0,0,0,0,0
6,Breakfast,"Bacon, Egg & Cheese Biscuit (Regular Biscuit)",5.3 oz (150 g),460,230,26.0,40,13.0,65,0.0,...,0,0,0,0,0,0,0,0,0,0
7,Breakfast,"Bacon, Egg & Cheese Biscuit (Large Biscuit)",5.8 oz (164 g),520,270,30.0,47,14.0,68,0.0,...,0,0,0,0,0,0,0,0,0,0
8,Breakfast,"Bacon, Egg & Cheese Biscuit with Egg Whites (R...",5.4 oz (153 g),410,180,20.0,32,11.0,56,0.0,...,0,0,0,0,0,0,0,1,0,0
9,Breakfast,"Bacon, Egg & Cheese Biscuit with Egg Whites (L...",5.9 oz (167 g),470,220,25.0,38,12.0,59,0.0,...,0,0,0,0,0,0,0,1,0,0


In [243]:
df.drop('Item', inplace=True, axis=1)

In [244]:
df['Serving Size'][78].split(' ')

['2.3', 'oz', '(65', 'g)']

In [245]:
df['Serving Size'].tail()

255    10.1 oz (285 g)
256    13.4 oz (381 g)
257     6.7 oz (190 g)
258    14.2 oz (403 g)
259     7.1 oz (202 g)
Name: Serving Size, dtype: object

In [246]:
df['Serving Size'][99].split()[2].split('(')[1]

'38'

In [247]:
def in_grams(value):
    if  'g' in value:
        return (value.split()[2].split('(')[1])
    else:
        return (value.split()[0])
        

In [248]:
in_grams('7.1 oz (202 g)')

'202'

In [249]:
df['Serving Size'] = df['Serving Size'].apply(in_grams)

In [250]:
sc = StandardScaler()

In [251]:
df['Calories'] = sc.fit_transform(df[['Calories']])
df['Serving Size'] = sc.fit_transform(df[['Serving Size']])

In [252]:
df['Calories'].mean()

5.657867337032048e-18

In [253]:
for i in df.columns[3:23]:
    df[i] = sc.fit_transform(df[[i]])

In [254]:
lencoder = LabelEncoder()

In [255]:
len(df['Category'].unique())

9

In [256]:
df['Category'] = lencoder.fit_transform(df['Category'])

In [257]:
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

In [258]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 25)

In [259]:
rfc = RandomForestClassifier(n_estimators=1000)

In [260]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [261]:
rfc_y_pred = rfc.predict(X_test)

In [262]:
confusion_matrix(y_test, rfc_y_pred)

array([[ 0,  0,  1,  0,  0,  0,  0],
       [ 0,  5,  0,  0,  0,  0,  0],
       [ 0,  0,  7,  0,  0,  0,  0],
       [ 0,  0,  0,  4,  0,  0,  0],
       [ 0,  0,  0,  0, 14,  0,  0],
       [ 0,  0,  0,  0,  0,  7,  0],
       [ 0,  0,  0,  0,  0,  0,  1]], dtype=int64)

In [263]:
accuracy_score(y_test, rfc_y_pred)

0.97435897435897434

In [264]:
cross_val_score(rfc, X_train, y_train, cv = 10).mean()

0.93648131997674322

In [266]:
gbc = GradientBoostingClassifier()

In [267]:
gbc.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [268]:
cross_val_score(gbc, X_train, y_train, cv = 10).mean()

0.94488300601000819

In [269]:
confusion_matrix(y_test, gbc.predict(X_test))

array([[ 0,  0,  1,  0,  0,  0,  0],
       [ 0,  5,  0,  0,  0,  0,  0],
       [ 0,  0,  7,  0,  0,  0,  0],
       [ 0,  0,  1,  3,  0,  0,  0],
       [ 0,  0,  0,  0, 14,  0,  0],
       [ 0,  0,  0,  0,  0,  7,  0],
       [ 0,  0,  0,  0,  0,  0,  1]], dtype=int64)

#### ends here!