In [36]:
import numpy as np
import csv
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from scipy import interp
from  matplotlib import pyplot as plt
from sklearn import cross_validation
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn import metrics, preprocessing
import pickle
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

%matplotlib inline

In [2]:
##Read the data
df = pd.read_json("~/Documents/AML/HW2/train.json")

In [3]:
df.head(5)

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


### b)

In [6]:
## The number of recipes per cuisine genre are represented below by aggregating the recipes  per cuisine
numCuisineInst = df.groupby(['cuisine']).agg(['count'])
numCuisineInst 

Unnamed: 0_level_0,id,ingredients
Unnamed: 0_level_1,count,count
cuisine,Unnamed: 1_level_2,Unnamed: 2_level_2
brazilian,467,467
british,804,804
cajun_creole,1546,1546
chinese,2673,2673
filipino,755,755
french,2646,2646
greek,1175,1175
indian,3003,3003
irish,667,667
italian,7838,7838


In [9]:
## The number of categories is 20
len(numCuisineInst)

20

In [10]:
### Here we find that the number of rows is 39,774
ingredients_list = df['ingredients'].as_matrix()
len(ingredients_list)

39774

In [12]:
### Here we loop through every ingredient, and create a dictionary 
## of the ingredients to get only the unique ingredients
ingredientDict = dict()
for item in ingredients_list:
    for ingredient in item:
        if ingredient in ingredientDict:
            ingredientDict[ingredient] +=1
        else:
            ingredientDict[ingredient] =1
len(ingredientDict)

6714

The Dataset consists of 39,774 rows of data, with 20 different categories of cuisines. However, we find that there is quite a bit of redundancy in the data, with only 6714 differnet unique ingredients. If we examine the ingredients even further, we see that the ingredients suffer from casing and phrasing issues. For example, there might be 'Romaine' and 'romaine', and phrases like "Kraft Zesty Italian Dressing." So, go back and clean the ingredients so as to break apart any phrases into component words. 'Romaine Lettuce' becomes 'romaine', 'lettuce.' We also convert everything to lowercase, and drop any words that are 'a', 'the', 'of.' We then create a new dictionary from this and we find the number of unique ingredients to be about half as large at 3186

In [13]:
def clean_data(matrix):
    """ A function that accepts the matrix of ingredients as input. It loops through, splits any phrases into component
        parts, and adds these to a new array. Also added to this array is the ingredient converted to lower case. 
        Anything that has 'a', 'the', 'and', 'of' is dropped. Then, we add this array to a new matrix. This returns
        a matrix of lowercase, split, ingredients.
    """
    new_list = []
    for array in matrix:
        row = []
        for ingr in array:
            if ' ' in ingr:
#                 print(ingr)
                new_ingr_split = ingr.split()
#                 print(new_ingr_split)
                for item in new_ingr_split:
                    if item != 'the' and ingr != 'of' and ingr != 'and' and ingr != 'a':
                        row.append(item.lower())
            else:
                row.append(ingr.lower())
        new_list.append(row)
    return new_list

In [14]:
new_matrix = clean_data(ingredients_list)

In [15]:
## We Create a new dictionary from this cleaned data
ingr_Dict = dict()
for item in new_matrix:
    for ingredient in item:
        if ingredient in ingr_Dict:
            ingr_Dict[ingredient] +=1
        else:
            ingr_Dict[ingredient] =1

In [16]:
## the Number of unique ingredients in our cleaned data set
len(ingr_Dict)

3186

## C)

Next, we need to one hot encode the ingredients. To do this, we first convert all of the ingredients to numerical representations. Then we represent each of these numbers a binary vector.

In [17]:
## first, we create a dictionary of each item in the dataset, and map each ingredient as a key to a numerical value.
factorize_dict = dict()
k=0
for item in new_matrix:
    for ingredient in item:
        if ingredient not in factorize_dict:
            k+=1
            factorize_dict[ingredient] =k

In [19]:
#The lengths of the dictionary with frequency counts and numerical encoudings are the same
len(factorize_dict)

3186

In [20]:
## Now we go through the entire  matrix of ingredients, and replace each ingredient with the numerical value 
## In the dictionary we created above
factorized_list = []
for array in new_matrix:
    row = []
    for x in array:
        num = factorize_dict[x]
        row.append(num)
    factorized_list.append(row)

In [21]:
factorized_consolidated_list = list(factorize_dict.values())

In [22]:
## next we create an empty numpy matrix of zeroes. We will use this later to set specific index locations to 1
blank_data = np.empty((len(factorized_list), len(factorized_consolidated_list)+1))

In [23]:
##Now we loop through the entire factorized ingredient matrix represented with numerical values
## and set that index location in the corresponding row to 1
s = 0
while s < len(factorized_list):
    for item in factorized_list[s]:
        blank_data[s][item] = 1
    s+=1

In [25]:
##We also need to factorize the cuisines. Thankfully, pandas has a built in function
df['cuisine'] = pd.factorize(df['cuisine'])[0]

## D

In [26]:
##Now, we set our X = to the one encoded matrix of ingredients,
## and Y Values = to the cuisine labels represented as numbers
X= blank_data
Y = df.iloc[:,0].values

In [28]:
## We make sure the x and y shapes are the compatible
print ("X shape is: ", X.shape, "Y shape is" , Y.shape, )

X shape is:  (39774, 3187) Y shape is (39774,)


In [29]:
##make sure there are no nan values
np.any(np.isnan(X)), np.any(np.isnan(Y))

(False, False)

In [30]:
## Next we check that our one hot encoding worked, and that we have 1 values in our empty matrix
sum = 0
for item in X:
    for num in item: 
        sum = sum + num

In [31]:
print(sum)

752205.0


## E)

In [37]:
##Gaussian Naive Bayes
gnb = GaussianNB()
scores_gnb = cross_validation.cross_val_score(gnb, X, Y, cv=3)
scores_gnb.mean()

0.26502153877819484

In [38]:
##Bernouli Naive Bayes
bnb = BernoulliNB()
scores_bnb = cross_validation.cross_val_score(bnb, X, Y, cv=3)
scores_bnb.mean()

0.71317996873139122

If we look at naive Bayes, there is a drastic difference in performance. Bernoulli NB almost performs as well as Logistic Regression with a perfomance of .71. Gaussian on the other hand performs with a very poor rate of .26. A possible explanation is that the Bernoulli distribution more closely matches the Yummly data set, and Gaussian distriutions do not do a particularly good job of modeling the data. Additionally, Bernoulli Naive Bayes by its very nature is designed to only deal with binary values. Since our data is one hot encoded, it is enginerred more for Bernoulli then Gaussian. 

## F

In [40]:
scores = cross_validation.cross_val_score(LogisticRegression(), X, Y, cv=3)
scores.mean()

0.78199308839376502

We see as expected that with 3 Fold Cross Validation, Logistic Regression performs the best with projected perfomance of .78.