In [1]:
import json  # For importing json files
import numpy as np
import re # For regular expression filtering of categories
import matplotlib.pyplot as plt

# SK-learn libraries for feature extraction from text.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import *
from sklearn.feature_extraction.text import TfidfTransformer  # If we wanted to use TfIdf...probably not necessary though
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import nltk



In [2]:
# Load the training and test recipe records from train.json / test.json.
# Each `with` block closes its file handle as soon as the JSON has been parsed.
with open('train.json') as trainfh:
    train_list = json.load(trainfh)
with open('test.json') as testfh:
    test_list = json.load(testfh)

In [3]:
# Preview four training records; the slice starts at index 1, so record 0 is not shown.
print train_list[1:5]

[{u'cuisine': u'southern_us', u'id': 25693, u'ingredients': [u'plain flour', u'ground pepper', u'salt', u'tomatoes', u'ground black pepper', u'thyme', u'eggs', u'green tomatoes', u'yellow corn meal', u'milk', u'vegetable oil']}, {u'cuisine': u'filipino', u'id': 20130, u'ingredients': [u'eggs', u'pepper', u'salt', u'mayonaise', u'cooking oil', u'green chilies', u'grilled chicken breasts', u'garlic powder', u'yellow onion', u'soy sauce', u'butter', u'chicken livers']}, {u'cuisine': u'indian', u'id': 22213, u'ingredients': [u'water', u'vegetable oil', u'wheat', u'salt']}, {u'cuisine': u'indian', u'id': 13162, u'ingredients': [u'black pepper', u'shallots', u'cornflour', u'cayenne pepper', u'onions', u'garlic paste', u'milk', u'butter', u'salt', u'lemon juice', u'water', u'chili powder', u'passata', u'oil', u'ground cumin', u'boneless chicken skinless thigh', u'garam masala', u'double cream', u'natural yogurt', u'bay leaf']}]


In [4]:
# Extract the training features (ingredients) for each id
XIngredients=np.asarray([train_row['ingredients'] for train_row in train_list])
print XIngredients[1:3]
print "Number of training records: %d"  %(XIngredients.shape)
# Note that 3 dictionaries are Extracted per line:  'cuisine', 'id', and 'ingredients'
print 'Training data keys:  %s' %train_list[0].keys()
# Extract training labels (type of cuisine) for each id
YCuisine=np.asarray([train_row['cuisine'] for train_row in train_list])
# Extract unique Cuisine categories from the train_list 
CuisineSet = set()
for i in range(YCuisine.shape[0]):
    CuisineSet.add(YCuisine[i])
# Transform CuisineCategories to a dictionary to convert cuisine labels to numeric values
CuisineList = [Cuisine for Cuisine in CuisineSet]
CuisineDict = {CuisineList[i]:i for i in range(len(CuisineList))}
num_classes = len(CuisineDict.keys())
print "Number of distinct cuisines is %d" %len(CuisineDict.keys())




# Shuffle training data and labels; 
shuffle = np.random.permutation(np.arange(XIngredients.shape[0]))
XIngredients, YCuisine = XIngredients[shuffle], YCuisine[shuffle]
# Convert YCuisine (text list) to numeric values based on CuisineDict
YNum = [CuisineDict[Key] for Key in YCuisine]


[ [u'plain flour', u'ground pepper', u'salt', u'tomatoes', u'ground black pepper', u'thyme', u'eggs', u'green tomatoes', u'yellow corn meal', u'milk', u'vegetable oil']
 [u'eggs', u'pepper', u'salt', u'mayonaise', u'cooking oil', u'green chilies', u'grilled chicken breasts', u'garlic powder', u'yellow onion', u'soy sauce', u'butter', u'chicken livers']]
Number of training records: 39774
Training data keys:  [u'cuisine', u'id', u'ingredients']
Number of distinct cuisines is 20


In [5]:
# Function to decide if a word is plural or not
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
def isplural(word):
    """Lemmatize `word` as a noun and report whether that changed it.

    Returns a tuple ``(plural, lemma)`` where ``lemma`` is the result of
    ``wnl.lemmatize(word, 'n')`` and ``plural`` is True when the lemma
    differs from ``word``.
    """
    lemma = wnl.lemmatize(word, 'n')
    # Compare by value, not identity: an equal lemma held in a different
    # string object must not be reported as a plural.
    return word != lemma, lemma

In [6]:
isplural("beans")

(True, u'bean')

In [7]:
# Descriptor words that are dropped from ingredient names.
# Broader candidate list that was considered:
#adjs = ['plain', 'ground', 'cooking', 'fresh', 'large', 'crushed','shredded','roasted','cracked']
adjs = ['large'] # do not exclude "ground","kosher"


def CamelizeRecords(Records):
    """Turn records of ingredient lists into one CamelCase text string per record.

    Every word of an ingredient is replaced by the lemma returned from
    `isplural`, blanked out if its lower-case form is listed in `adjs`, and
    capitalized. The words of one ingredient are concatenated without
    separators and the ingredients of one record are joined with single
    spaces. Returns a numpy array holding one string per record.
    """
    def _camelize(ingredient):
        # Collapse a (possibly multi-word) ingredient into a single CamelCase token
        pieces = []
        for token in ingredient.split():
            singular = isplural(token)[1]  # singular form of the word
            if singular.lower() in adjs:
                singular = ''
            pieces.append(singular.capitalize())
        return ''.join(pieces)

    return np.asarray([' '.join(_camelize(ingredient) for ingredient in record)
                       for record in Records])

In [8]:

# Reformat training data into CamelCase
TrainX = CamelizeRecords(XIngredients)

# Separate out training data/labels into 33000 training and 6774 "hold-out" dev data/labels
train_data, train_classes = TrainX[:33000], YNum[:33000]

dev_data, dev_classes = TrainX[33001:], YNum[33001:]

print train_data[1]
print CuisineList[YNum[1]]
 

# Create features, labels for test_data
XTestIngredients = np.asarray([test_row['ingredients'] for test_row in test_list])
print "Number of test records: %d" %XTestIngredients.shape
# Note that test data has no 'cuisine' (no labels...)
print  'Test data keys:  %s' %test_list[0].keys()
# Convert test data to CamelCase text strings as was done for training data above
test_data = CamelizeRecords(XTestIngredients)


NavelOrange Egg Sugar WholeMilk
french
Number of test records: 9944
Test data keys:  [u'id', u'ingredients']


In [9]:
# Create a VectorizedFeature Object
max_dfs = [.05,.07,.1,.2,.25,.3,.35,.4]
for i in max_dfs:
    CountVec = CountVectorizer(max_df=i)
    FitTransformTrain = CountVec.fit_transform(train_data)
    print 'witn max_df = ', i,' the number of word in vocab is ',FitTransformTrain.shape[1]
    print FitTransformTrain[1:10]
    # Try Multinomial Naive Bayes first
    ClfMNB = MultinomialNB().fit(FitTransformTrain,train_classes)
    # Predict Cuisine in the dev data set
    #   Transform the dev data using the feature extractor from the train_data
    TransformedDev = CountVec.transform(dev_data)
    predicted = ClfMNB.predict(TransformedDev)
    # Calculate accuracy of predictions on dev data
    print "Multinomial Naive Bayes shows a prediction accuracy of %f" %(np.mean(predicted == dev_classes))


witn max_df =  0.05  the number of word in vocab is  6408
  (0, 3875)	1
  (0, 6256)	1
  (1, 3068)	1
  (1, 5427)	1
  (1, 528)	1
  (1, 1900)	1
  (1, 1615)	1
  (2, 2674)	1
  (2, 2306)	1
  (2, 2183)	1
  (2, 1476)	1
  (2, 6066)	1
  (2, 5107)	1
  (2, 4665)	1
  (2, 1043)	1
  (2, 5124)	1
  (2, 5036)	1
  (3, 2675)	1
  (3, 4671)	1
  (3, 4593)	1
  (3, 2720)	1
  (3, 1203)	1
  (3, 2159)	1
  (3, 1264)	1
  (3, 2318)	1
  :	:
  (5, 4752)	1
  (5, 4109)	1
  (5, 2698)	1
  (5, 2334)	1
  (6, 3408)	1
  (6, 1210)	1
  (6, 2556)	1
  (6, 1614)	1
  (6, 6127)	1
  (7, 2616)	1
  (7, 2761)	1
  (7, 5127)	1
  (7, 1998)	1
  (7, 1949)	1
  (7, 2812)	2
  (8, 4105)	1
  (8, 4544)	1
  (8, 3681)	1
  (8, 294)	1
  (8, 415)	1
  (8, 4802)	1
  (8, 1942)	1
  (8, 4394)	1
  (8, 2519)	1
  (8, 4459)	1
Multinomial Naive Bayes shows a prediction accuracy of 0.737044
witn max_df =  0.07  the number of word in vocab is  6415
  (0, 3880)	1
  (0, 6263)	1
  (1, 3072)	1
  (1, 5432)	1
  (1, 1992)	1
  (1, 6127)	1
  (1, 529)	1
  (1, 1902)	1
  (1, 

In [10]:
CountVec = CountVectorizer(max_df=.1)
FitTransformTrain = CountVec.fit_transform(train_data)
class_priors=(np.ones(num_classes))/num_classes
ClfMNB= MultinomialNB().fit(FitTransformTrain,train_classes)
TransformedDev = CountVec.transform(dev_data)
predicted = ClfMNB.predict(TransformedDev)
print "Multinomial Naive Bayes shows a prediction accuracy of %f" %(np.mean(predicted == dev_classes))

norm_factor= [100,1000,1500,2000,2500,3000]
for norm in norm_factor:
    class_priors= (ClfMNB.class_count_+norm)/sum(ClfMNB.class_count_+norm)
    ClfMNB_prior = MultinomialNB(class_prior = class_priors).fit(FitTransformTrain,train_classes)
    predicted_prior = ClfMNB_prior.predict(TransformedDev)
    print 'Multinomial Naive Bayes with prior norm = ', norm,'is', (np.mean(predicted_prior == dev_classes))


    

Multinomial Naive Bayes shows a prediction accuracy of 0.738373
Multinomial Naive Bayes with prior norm =  100 is 0.740735272405
Multinomial Naive Bayes with prior norm =  1000 is 0.744426398937
Multinomial Naive Bayes with prior norm =  1500 is 0.745755204488
Multinomial Naive Bayes with prior norm =  2000 is 0.746345784734
Multinomial Naive Bayes with prior norm =  2500 is 0.746493429795
Multinomial Naive Bayes with prior norm =  3000 is 0.746788719917
