In [63]:
import json  # For importing json files
import numpy as np
import re # For regular expression filtering of categories
import matplotlib.pyplot as plt

# SK-learn libraries for feature extraction from text.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer  # If we wanted to use TfIdf...probably not necessary though
from sklearn.naive_bayes import MultinomialNB

In [3]:
# Open up a file-handle for the text json file train_data.json
trainfh = open("train.json")
train_list = json.load(fp=trainfh)
testfh = open("test.json")
test_list = json.load(fp=testfh)


In [58]:
# Extract the training features (ingredients) for each id
XIngredients=np.asarray([train_row['ingredients'] for train_row in train_list])
print "Number of training records: %d"  %(XIngredients.shape)
# Note that 3 dictionaries are Extracted per line:  'cuisine', 'id', and 'ingredients'
print 'Training data keys:  %s' %train_list[0].keys()
# Extract training labels (type of cuisine) for each id
YCuisine=np.asarray([train_row['cuisine'] for train_row in train_list])
# Extract unique Cuisine categories from the train_list 
CuisineSet = set()
for i in range(YCuisine.shape[0]):
    CuisineSet.add(YCuisine[i])
# Transform CuisineCategories to a dictionary to convert cuisine labels to numeric values
CuisineList = [Cuisine for Cuisine in CuisineSet]
CuisineDict = {CuisineList[i]:i for i in range(len(CuisineList))}
print "Number of distinct cuisines is %d" %len(CuisineDict.keys())




# Shuffle training data and labels; 
shuffle = np.random.permutation(np.arange(NewX.shape[0]))
XIngredients, YCuisine = XIngredients[shuffle], YCuisine[shuffle]
# Convert YCuisine (text list) to numeric values based on CuisineDict
YNum = [CuisineDict[Key] for Key in YCuisine]


# Function to convert records of lists of lower-case ingredient elements into records of space-separated CamelCase ingredients
def CamelizeRecords(Records):
    NewX = [] # New record array
    for IngredientList in Records:
        IngredientCamelList=[] # Will store CamelCase list of ingredients for this record
        for Element in IngredientList: # Element is a single string possibly multi-word ingredient within this record
            WordList = []
            for Word in Element.split():
                Word = Word.capitalize()
                WordList.append(Word)
            # Finally, collapse all words in ElementList
            CamelWord = ''.join(WordList) # join Element words into single CamelCase word
            IngredientCamelList.append(CamelWord) # Add CamelWord to this record ingredient list
        NewX.append(' '.join(IngredientCamelList)) # Append single text field of CamelCase ingredients as new record
    return np.asarray(NewX)

# Reformat training data into CamelCase
TrainX = CamelizeRecords(XIngredients)

# Separate out training data/labels into 33000 training and 6774 "hold-out" dev data/labels
train_data, train_classes = TrainX[:33000], YNum[:33000]
dev_data, dev_classes = TrainX[33001:], YNum[33001:]

print train_data[1]
print CuisineList[YNum[1]]
 
    
    

# Create features, labels for test_data
XTestIngredients = np.asarray([test_row['ingredients'] for test_row in test_list])
print "Number of test records: %d" %XTestIngredients.shape
# Note that test data has no 'cuisine' (no labels...)
print  'Test data keys:  %s' %test_list[0].keys()
# Convert test data to CamelCase text strings as was done for training data above
test_data = CamelizeRecords(XTestIngredients)


Number of training records: 39774
Training data keys:  [u'cuisine', u'id', u'ingredients']
Number of distinct cuisines is 20
SpringOnions SaltedPeanuts Coriander LimeJuice RiceNoodles ChiliSauce FishSauce MuscovadoSugar CayennePepper TigerPrawn Lime VegetableOil Beansprouts
thai
Number of test records: 9944
Test data keys:  [u'id', u'ingredients']


In [64]:
# Create a VectorizedFeature Object
CountVec = CountVectorizer()
FitTransformTrain = CountVec.fit_transform(train_data)
# Try Multinomial Naive Bayes first
ClfMNB = MultinomialNB().fit(FitTransformTrain,train_classes)
# Predict Cuisine in the dev data set
#   Transform the dev data using the feature extractor from the train_data
TransformedDev = CountVec.transform(dev_data)
predicted = ClfMNB.predict(TransformedDev)
# Calculate accuracy of predictions on dev data
print "Multinomial Naive Bayes shows a prediction accuracy of %f" %(np.mean(predicted == dev_classes))

Multinomial Naive Bayes shows a prediction accuracy of 0.737339
