In [1]:
import json  # For importing json files
import numpy as np
import re # For regular expression filtering of categories
import matplotlib.pyplot as plt

# SK-learn libraries for feature extraction from text.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import *
from sklearn.feature_extraction.text import TfidfTransformer  # If we wanted to use TfIdf...probably not necessary though
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
# Load the training and test recipe records from their JSON files.
# The `with` blocks close each file as soon as it has been parsed; the
# original left both handles open for the lifetime of the kernel.
# trainfh / testfh stay bound afterwards but refer to closed file objects.
with open('train.json') as trainfh:
    train_list = json.load(fp=trainfh)
with open('test.json') as testfh:
    test_list = json.load(fp=testfh)

In [3]:
# Peek at four raw training records (indices 1 through 4) to see their structure
print train_list[1:5]

[{u'cuisine': u'southern_us', u'id': 25693, u'ingredients': [u'plain flour', u'ground pepper', u'salt', u'tomatoes', u'ground black pepper', u'thyme', u'eggs', u'green tomatoes', u'yellow corn meal', u'milk', u'vegetable oil']}, {u'cuisine': u'filipino', u'id': 20130, u'ingredients': [u'eggs', u'pepper', u'salt', u'mayonaise', u'cooking oil', u'green chilies', u'grilled chicken breasts', u'garlic powder', u'yellow onion', u'soy sauce', u'butter', u'chicken livers']}, {u'cuisine': u'indian', u'id': 22213, u'ingredients': [u'water', u'vegetable oil', u'wheat', u'salt']}, {u'cuisine': u'indian', u'id': 13162, u'ingredients': [u'black pepper', u'shallots', u'cornflour', u'cayenne pepper', u'onions', u'garlic paste', u'milk', u'butter', u'salt', u'lemon juice', u'water', u'chili powder', u'passata', u'oil', u'ground cumin', u'boneless chicken skinless thigh', u'garam masala', u'double cream', u'natural yogurt', u'bay leaf']}]


In [4]:
# Extract the training features (ingredients) for each id
XIngredients=np.asarray([train_row['ingredients'] for train_row in train_list])
print XIngredients[1:3]
print "Number of training records: %d"  %(XIngredients.shape)
# Note that 3 dictionaries are Extracted per line:  'cuisine', 'id', and 'ingredients'
print 'Training data keys:  %s' %train_list[0].keys()
# Extract training labels (type of cuisine) for each id
YCuisine=np.asarray([train_row['cuisine'] for train_row in train_list])
# Extract unique Cuisine categories from the train_list 
CuisineSet = set()
for i in range(YCuisine.shape[0]):
    CuisineSet.add(YCuisine[i])
# Transform CuisineCategories to a dictionary to convert cuisine labels to numeric values
CuisineList = [Cuisine for Cuisine in CuisineSet]
CuisineDict = {CuisineList[i]:i for i in range(len(CuisineList))}
num_classes = len(CuisineDict.keys())
print "Number of distinct cuisines is %d" %len(CuisineDict.keys())




# Shuffle training data and labels; 
shuffle = np.random.permutation(np.arange(XIngredients.shape[0]))
XIngredients, YCuisine = XIngredients[shuffle], YCuisine[shuffle]
# Convert YCuisine (text list) to numeric values based on CuisineDict
YNum = [CuisineDict[Key] for Key in YCuisine]


# Function to convert records of lists of lower-case ingredient elements into records of space-separated CamelCase ingredients
def CamelizeRecords(Records):
    """Turn each record's ingredient list into one string of CamelCase tokens.

    Every ingredient is split on whitespace, each word is passed through
    str.capitalize(), and the words are glued together without a separator,
    so 'plain flour' becomes 'PlainFlour'. The ingredients of one record are
    then joined with single spaces.

    Records -- iterable of records, each an iterable of ingredient strings.
    Returns a NumPy array holding one space-separated string per record.
    """
    def _camelize(ingredient):
        # Collapse a (possibly multi-word) ingredient into a single CamelCase word
        return ''.join(piece.capitalize() for piece in ingredient.split())

    camelized_records = [
        ' '.join(_camelize(ingredient) for ingredient in record)
        for record in Records
    ]
    return np.asarray(camelized_records)

# Reformat training data into CamelCase
TrainX = CamelizeRecords(XIngredients)

# Separate out training data/labels into 33000 training and 6774 "hold-out" dev data/labels
train_data, train_classes = TrainX[:33000], YNum[:33000]
dev_data, dev_classes = TrainX[33001:], YNum[33001:]

print train_data[1]
print CuisineList[YNum[1]]
 
    
    

# Create features, labels for test_data
XTestIngredients = np.asarray([test_row['ingredients'] for test_row in test_list])
print "Number of test records: %d" %XTestIngredients.shape
# Note that test data has no 'cuisine' (no labels...)
print  'Test data keys:  %s' %test_list[0].keys()
# Convert test data to CamelCase text strings as was done for training data above
test_data = CamelizeRecords(XTestIngredients)

[ [u'plain flour', u'ground pepper', u'salt', u'tomatoes', u'ground black pepper', u'thyme', u'eggs', u'green tomatoes', u'yellow corn meal', u'milk', u'vegetable oil']
 [u'eggs', u'pepper', u'salt', u'mayonaise', u'cooking oil', u'green chilies', u'grilled chicken breasts', u'garlic powder', u'yellow onion', u'soy sauce', u'butter', u'chicken livers']]
Number of training records: 39774
Training data keys:  [u'cuisine', u'id', u'ingredients']
Number of distinct cuisines is 20
WhitePepper WhippingCream Nutmeg Flour FrozenCorn Sugar Butter CayennePepper Milk Salt
british
Number of test records: 9944
Test data keys:  [u'id', u'ingredients']


In [5]:
# Create a VectorizedFeature Object
max_dfs = [.05,.07,.1,.2,.25,.3,.35,.4]
for i in max_dfs:
    CountVec = CountVectorizer(max_df=i)
    FitTransformTrain = CountVec.fit_transform(train_data)
    print 'witn max_df = ', i,' the number of word in vocab is ',FitTransformTrain.shape[1]
    print FitTransformTrain[1:10]
    # Try Multinomial Naive Bayes first
    ClfMNB = MultinomialNB().fit(FitTransformTrain,train_classes)
    # Predict Cuisine in the dev data set
    #   Transform the dev data using the feature extractor from the train_data
    TransformedDev = CountVec.transform(dev_data)
    predicted = ClfMNB.predict(TransformedDev)
    # Calculate accuracy of predictions on dev data
    print "Multinomial Naive Bayes shows a prediction accuracy of %f" %(np.mean(predicted == dev_classes))


witn max_df =  0.05  the number of word in vocab is  6434
  (0, 6241)	1
  (0, 6200)	1
  (0, 3983)	1
  (0, 2228)	1
  (0, 2411)	1
  (0, 923)	1
  (1, 1772)	1
  (1, 5510)	1
  (1, 3510)	1
  (1, 2082)	1
  (1, 1164)	1
  (2, 1389)	1
  (2, 78)	1
  (2, 5195)	1
  (2, 4666)	1
  (2, 2873)	1
  (3, 3414)	1
  (3, 6045)	1
  (3, 5850)	1
  (3, 276)	1
  (3, 2336)	1
  (4, 1763)	1
  (4, 5219)	1
  (4, 5001)	1
  (4, 217)	1
  :	:
  (7, 3042)	1
  (7, 1944)	1
  (7, 4554)	1
  (7, 1838)	1
  (7, 6103)	1
  (7, 4045)	1
  (7, 2172)	1
  (7, 275)	1
  (7, 2756)	1
  (7, 2757)	1
  (7, 6282)	1
  (7, 5125)	1
  (7, 5655)	1
  (8, 1274)	1
  (8, 5153)	1
  (8, 4234)	1
  (8, 3331)	1
  (8, 1985)	1
  (8, 5143)	1
  (8, 2339)	1
  (8, 3119)	1
  (8, 5800)	1
  (8, 2327)	1
  (8, 6172)	1
  (8, 1007)	1
Multinomial Naive Bayes shows a prediction accuracy of 0.745017
witn max_df =  0.07  the number of word in vocab is  6441
  (0, 6248)	1
  (0, 6207)	1
  (0, 3988)	1
  (0, 2231)	1
  (0, 2414)	1
  (0, 924)	1
  (0, 3769)	1
  (1, 1774)	1
  (1, 551

In [6]:
CountVec = CountVectorizer(max_df=.1)
FitTransformTrain = CountVec.fit_transform(train_data)
class_priors=(np.ones(num_classes))/num_classes
ClfMNB= MultinomialNB().fit(FitTransformTrain,train_classes)
TransformedDev = CountVec.transform(dev_data)
predicted = ClfMNB.predict(TransformedDev)
print "Multinomial Naive Bayes shows a prediction accuracy of %f" %(np.mean(predicted == dev_classes))

norm_factor= [100,1000,1500,2000,2500,3000]
for norm in norm_factor:
    class_priors= (ClfMNB.class_count_+norm)/sum(ClfMNB.class_count_+norm)
    ClfMNB_prior = MultinomialNB(class_prior = class_priors).fit(FitTransformTrain,train_classes)
    predicted_prior = ClfMNB_prior.predict(TransformedDev)
    print 'Multinomial Naive Bayes with prior norm = ', norm,'is', (np.mean(predicted_prior == dev_classes))


    

Multinomial Naive Bayes shows a prediction accuracy of 0.746050
Multinomial Naive Bayes with prior norm =  100 is 0.747969880408
Multinomial Naive Bayes with prior norm =  1000 is 0.752399232246
Multinomial Naive Bayes with prior norm =  1500 is 0.752842167429
Multinomial Naive Bayes with prior norm =  2000 is 0.753580392736
Multinomial Naive Bayes with prior norm =  2500 is 0.753875682858
Multinomial Naive Bayes with prior norm =  3000 is 0.753432747675


In [7]:
#top 3 ingredients
coef= ClfMNB.coef_
print coef.shape
for i,line in enumerate (coef):
    print line.shape
    #for j,word in enumerate (line):
        #print 'for word ',CountVec.get_feature_names()[j] ,' the coef is ', word
    print 'for cuisine: ',CuisineList[i]
    topI = np.argsort(line)[-4:]
    for word in topI:
        print CountVec.get_feature_names()[word] 
    print '--------------------'

(20, 6428)
(6428,)
for cuisine:  irish
milk
bakingpowder
potatoes
bakingsoda
--------------------
(6428,)
for cuisine:  mexican
sourcream
jalapenochilies
chilipowder
groundcumin
--------------------
(6428,)
for cuisine:  chinese
greenonions
sesameoil
cornstarch
soysauce
--------------------
(6428,)
for cuisine:  filipino
cookingoil
carrots
oil
soysauce
--------------------
(6428,)
for cuisine:  vietnamese
soysauce
shallots
carrots
fishsauce
--------------------
(6428,)
for cuisine:  moroccan
carrots
groundginger
groundcinnamon
groundcumin
--------------------
(6428,)
for cuisine:  brazilian
tomatoes
sweetenedcondensedmilk
cachaca
lime
--------------------
(6428,)
for cuisine:  japanese
ricevinegar
sake
mirin
soysauce
--------------------
(6428,)
for cuisine:  british
bakingpowder
unsaltedbutter
eggs
milk
--------------------
(6428,)
for cuisine:  greek
extra
virginoliveoil
fetacheesecrumbles
driedoregano
--------------------
(6428,)
for cuisine:  indian
groundcumin
cuminseed
groundturm

In [8]:
alpha = [1,2,3,4,5]
for a in alpha:
    ClfMNBalpha = MultinomialNB(alpha=a).fit(FitTransformTrain,train_classes)
    # Predict Cuisine in the dev data set
    # Transform the dev data using the feature extractor from the train_data
    predictedalpha = ClfMNBalpha.predict(TransformedDev)

    print 'With alpha =',a,' prediction accuracy of %f' %(np.mean(predictedalpha == dev_classes))

With alpha = 1  prediction accuracy of 0.749299
With alpha = 2  prediction accuracy of 0.720656
With alpha = 3  prediction accuracy of 0.699985
With alpha = 4  prediction accuracy of 0.683449
With alpha = 5  prediction accuracy of 0.669127


In [196]:
#trying TfidVectorizer
CountVecT = TfidfVectorizer()
FitTransformTrainT = CountVecT.fit_transform(train_data)
print FitTransformTrainT.shape
print FitTransformTrainT[:,1]
print type(FitTransformTrainT[:,1:3])
# Try Multinomial Naive Bayes first
ClfMNBT = MultinomialNB(class_prior = class_priors).fit(FitTransformTrainT,train_classes)
# Predict Cuisine in the dev data set
#   Transform the dev data using the feature extractor from the train_data
TransformedDevT = CountVecT.transform(dev_data)
predictedT = ClfMNBT.predict(TransformedDevT)
# Calculate accuracy of predictions on dev data
print "Multinomial Naive Bayes shows a prediction accuracy of %f" %(np.mean(predictedT == dev_classes))

(33000, 6473)
  (8363, 0)	0.383313153129
  (11949, 0)	0.472338254388
<class 'scipy.sparse.csr.csr_matrix'>
Multinomial Naive Bayes shows a prediction accuracy of 0.710616


In [164]:
cm = confusion_matrix(dev_classes,predicted)
print cm
for i in range(num_classes):
    print cm[i]
    sum_of_cuisine= np.sum(cm[i])
    for j in range (num_classes):
        print ' the pct of ', CuisineList[i],'prediced as ', CuisineList[j],' is ',"%.2f"%(int(cm[i,j])/float(sum_of_cuisine))
        

[[  23    0    0    1    0    0    0    0    2    2    2    0   20    1
     0    1    1   54    1    9]
 [   1  983    4    1    0    1    0    1    1    2    9    0    7    5
     0    6    2   44    0   17]
 [   0    1  454    1    4    0    0    8    4    1    3    0    3    0
     0    1   13    9    1    3]
 [   0    7   32   53    0    0    1    0    1    0    1    0    0    1
     0    4    6   14    1    5]
 [   0    3   22    4   55    0    2    3    0    0    1    0    0    0
     0    0   50    1    1    1]
 [   0    3    0    0    1  105    0    0    1    1   18    0    4    2
     0    2    0    2    0    9]
 [   0   14    0    1    0    0   35    0    1    0    4    0    4    0
     1    2    4   12    0    7]
 [   1    2   42    0    1    2    0  154    0    1   24    0    5    0
     1    1    6    9    3    5]
 [   4    0    0    2    0    0    1    0   42    2    2    0   24    0
     0    1    1   54    0    8]
 [   0    3    0    1    0    4    1    0    0  111    

In [182]:

# Distribution of the numeric cuisine labels in the dev data.
fig, ax = plt.subplots()
# One bin per class id (derived from num_classes instead of a hard-coded 20),
# with edges on the half-integers so every bar is centred on its label.
ax.hist(dev_classes, bins=np.arange(num_classes + 1) - 0.5)
ax.set_xticks(range(num_classes))
ax.set_xlabel('cuisine label (index into CuisineList)')
ax.set_ylabel('number of dev records')
ax.set_title('Dev data class distribution')
plt.show()

In [166]:
# Show the per-class counts stored on the fitted classifier (its class_count_ attribute)
print ClfMNB.class_count_

[  650.  5453.  2267.   729.   782.   773.   482.  1266.   763.  1079.
  2595.   538.  2315.   939.   500.  1382.  1373.  3692.   792.  6630.]


In [146]:
print classification_report(dev_classes,predicted)
print classification_report(dev_classes,predicted_prior)

             precision    recall  f1-score   support

          0       0.77      0.20      0.31       117
          1       0.89      0.91      0.90      1084
          2       0.72      0.90      0.80       506
          3       0.76      0.42      0.54       126
          4       0.82      0.38      0.52       143
          5       0.89      0.71      0.79       148
          6       0.83      0.41      0.55        85
          7       0.88      0.60      0.71       257
          8       0.55      0.30      0.39       141
          9       0.83      0.57      0.67       196
         10       0.83      0.93      0.88       508
         11       0.97      0.42      0.59        88
         12       0.52      0.58      0.55       431
         13       0.64      0.41      0.50       150
         14       0.87      0.30      0.45        89
         15       0.67      0.74      0.71       264
         16       0.68      0.82      0.74       266
         17       0.57      0.80      0.67   

In [167]:
predicted_prob = ClfMNB_prior.predict_proba(TransformedDev)
print prd

[  2.80905644e-13   2.28603792e-12   9.91944510e-01   3.66048987e-07
   2.41263283e-05   2.63344292e-13   2.78841419e-14   7.57051045e-03
   2.05474277e-13   3.04380992e-13   3.10675157e-11   8.78044284e-11
   1.00099154e-13   2.71275744e-13   2.60634216e-14   1.31316329e-12
   1.60607521e-04   1.04464742e-12   2.99879502e-04   7.68476909e-14]


In [181]:
threshold = .5
error_counter_above=0
error_counter=0
counter=0
counter_above=0
for i,label in enumerate(dev_classes):
    counter+=1
    if np.sort(predicted_prob[i])[-1:]>threshold:
        counter_above+=1
    if dev_classes[i]!=predicted_prior[i]:
        error_counter+=1
        print 'projected: ',predicted_prior[i], ' actual:', label
        print np.argsort(predicted_prob[i])[-4:]
        print  np.sort(predicted_prob[i])[-4:]
        if np.sort(predicted_prob[i])[-1:]>threshold:
            error_counter_above+=1
print error_counter_above/float(error_counter)
print error_counter
print counter_above/float(counter)
print counter
        


projected:  17  actual: 7
[ 7 12  8 17]
[ 0.00475175  0.01361999  0.01401146  0.96179465]
projected:  17  actual: 1
[ 5 12  8 17]
[  1.51594510e-04   1.66219590e-02   3.69448868e-02   9.46076570e-01]
projected:  17  actual: 18
[10  7 11 17]
[ 0.04614633  0.05610072  0.15020024  0.45301291]
projected:  4  actual: 16
[ 7 16  2  4]
[ 0.00400314  0.02128965  0.15809081  0.8164027 ]
projected:  15  actual: 12
[17 13 12 15]
[ 0.01148036  0.02145913  0.39677501  0.5489871 ]
projected:  15  actual: 17
[ 6  1 17 15]
[  4.90201308e-09   2.33229764e-06   9.91572508e-03   9.90081935e-01]
projected:  19  actual: 12
[ 7 15 12 19]
[ 0.12559869  0.16601526  0.22397453  0.27761368]
projected:  17  actual: 12
[19 12  8 17]
[ 0.00281368  0.10004053  0.14075872  0.75382382]
projected:  1  actual: 12
[12 17  0  1]
[ 0.04188332  0.0736894   0.11407935  0.72057294]
projected:  19  actual: 12
[ 9 17 12 19]
[ 0.07301994  0.0979371   0.29595777  0.3981    ]
projected:  8  actual: 6
[19 12 17  8]
[ 0.05691834  0