In [1]:
import json  # For importing json files
import numpy as np
import re # For regular expression filtering of categories
import matplotlib.pyplot as plt
import nltk

# SK-learn libraries for feature extraction from text.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

from sklearn.feature_extraction.text import *
from sklearn.feature_extraction.text import TfidfTransformer  # If we wanted to use TfIdf...probably not necessary though
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
# Seed NumPy's global random generator so that the np.random.permutation
# shuffle of the training records is the same on every run.
np.random.seed(0)

In [3]:
# Load the training records (train.json) and the test records (test.json).
# Each file is opened in a `with` block so its handle is closed as soon as
# the JSON has been parsed, even if parsing raises.
with open('train.json') as trainfh:
    train_list = json.load(fp=trainfh)
with open('test.json') as testfh:
    test_list = json.load(fp=testfh)

In [4]:
# Extract the training features (ingredients) for each id
XIngredients=np.asarray([train_row['ingredients'] for train_row in train_list])
print XIngredients[1:3]
print "Number of training records: %d"  %(XIngredients.shape)
# Note that 3 dictionaries are Extracted per line:  'cuisine', 'id', and 'ingredients'
print 'Training data keys:  %s' %train_list[0].keys()
# Extract training labels (type of cuisine) for each id
YCuisine=np.asarray([train_row['cuisine'] for train_row in train_list])
# Extract unique Cuisine categories from the train_list 
CuisineSet = set()
for i in range(YCuisine.shape[0]):
    CuisineSet.add(YCuisine[i])
# Transform CuisineCategories to a dictionary to convert cuisine labels to numeric values
CuisineList = [Cuisine for Cuisine in CuisineSet]
CuisineDict = {CuisineList[i]:i for i in range(len(CuisineList))}
num_classes = len(CuisineDict.keys())
print "Number of distinct cuisines is %d" %len(CuisineDict.keys())


# Shuffle training data and labels; 
shuffle = np.random.permutation(np.arange(XIngredients.shape[0]))
XIngredients, YCuisine = XIngredients[shuffle], YCuisine[shuffle]
# Convert YCuisine (text list) to numeric values based on CuisineDict
YNum = [CuisineDict[Key] for Key in YCuisine]

# Function to decide if a word is plural or not
# (the WordNet lemmatizer is created once here and shared by isplural below)
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

def isplural(word):
    """Report whether `word` differs from its WordNet noun lemma.

    Returns a `(plural, lemma)` tuple: `lemma` is `wnl.lemmatize(word, 'n')`
    and `plural` is True when the text of the lemma differs from `word`.
    """
    lemma = wnl.lemmatize(word, 'n')
    # Compare by value. `is not` tests object identity, so a lemma with the
    # same text held in a different string object would be reported as plural.
    plural = word != lemma
    return plural, lemma

# Modifier words that CamelizeRecords2 blanks out of ingredient names
# (compared against the lower-cased word).
adjs = ['cooking', 'large', 'cracked']

# Turn each record (a list of ingredient strings, each possibly several words long)
# into one text field of space-separated CamelCase ingredient tokens.
def CamelizeRecords(Records):
    """Return a NumPy array holding one CamelCase token string per record.

    Every word of an ingredient is capitalized and the words are glued together
    without separators (e.g. 'plain flour' -> 'PlainFlour'); the ingredients of
    a record are then joined with single spaces.
    """
    camelized = []
    for ingredients in Records:
        tokens = [
            ''.join(part.capitalize() for part in ingredient.split())
            for ingredient in ingredients
        ]
        camelized.append(' '.join(tokens))
    return np.asarray(camelized)


# Variant of the CamelCase conversion that first normalises every word: the word is
# replaced by the lemma returned by isplural(), and words listed in `adjs` are blanked.
def CamelizeRecords2(Records):
    """Return a NumPy array holding one CamelCase token string per record.

    Each word of an ingredient is replaced by `isplural(word)[1]`; if that
    lemma, lower-cased, is one of `adjs` it becomes the empty string. The
    surviving words are capitalized and concatenated into one token per
    ingredient, and the tokens of a record are joined with single spaces.
    """
    def _normalise(word):
        # Change plural to singular, then blank out unwanted modifiers
        lemma = isplural(word)[1]
        if lemma.lower() in adjs:
            lemma = ''
        return lemma.capitalize()

    camelized = []
    for ingredients in Records:
        tokens = [
            ''.join(_normalise(part) for part in ingredient.split())
            for ingredient in ingredients
        ]
        camelized.append(' '.join(tokens))
    return np.asarray(camelized)

# Here's another idea -- split multi-word feature into multiple single-word features
def Fission(Records, Limit=5):
    """Split every ingredient into lower-case single-word tokens.

    Words made up only of digits and '%' characters, and words matching the
    pattern `.+ed$`, are skipped. Every remaining word is truncated to at most
    `Limit` characters.

    Records: iterable of records, each an iterable of ingredient strings.
    Limit:   maximum number of characters kept per word (default 5).
    Returns: NumPy array with one space-separated token string per record.
    """
    newX = [] # New record array
    for IngredientList in Records:
        IngredientFissionList = []
        for Element in IngredientList:
            for Word in Element.split():
                Word = Word.lower()
                # Skip pure numbers/percentages and words ending in 'ed'.
                # NOTE(review): `.+ed$` also matches words such as 'red' or 'seed' --
                # confirm that dropping them is intended.
                if re.match(r'^[\d\%]+$', Word) or re.match(r'.+ed$', Word):
                    continue
                # Slicing leaves words of Limit characters or fewer unchanged
                IngredientFissionList.append(Word[:Limit])
        newX.append(' '.join(IngredientFissionList))
    return np.asarray(newX)


# Reformat training data into CamelCase
TrainX = CamelizeRecords2(XIngredients)

# Separate out training data/labels into 33000 training and 6774 "hold-out" dev data/labels
train_data, train_classes = TrainX[:33000], YNum[:33000]
dev_data, dev_classes = TrainX[33001:], YNum[33001:]
print train_data.shape
print train_data[1]
print CuisineList[YNum[1]]
 
# Now try out Fission() for producing lower-feature number
TrainXFission = Fission(XIngredients)
# Once again Separate out training data/labels into 33000 training and 6774 "hold-out" dev data/labels
train_data2 = TrainXFission[:33000]
dev_data2 = TrainXFission[33001:]    
    

# Create features, labels for test_data
XTestIngredients = np.asarray([test_row['ingredients'] for test_row in test_list])
#print "Number of test records: %d" %XTestIngredients.shape
# Note that test data has no 'cuisine' (no labels...)
#print  'Test data keys:  %s' %test_list[0].keys()
# Convert test data to CamelCase text strings as was done for training data above
test_data = CamelizeRecords(XTestIngredients)

[ [u'plain flour', u'ground pepper', u'salt', u'tomatoes', u'ground black pepper', u'thyme', u'eggs', u'green tomatoes', u'yellow corn meal', u'milk', u'vegetable oil']
 [u'eggs', u'pepper', u'salt', u'mayonaise', u'cooking oil', u'green chilies', u'grilled chicken breasts', u'garlic powder', u'yellow onion', u'soy sauce', u'butter', u'chicken livers']]
Number of training records: 39774
Training data keys:  [u'cuisine', u'id', u'ingredients']
Number of distinct cuisines is 20
(33000,)
ParsleySprig Radish SeaSalt Pozole ChickenStock WhiteOnion Tomatillo GarlicClove CanolaOil BonelessPorkShoulder Pork ShreddedCabbage RomaineLettuceLeaf DriedOregano SerranoChilies Lime Epazote GreenPumpkinSeed
mexican


In [5]:
# Naive Bayes - Default - With CamelCasing
CountVec = CountVectorizer()
FitTransformTrain = CountVec.fit_transform(train_data)
# Try Multinomial Naive Bayes first
ClfMNB = MultinomialNB().fit(FitTransformTrain,train_classes)
# Predict Cuisine in the dev data set
#   Transform the dev data using the feature extractor from the train_data
TransformedDev = CountVec.transform(dev_data)
predicted = ClfMNB.predict(TransformedDev)
# Calculate accuracy of predictions on dev data
print "Multinomial Naive Bayes shows a prediction accuracy of %f" %(np.mean(predicted == dev_classes))

Multinomial Naive Bayes shows a prediction accuracy of 0.721837


In [6]:
#Naive Bayes- Different alpha - With CamelCasing
alpha = [.001,.01,1,2]
for a in alpha:
    ClfMNBalpha = MultinomialNB(alpha=a).fit(FitTransformTrain,train_classes)
    # Predict Cuisine in the dev data set
    # Transform the dev data using the feature extractor from the train_data
    predictedalpha = ClfMNBalpha.predict(TransformedDev)

    print 'With alpha =',a,' prediction accuracy of %f' %(np.mean(predictedalpha == dev_classes))

With alpha = 0.001  prediction accuracy of 0.727300
With alpha = 0.01  prediction accuracy of 0.738521
With alpha = 1  prediction accuracy of 0.721837
With alpha = 2  prediction accuracy of 0.689798


In [7]:
# Trying Different ,max_df
max_dfs = [.05,.07,.1,.2,.25,.3,.35,.4]
maxDF=0
highest_score=0
for i in max_dfs:
    CountVec = CountVectorizer(max_df=i)
    FitTransformTrain = CountVec.fit_transform(train_data)
    #print 'with max_df = ', i,' the number of word in vocab is ',FitTransformTrain.shape[1]
    # Try Multinomial Naive Bayes first
    ClfMNB = MultinomialNB(alpha=.01).fit(FitTransformTrain,train_classes,)
    # Predict Cuisine in the dev data set
    #   Transform the dev data using the feature extractor from the train_data
    TransformedDev = CountVec.transform(dev_data)
    predicted = ClfMNB.predict(TransformedDev)
    # Calculate accuracy of predictions on dev data
    #print "Multinomial Naive Bayes shows a prediction accuracy of %f" %(np.mean(predicted == dev_classes))
    if (np.mean(predicted == dev_classes))>highest_score:
        maxDF=i
        highest_score=(np.mean(predicted == dev_classes))
print 'highest score is',highest_score
print 'using max_df of', maxDF


highest score is 0.741030562528
using max_df of 0.1


In [8]:
#Changing the priors. - Naive Bayes with CamelCase
CountVec = CountVectorizer(max_df=maxDF)
FitTransformTrain = CountVec.fit_transform(train_data)
TransformedDev = CountVec.transform(dev_data)

#Trying different normalization factors. The larger the factor is, the closer the priors get to to flat distribution
norm_factor= [100,1000,1500,2000,2500,3000]
maxNorm=0
highest_score=0
for norm in norm_factor:
    class_priors= (ClfMNB.class_count_+norm)/sum(ClfMNB.class_count_+norm)
    ClfMNB_prior = MultinomialNB( class_prior = class_priors).fit(FitTransformTrain,train_classes)
    predicted_prior = ClfMNB_prior.predict(TransformedDev)
    print 'Multinomial Naive Bayes with prior norm = ', norm,'is', (np.mean(predicted_prior == dev_classes))
    if (np.mean(predicted == dev_classes))>highest_score:
        maxNorm=norm
        highest_score=(np.mean(predicted == dev_classes))
print 'highest score is',highest_score
print 'max Norm is',maxNorm
class_priors = (ClfMNB.class_count_+maxNorm)/sum(ClfMNB.class_count_+maxNorm)


Multinomial Naive Bayes with prior norm =  100 is 0.732467148974
Multinomial Naive Bayes with prior norm =  1000 is 0.738077661302
Multinomial Naive Bayes with prior norm =  1500 is 0.739111176731
Multinomial Naive Bayes with prior norm =  2000 is 0.737634726118
Multinomial Naive Bayes with prior norm =  2500 is 0.737044145873
Multinomial Naive Bayes with prior norm =  3000 is 0.737044145873
highest score is 0.73896353167
max Norm is 100


In [9]:
#trying TfidVectorizer
CountVecT = TfidfVectorizer(max_df=maxDF)
FitTransformTrainT = CountVecT.fit_transform(train_data)
ClfMNBT = MultinomialNB(alpha=.01).fit(FitTransformTrainT,train_classes)
# Predict Cuisine in the dev data set
# Transform the dev data using the feature extractor from the train_data
TransformedDevT = CountVecT.transform(dev_data)
predictedT = ClfMNBT.predict(TransformedDevT)
# Calculate accuracy of predictions on dev data
print "Multinomial Naive Bayes with tf-idf shows a prediction accuracy of %f" %(np.mean(predictedT == dev_classes))

Multinomial Naive Bayes with tf-idf shows a prediction accuracy of 0.742950


In [10]:
#Logistic Regression with L2 
LR = LogisticRegression()
LR.fit(FitTransformTrainT,train_classes)
predictedLR = LR.predict(TransformedDevT)
predictedLR_Train = LR.predict(FitTransformTrain)
print "Logistic Regression shows a prediction accuracy of %f" %(np.mean(predictedLR == dev_classes))
print "Logistic Regression on training data shows a prediction accuracy of %f" %(np.mean(predictedLR_Train == train_classes))

Logistic Regression shows a prediction accuracy of 0.754762
Logistic Regression on training data shows a prediction accuracy of 0.799424


In [11]:
#trying l1 - 
LR = LogisticRegression(penalty='l1')
LR.fit(FitTransformTrainT,train_classes)
predictedLR = LR.predict(TransformedDevT)

print "Logistic Regression with l1 shows a prediction accuracy of %f" %(np.mean(predictedLR == dev_classes))


Logistic Regression with l1 shows a prediction accuracy of 0.755795


In [None]:
Cvalues=[1.0,2.0,3.0,4.0,5.0,6.0]
for c in Cvalues:
    LR = LogisticRegression(penalty='l1',C=c)
    LR.fit(FitTransformTrainT,train_classes)
    predictedLR = LR.predict(TransformedDevT)
    print 'c =',c
    print "Logistic Regression shows a prediction accuracy of %f" %(np.mean(predictedLR == dev_classes))

c = 1.0
Logistic Regression shows a prediction accuracy of 0.755795
c = 2.0
Logistic Regression shows a prediction accuracy of 0.768935
c =

In [None]:
solvers=("newton-cg", "lbfgs")
for s in solvers:
    LR = LogisticRegression(penalty='l1',C=4.0,solver=s,multi_class='multinomial')
    LR.fit(FitTransformTrainT,train_classes)
    predictedLR = LR.predict(TransformedDevT)
    print 's=',s
    print "Logistic Regression with solver =",s,"shows a prediction accuracy of %f" %(np.mean(predictedLR == dev_classes))

In [None]:
#Trying SVM
sv=SVC()
sv.fit(FitTransformTrainT,train_classes)
predictedSV=sv.predict(TransformedDevT)
print "SVM shows a prediction accuracy of %f" %(np.mean(predictedSV == dev_classes))


In [None]:
#Understanding the Data

In [None]:
#How well does this model actualy predict the training data?
predictedTrain= LR.predict(FitTransformTrainT)
print classification_report(dev_classes,predictedLR)
print classification_report(train_classes,predictedTrain)
# the acuracy is capped at 89%

In [None]:
#top 4 ingredients
coef= LR.coef_
print coef.shape
for i,line in enumerate (coef):
    #for j,word in enumerate (line):
        #print 'for word ',CountVec.get_feature_names()[j] ,' the coef is ', word
    print 'for cuisine: ',CuisineList[i]
    topI = np.argsort(line)[-4:]
    for word in topI:
        print CountVecT.get_feature_names()[word] 
        #print coef[i,word]
    print '--------------------'

In [None]:
#variance of coeficient LR
LR = LogisticRegression(penalty='l1',C=4.0)
LR.fit(FitTransformTrainT,train_classes)
coefLR=LR.coef_
print coefLR.shape
NumWords = 10;
count=0
varsLR  = np.zeros(coefLR.shape[1])
for i in range(coefLR.shape[1]):
    varsLR[i]= np.var(coefLR[:,i])
    if np.var(coefLR[:,i]==0): count+=1
LowVarLR= np.argsort(varsLR)[1:NumWords+1]
HighVarLR=np.argsort(varsLR)[-NumWords:]

#plt.hist (varsLR,20)
#plt.show()
for word in LowVarLR:
    print CountVec.get_feature_names()[word] 
print'------------'
for word in HighVarLR:
    print CountVec.get_feature_names()[word] 

print '------------'


In [None]:
predicted_prob = LR.predict_proba(TransformedDevT)
threshold = .5
error_counter_above=0
error_counter=0
counter=0
counter_above=0
for i,label in enumerate(dev_classes):
    counter+=1
    if np.sort(predicted_prob[i])[-1:]>threshold:
        counter_above+=1
    if label!=predictedLR[i]:
        error_counter+=1
        ##print 'projected: ',predictedLR[i], ' actual:', label
        ##print np.argsort(predicted_prob[i])[-4:]
        ##print np.sort(predicted_prob[i])[-4:]
        if np.sort(predicted_prob[i])[-1:]>threshold:
            error_counter_above+=1
print 'selecting a threshold of ',threshold,' will select ', error_counter-error_counter_above,' errors out of',error_counter
print 'selecting a threshold of ',threshold,' will select ', counter-counter_above-(error_counter-error_counter_above),' accurate out of',counter-error_counter



In [None]:
#secondary analysis
predictedLR2=np.zeros(predictedLR.shape)
for i,label in enumerate(predictedLR):
    if np.sort(predicted_prob[i])[-1:]<threshold: #we only want to look at datadata with a score below a certain threshold to 
        top4C = np.argsort(predicted_prob[i])[-4:] #select the classes which got the top 4 probabilities
        #the indices for the training data which indicate training example with any of the 4 classes
        indices = np.asarray(np.where((train_classes==top4C[0]) | (train_classes==top4C[1]) | (train_classes==top4C[2]) | (train_classes==top4C[3])))
        train_data2=[]
        train_classes2=[]
        #new training data set and new training label set
        for j,index in enumerate (indices[0,:]):
            train_data2.append(train_data[index])
            train_classes2.append(train_classes[index])
        #new vectorizer, and LR classifier
        CountVec2 = CountVectorizer(max_df=.1) #new vectorizer
        FitTransform = CountVec2.fit_transform(train_data2)
        LR2 = LogisticRegression().fit(FitTransform,train_classes2)
        TransformedDev2 = CountVec2.transform(dev_data)
        #new prediction
        predicted2 = LR2.predict(TransformedDev2)
        #add the new prediction to the main predicition list 
        predictedLR2[i]= predicted2[i]
    else:predictedLR2[i]=predictedLR[i] #if above thrshold use the existing predicted class
print "Logistic Regression with added stage shows a prediction accuracy of %f" %(np.mean(predictedLR2 == dev_classes))