In [1]:
#!/usr/local/bin/python
import os
import json
import numpy as np
import random
#from sklearn.grid_search import GridSearchCV
#import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.feature_extraction.text import *
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import SelectPercentile,f_classif
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
import nltk
import string #use the punctuation
import re
from nltk.corpus import stopwords
import csv
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True) #Use Snowball stemmer

In [2]:
# List of stop words and adjectives that cause overfitting 
cachedStopWords = stopwords.words("english") # from nltk
adj_words = ['refrigerated','fresh','freshly']

In [3]:
# Open the Training Data file and divide into training and dev set
train_file = os.path.join(".","train.json")

with open(train_file) as data_file:
    train_data = json.loads(data_file.read())

random.shuffle(train_data)

n = 340000 #number of partitions, want all the training data so chose very high n... lazy approach I know
dev_test_label = [d["cuisine"] for d in train_data[:len(train_data)/n]]
dev_train_label = [d["cuisine"] for d in train_data[len(train_data)/n:]]

# Preliminary Text Pre-Processing , rest is done in tokenize() of TF-IDF
# data should be a list of strings
# for tokenizer to token phrases instead of words, each line in an ingredient should be followed by a comma
dev_test_data = []
for doc in train_data[:len(train_data)/n]:
    # collect all the words for that recipe
    ingredient_phrase_list = []
    for ingredient_phrase in doc["ingredients"]: #this is a line/row
        #lower case
        ingr_phrase = ingredient_phrase.lower()
        # strip digits from the word phrase
        ingr_phrase = ' '.join([word for word in ingr_phrase.split() if not word.isdigit()])
        # remove stop words from the phrase
        ingr_phrase = ' '.join([word for word in ingr_phrase.split() if word not in cachedStopWords])
        # remove other words from the phrase
        ingr_phrase = ' '.join([word for word in ingr_phrase.split() if word not in adj_words])
        # Append the ingredient phrase to join by ,
        ingredient_phrase_list.append(ingr_phrase)
     
    #join each list item with ,
    dev_test_data.append(','.join(ingredient_phrase_list))
    
dev_train_data = []
for doc in train_data[len(train_data)/n:]:
    # collect all the words for that recipe
    ingredient_phrase_list = []
    for ingredient_phrase in doc["ingredients"]:
        #lower case
        ingr_phrase = ingredient_phrase.lower()
        # strip digits from the word phrase
        ingr_phrase = ' '.join([word for word in ingr_phrase.split() if not word.isdigit()])
        # remove stop words from the phrase
        ingr_phrase = ' '.join([word for word in ingr_phrase.split() if word not in cachedStopWords])
        # remove other words from the phrase
        ingr_phrase = ' '.join([word for word in ingr_phrase.split() if word not in adj_words])
        # Append the ingredient phrase to join by ,
        ingredient_phrase_list.append(ingr_phrase)
     
    #join each list item with ,
    dev_train_data.append(','.join(ingredient_phrase_list))

    
print "Train data shape: ",np.shape(dev_train_data)
print "Dev_test data shape: ",np.shape(dev_test_data)

Train data shape:  (39774,)
Dev_test data shape:  (0,)


Using all the Train Data and will do Cross validation on random samples of this training data.. you will see below

In [4]:
# Prepare Test Data in same way as Train data, except no partitioning
# Open the Test Data file and divide into training and dev set
test_file = os.path.join(".","test.json")

with open(test_file) as data_file:
    testdata = json.loads(data_file.read())

test_data = []
test_ids = []
for doc in testdata:
    # collect the id for each test_data
    test_ids.append(doc["id"])
    # collect all the words for that recipe
    ingredient_phrase_list = []
    for ingredient_phrase in doc["ingredients"]: #this is a line/row
        #lower case
        ingr_phrase = ingredient_phrase.lower()
        # strip digits from the word phrase
        ingr_phrase = ' '.join([word for word in ingr_phrase.split() if not word.isdigit()])
        # remove stop words from the phrase
        ingr_phrase = ' '.join([word for word in ingr_phrase.split() if word not in cachedStopWords])
        # remove other words from the phrase
        ingr_phrase = ' '.join([word for word in ingr_phrase.split() if word not in adj_words])
        # Append the ingredient phrase to join by ,
        ingredient_phrase_list.append(ingr_phrase)
     
    #join each list item with ,
    test_data.append(','.join(ingredient_phrase_list))

print "Test data shape: ",np.shape(test_data)

Test data shape:  (9944,)


In [48]:
#units = ['and','or','warm','large','ground','whole', 'mince', 'shred','grate', 'slice','kosher']
units = ['oz','and','or'] # after stemming remove these stop words
#Change compound words after stemming to a simple word for standardization, this seems to reduce my score :(
replacements = {
                #'all purpos flour': 'flour',
                #'extra virgin':'',
                #'oliv oil': 'oliveoil',
                }
#The TFIDF vector tokenizes each string by comma
def tokenize(text):
    #print "text: ", text
    final_tokens = []
    tokens = [ingredient for ingredient in text.split(',')]
    # dont include empty tokens
    for phrase in tokens:  
        cleaned_phrase = phrase 
        if "hidden valley" in cleaned_phrase:
               cleaned_phrase= "hiddenvalley"

        # Replace unicode chars with letter c
        cleaned_phrase = re.sub(r'[^\x00-\x7F]+','c', cleaned_phrase)
        # Remove anything that is not alphabet
        cleaned_phrase = re.sub('[^a-z]+', ' ', cleaned_phrase) 
        # Stem each word
        cleaned_phrase = (" ").join([stemmer.stem(word) for word in cleaned_phrase.split()])
        #cleaned_phrase = (" ").join([WordNetLemmatizer().lemmatize(word) for word in cleaned_phrase.split()])
        # Remove any units from words in the phrase
        cleaned_phrase = (" ").join([word for word in cleaned_phrase.split() if word not in units])
        # remove any unneccary white space that is in front of the phrase
        cleaned_phrase = " ".join(cleaned_phrase.split())
        #print cleaned_phrase
        
        for key, value in replacements.items(): 
            cleaned_phrase = cleaned_phrase.replace(key,value)
           
        # flatten phrase
        for word in cleaned_phrase.split():
            #Replace words with correct spelling if spelt with alternative way
            word= word.replace("yoghurt","yogurt")     
            
            if len(word) >1: #dont count single letter words
                final_tokens.append(word)
    
    return final_tokens


tokenize('grates sliced cheese, baking-powder milk,   extra-  virgin olive    oil,')

[u'grate',
 u'slice',
 u'chees',
 u'bake',
 u'powder',
 u'milk',
 u'extra',
 u'virgin',
 u'oliv',
 u'oil']

In [42]:
# Fit and transform all the train data- these are tested params from grid search
vec = TfidfVectorizer(tokenizer=tokenize, max_df=.50, min_df=1, lowercase=False,ngram_range=(1,2), binary=True,
                     sublinear_tf=True)

train = vec.fit_transform(dev_train_data)
size =len(vec.get_feature_names())
print "Number of features: ", size

Number of features:  74715


In [54]:
# Feature Selection using chi square- find and keep p% best correlation between words(features)
ch2 = SelectPercentile(chi2, percentile=95) #use 95% of those features
X_train_features = ch2.fit_transform(train, dev_train_label)

In [55]:
# Fit into One vs Rest model( creates a model for each class and compares the confidence score
# among the models for each data. Classifier choses is LinearSVM model
svm= OneVsRestClassifier(LinearSVC(C=.50,)) # C is the regularization param, the large C, the smaller margin of error of the decision boundary line

In [56]:
# Calculate Accuracy on random samples of the all the training data:
from sklearn.cross_validation import cross_val_score

scores = cross_val_score(svm, X_train_features, dev_train_label) # this creates 3 folds and fits the X_train after chi
print np.mean(scores)
print scores

0.785714639663
[ 0.78255297  0.78867185  0.78591911]


In [52]:
# REAL TEST SET that is used to score in Kaggle submission
svm.fit(X_train_features, dev_train_label) #first need to fit the svm model to all train data, pipeline doesnt work
real_test = vec.transform(test_data)
X_test_real = ch2.transform(real_test) 
real_predicted = svm.predict(X_test_real)

In [53]:
# Write TEST DATA to csv for submission
header = ['id','cuisine']
with open('saru_test_submission.csv', 'w') as fp:
    writer = csv.writer(fp, delimiter=',')
    # First write the header
    writer.writerow(header)
    for test_index in range(len(test_data)):
        writer.writerow([test_ids[test_index], real_predicted[test_index]])

In [57]:
# Print out vocab and if you want to output vocab file change is_print to 1
vocab= list(np.asarray(vec.get_feature_names())[ch2.get_support()])
print vocab[80:100]
is_print =0
if is_print == 1:
    f = open('vocab-saru.txt', 'w')
    for item in vocab:
        f.write("%s\n" % item)

[u'adobo corn', u'adobo dri', u'adobo egg', u'adobo extra', u'adobo garlic', u'adobo granul', u'adobo grate', u'adobo green', u'adobo ground', u'adobo jack', u'adobo jalapeno', u'adobo ketchup', u'adobo kidney', u'adobo knorr', u'adobo kosher', u'adobo larg', u'adobo lime', u'adobo masa', u'adobo mayonais', u'adobo oliv']
