In [6]:
import numpy as np
import scipy as sp
import nltk
import pandas as pd
import json

In [7]:
#Load the dataset
with open('../train.json', 'rU') as train_data_file:
    train_data = json.load(train_data_file)

In [8]:
train_data[0]['ingredients']

[u'romaine lettuce',
 u'black olives',
 u'grape tomatoes',
 u'garlic',
 u'pepper',
 u'purple onion',
 u'seasoning',
 u'garbanzo beans',
 u'feta cheese crumbles']

In [9]:
#Break train_data into features which are ingredients, labels which are cuisine and id.
train_words = []
train_labels = []
train_ids = []
for data in train_data:
    train_words.append(';'.join(data['ingredients']))
    train_labels.append(data['cuisine'])
    train_ids.append(data['id'])

In [10]:
train_words[1]

u'plain flour;ground pepper;salt;tomatoes;ground black pepper;thyme;eggs;green tomatoes;yellow corn meal;milk;vegetable oil'

In [11]:
from nltk.corpus import stopwords
import re
import string
#Function to clean the data
def clean_ingredients(raw_text):
    
    raw_text = filter(lambda x: x in string.printable, raw_text)
    
    #remove non letters
    letters_only = re.sub("[^a-zA-Z\;]"," ", raw_text)
    
    words = letters_only.lower().split(";")
    
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if not w in stops]
    
    clean_text =  ";".join(meaningful_words)
    return ' '.join(clean_text.split())
    
    

In [12]:
clean_ingredients("(    oz.) tomato sauce;(   oz.) tomato paste;1% low-fat buttermilk;a\xe7ai powder")

'oz tomato sauce; oz tomato paste; low fat buttermilk;aai powder'

In [13]:
clean_train_words = []
for line in train_words:
    clean_train_words.append(clean_ingredients(line))

In [14]:
custom_tokenize = lambda x: x.split(';')
custom_tokenize(clean_train_words[1])

[u'plain flour',
 u'ground pepper',
 u'salt',
 u'tomatoes',
 u'ground black pepper',
 u'thyme',
 u'eggs',
 u'green tomatoes',
 u'yellow corn meal',
 u'milk',
 u'vegetable oil']

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(tokenizer=custom_tokenize)

In [31]:
train_X = vec.fit_transform(clean_train_words)

In [32]:
train_X = train_X.toarray()

In [33]:
train_X.shape

(39774, 6696)

In [19]:
vocab = vec.get_feature_names()
vocab

[u' lean ground beef',
 u' less sodium chicken broth',
 u' less sodium ham',
 u' less sodium taco seasoning',
 u' less sodium taco seasoning mix',
 u' low fat buttermilk',
 u' low fat chocolate milk',
 u' low fat cottage cheese',
 u' low fat milk',
 u' lowfat greek yogurt',
 u' oz diced tomatoes',
 u' oz frozen chopped spinach',
 u' oz frozen chopped spinach thawed and squeezed dry',
 u' oz refried beans',
 u' oz sweetened condensed milk',
 u' oz tomato paste',
 u' oz tomato sauce',
 u' reduced fat milk',
 u' to lb chicken cut into serving pieces',
 u' up',
 u'aai',
 u'aai powder',
 u'abalone',
 u'abura age',
 u'accent',
 u'accent seasoning',
 u'achiote',
 u'achiote paste',
 u'achiote powder',
 u'acini di pepe',
 u'ackee',
 u'acorn squash',
 u'active dry yeast',
 u'adobo',
 u'adobo sauce',
 u'adobo seasoning',
 u'adzuki beans',
 u'agar',
 u'agave nectar',
 u'aged balsamic vinegar',
 u'aged cheddar cheese',
 u'aged manchego cheese',
 u'ahi',
 u'ahi tuna steaks',
 u'aioli',
 u'ajinomoto'

In [34]:
set_labels = set(train_labels)
reverse_class_map = {w : i for w,i in enumerate(set_labels)}
class_map = {w : i for i, w in enumerate(set_labels)}

In [35]:
train_y = [class_map[c] for c in train_labels]

In [36]:
train_y[:10]

[9, 17, 3, 10, 10, 11, 13, 19, 1, 19]

In [37]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss="hinge", penalty='l2', alpha=1e-3, n_iter=10, random_state=42)
clf.fit(train_X, train_y)

SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False)

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(train_X, train_y)

In [38]:
with open('../test.json') as test_data_file:
    test_data = json.load(test_data_file)

test_words = []
test_ids = []
for data in test_data:
    test_words.append(';'.join(data['ingredients']))
    test_ids.append(data['id'])

In [39]:
test_X = vec.transform(test_words)

In [40]:
test_X.shape

(9944, 6696)

In [41]:
test_y = clf.predict(test_X)

In [42]:
test_y_cuisine = [reverse_class_map[s] for s in test_y]

In [43]:
output = pd.DataFrame( data={"id":test_ids, "cuisine":test_y_cuisine} )
output.to_csv( "SGDClassifier1.csv", index=False )