In [None]:
import pandas as pd

# Load the training recipes, requesting an explicit dtype per column.
# "ingredients" is requested as str so the column can be passed to a text
# vectorizer further down.
recipes = pd.read_json(
    "train.json",
    orient="records",
    dtype={"cuisine": "str", "id": "int64", "ingredients": "str"},
)
recipes.head()

In [None]:
# Display the (rows, columns) dimensions of the loaded training frame.
recipes.shape

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

from nltk.tokenize.regexp import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.metrics import *

# In the visible cells this is only called by tokenize(), which the active
# vectorizer does not use.
def comma_split(s):
    """Split ``s`` on every comma and return the resulting pieces as a list."""
    pieces = s.split(",")
    return pieces

# Unigram + bigram count vectorizer. In the visible cells it is never fitted;
# it only serves to build the analyzer that tokenize() calls.
# The token pattern r'\b\w+\b' also keeps single-character tokens.
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
# Callable that turns one string into its list of tokens / n-grams.
analyzer = bigram_vectorizer.build_analyzer()

# Not used by the active vectorizer (only referenced from a commented-out alternative).
def tokenize(s):
    """Tokenize a comma-separated string into whole items plus their n-grams.

    The string is split on commas and each piece is stripped of surrounding
    whitespace. The result holds every stripped piece, followed by the tokens
    that the module-level ``analyzer`` produces for each piece.

    Parameters
    ----------
    s : str
        Comma-separated text.

    Returns
    -------
    list
        The stripped pieces, then the analyzer output for each piece in order.
    """
    # Build a real list: on Python 3 ``map`` yields a one-shot iterator, so
    # extending the result with it would leave nothing for the loop below and
    # the analyzer tokens would silently be dropped.
    elements = [element.strip() for element in comma_split(s)]
    all_elements = list(elements)
    for element in elements:
        all_elements += analyzer(element)
    return all_elements

class StemTokenizer(object):
    """Callable tokenizer that analyzes a document and Porter-stems each token.

    Instances are passed as the ``tokenizer`` argument of a scikit-learn text
    vectorizer.
    """

    def __init__(self):
        """Create the stemmer and the CountVectorizer-derived analyzer."""
        # Never read in the visible code; kept so the attribute stays available.
        self.reference_words = []
        self.stemmer = PorterStemmer()
        # Tokens are runs of two or more ASCII letters; the vectorizer is also
        # configured to strip accents and drop English stop words.
        self.vectorizer = CountVectorizer(
            min_df=1,
            token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z]+\b",
            strip_accents='ascii',
            stop_words='english',
        )
        self.analyzer = self.vectorizer.build_analyzer()

    def __call__(self, doc):
        """Return the list of stemmed tokens that the analyzer yields for ``doc``."""
        stems = []
        for token in self.analyzer(doc):
            stems.append(self.stemmer.stem(token))
        return stems


# Earlier vectorizer configurations, kept for reference:
#vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1,1), tokenizer=tokenize, stop_words='english', dtype=np.float32, max_features = 20000)
#vectorizer = TfidfVectorizer(min_df=1, analyzer='word', token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z]+\b", strip_accents='ascii', stop_words='english', dtype=np.float32, max_features = 2000)

# Active configuration: TF-IDF over stemmed tokens, limited to 2000 features.
vectorizer = TfidfVectorizer(
    min_df=1,
    tokenizer=StemTokenizer(),
    strip_accents='ascii',
    stop_words='english',
    dtype=np.float32,
    max_features=2000,
)
# Learn the vocabulary from the training ingredients and vectorize them in one step.
ingredients_features = vectorizer.fit_transform(recipes["ingredients"])
ingredients_features

In [None]:
# Preview the first 1000 vocabulary terms.
# get_feature_names() was removed in scikit-learn 1.2, so use
# get_feature_names_out() when the installed version provides it and fall back
# to the old method otherwise. list() keeps the displayed value a plain list.
list(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)[0:1000]

In [None]:
from pandas import DataFrame
# Dump the learned vocabulary to CSV for inspection.
# get_feature_names() was removed in scikit-learn 1.2, so use
# get_feature_names_out() when available and fall back on older versions.
# list() keeps the frame built from a plain list of terms, as before.
features_df = DataFrame(
    list(
        vectorizer.get_feature_names_out()
        if hasattr(vectorizer, "get_feature_names_out")
        else vectorizer.get_feature_names()
    )
)
features_df.to_csv("features_stemmed.csv")

In [None]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18, and
# sklearn.cross_validation was removed in 0.20. Try the current location first
# and fall back so old installations keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# Hold out 20% of the recipes for evaluation; the fixed seed keeps the split repeatable.
recipes_train_data, recipes_test_data, recipes_train_result, recipes_test_result = train_test_split(
    ingredients_features, recipes["cuisine"], test_size=0.2, random_state=42
)

In [None]:
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
# Seed the forest so that training, and therefore the reported accuracy and the
# exported predictions, are repeatable across runs. The same seed value is used
# for the train/test split.
clf = ExtraTreesClassifier(n_estimators=500, random_state=42)
# The sparse TF-IDF matrix is converted to a dense array before fitting.
clf = clf.fit(recipes_train_data.toarray(), recipes_train_result)

In [None]:
# sklearn.cross_validation was removed in scikit-learn 0.20, where importing it
# raises ImportError and aborts this cell. Keep the name bound by falling back
# to its successor module, sklearn.model_selection.
try:
    from sklearn import cross_validation
except ImportError:
    from sklearn import model_selection as cross_validation
from sklearn import metrics
# Score the classifier on the held-out 20% split.
predicted = clf.predict(recipes_test_data.toarray())
metrics.accuracy_score(recipes_test_result, predicted)

In [None]:
# Confusion matrix of true vs. predicted cuisines on the held-out split.
metrics.confusion_matrix(recipes_test_result, predicted) 

In [None]:
# Load the test recipes with the same dtype mapping that is used for the training file.
recipes_test = pd.read_json(
    "test.json",
    orient="records",
    dtype={"cuisine": "str", "id": "int64", "ingredients": "str"},
)

In [None]:
# Vectorize the test ingredients with the already-fitted vectorizer
# (transform only, so no new vocabulary is learned here).
ingredients_test_features = vectorizer.transform(recipes_test["ingredients"])

In [None]:
# Display the dimensions of the test feature matrix.
ingredients_test_features.shape

In [None]:
# Predict a cuisine for every test recipe; the sparse matrix is densified first,
# as was done when fitting the classifier.
test_predicted = clf.predict(ingredients_test_features.toarray()) 

In [None]:
# Pair each test recipe id with its predicted cuisine and write the result
# to CSV without the index, with "id" ahead of "cuisine".
test_df = pd.DataFrame(
    {
        'id': recipes_test["id"],
        'cuisine': test_predicted,
    }
)
test_df.to_csv(
    "result.csv",
    index=False,
    columns=("id", "cuisine"),
)