In [None]:
import pandas as pd
# Load the training recipes. Every column is given an explicit dtype; note that
# "ingredients" is read as a plain string so it can later be fed to a text
# vectorizer as a single document per recipe.
recipes = pd.read_json(
    "train.json",
    orient="records",
    dtype={
        "cuisine": "str",
        "id": "int64",
        "ingredients": "str",
    },
)
recipes.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

from nltk.tokenize.regexp import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.metrics import *
from nltk import pos_tag, word_tokenize
from sklearn.feature_extraction import text 

# Recipe-specific filler words (preparation states, sizes, dietary qualifiers)
# that should not become features. Each word appears once; the original list
# repeated 'minced'.
cooking_stop_words = ['fat', 'free', 'minced', 'ground', 'fresh', 'dried', 'chopped', 'large', 'small', 'cooked', 'purpose', 'kosher', 'extra',
                      'low', 'sodium', 'baked', 'diced', 'crushed', 'hot', 'cold', 'roasted', 'toasted', 'plain', 'warm']
# Combined stop list: scikit-learn's built-in English stop words plus the cooking words.
all_stop_words = text.ENGLISH_STOP_WORDS.union(cooking_stop_words)

class StemTokenizer(object):
    """Callable tokenizer: analyze a document, then Porter-stem every token.

    Intended to be passed as the ``tokenizer`` argument of a scikit-learn
    vectorizer. Tokenization is delegated to the analyzer of an internal
    ``CountVectorizer`` configured with a token pattern matching runs of two
    or more ASCII letters, ASCII accent stripping, and ``all_stop_words`` as
    the stop-word list.
    """
    def __init__(self):
        # The inner vectorizer is never fitted; it only supplies the analyzer.
        self.vectorizer = CountVectorizer(min_df=1, token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z]+\b", strip_accents='ascii', stop_words=all_stop_words)
        self.analyzer = self.vectorizer.build_analyzer()
        self.stemmer = PorterStemmer()
        # Not read anywhere in this class; kept so existing attribute access still works.
        self.reference_words = []
    def __call__(self, doc):
        """Return the list of stemmed tokens for ``doc``.

        The analyzer is run exactly once per document (the previous version
        ran it twice and discarded the first result).
        """
        raw_tokens = self.analyzer(doc)
        return [self.stemmer.stem(t) for t in raw_tokens]
    
class StemNounsTokenizer(object):
    """Callable tokenizer that drops adjective-tagged tokens before stemming.

    Tokens come from the analyzer of an internal ``CountVectorizer`` (two or
    more ASCII letters, ASCII accent stripping, ``all_stop_words`` removed).
    They are then part-of-speech tagged, and every token whose tag is exactly
    ``'JJ'`` is discarded; all remaining tokens are Porter-stemmed.

    NOTE(review): despite the class name, nothing restricts the output to
    nouns -- only the single tag ``'JJ'`` is filtered out.
    """
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.reference_words = []
        # Inner vectorizer exists solely to provide the analyzer callable.
        self.vectorizer = CountVectorizer(
            min_df=1,
            token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z]+\b",
            strip_accents='ascii',
            stop_words=all_stop_words,
        )
        self.analyzer = self.vectorizer.build_analyzer()

    def __call__(self, doc):
        """Return stemmed tokens of ``doc``, skipping those tagged ``'JJ'``."""
        tagged = pos_tag(self.analyzer(doc))
        kept = [word for word, pos in tagged if pos != 'JJ']
        return [self.stemmer.stem(word) for word in kept]


# Bag-of-words features over the ingredient strings: tokens are produced by
# StemTokenizer, terms must appear in at least two recipes, and the vocabulary
# is capped at 2000 terms. Counts are stored as float32.
vectorizer = CountVectorizer(
    tokenizer=StemTokenizer(),
    strip_accents='ascii',
    min_df=2,
    max_features=2000,
    dtype=np.float32,
)
ingredients_features = vectorizer.fit_transform(recipes["ingredients"])
ingredients_features

In [None]:
# Preview the first 1000 vocabulary terms.
# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(); prefer the new accessor and fall back on old installs.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
feature_names[0:1000]

In [None]:
# Total occurrences of every vocabulary term across all recipes. The sparse
# matrix is summed directly, which avoids building a dense copy first.
token_counts = np.asarray(ingredients_features.sum(axis=0)).ravel()

# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when it exists and fall back on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

features_counts = pd.DataFrame({'feature_name': feature_names, 'feature_count': token_counts})
# DataFrame.sort() no longer exists in pandas; sort_values() is the replacement.
features_counts = features_counts.sort_values('feature_count', ascending=False)
features_counts.to_csv("features_with_counts.csv", index=False, columns=["feature_name", "feature_count"])

In [None]:
from sklearn import preprocessing

# Map cuisine names to integer class ids; `le` is kept so the ids can be
# decoded back into names later.
le = preprocessing.LabelEncoder()
recipes["cuisine_encoded"] = le.fit_transform(recipes["cuisine"])

In [None]:
import xgboost as xgb
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# has lived in sklearn.model_selection since 0.18. Fall back for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the recipes for evaluation; the fixed seed keeps the split reproducible.
recipes_train_data_xgb, recipes_test_data_xgb, recipes_train_result_xgb, recipes_test_result_xgb = train_test_split(ingredients_features, recipes["cuisine_encoded"], test_size=0.2, random_state=42)

# The feature matrices are densified before being handed to xgboost. Tree
# boosters treat entries absent from a sparse matrix as "missing" but dense
# zeros as real values, so any matrix later passed to bst.predict() must be
# built the same (dense) way.
dtrain = xgb.DMatrix(recipes_train_data_xgb.toarray(), label=recipes_train_result_xgb)
dtest = xgb.DMatrix(recipes_test_data_xgb.toarray(), label=recipes_test_result_xgb)

# The parameter dict is created here; previously this cell assigned into an
# `xgb_params` that was never defined, which fails on a fresh kernel.
xgb_params = {
    # Plain names instead of the legacy 'bst:' prefix, which current xgboost
    # releases do not recognise (the settings would silently fall back to defaults).
    'max_depth': 40,
    'eta': 0.2,
    # NOTE(review): 'silent' is deprecated in newer xgboost in favour of
    # 'verbosity'; kept as-is for compatibility with older installs.
    'silent': 0,
    'objective': 'multi:softmax',
    # One class per distinct cuisine seen by the label encoder (was hard-coded to 20).
    'num_class': len(le.classes_),
    'nthread': 2,
    'eval_metric': 'merror',
}

# Report the multiclass error on both the hold-out and the training split each round.
evallist  = [(dtest,'eval'), (dtrain,'train')]

num_round = 200
bst = xgb.train(xgb_params, dtrain, num_round, evallist)

In [None]:
# Load the unlabeled recipes and project them onto the vocabulary learned from
# the training data (transform only -- the vectorizer is not refitted).
recipes_test = pd.read_json("test.json", orient = "records", dtype = {"cuisine" : "str", "id" : "int64", "ingredients" : "str"})
ingredients_test_features = vectorizer.transform(recipes_test["ingredients"])
# Densify before building the DMatrix, exactly as the training data was. With
# tree boosters, entries absent from a sparse matrix are treated as missing
# while dense zeros are treated as values, so mixing formats between training
# and prediction changes which branch a zero-count feature follows.
ingredients_test_matrix = xgb.DMatrix(ingredients_test_features.toarray())

In [None]:
# Score the test recipes with the trained booster.
# NOTE(review): with the 'multi:softmax' objective configured at training time
# this is presumably one class id per row, returned as floats -- the following
# cell casts the values to int32 before decoding them.
test_predicted = bst.predict(ingredients_test_matrix) 

In [None]:
# Cast the predicted class ids to int32, then map them back to cuisine names
# with the label encoder fitted on the training labels.
test_predicted = np.asarray(test_predicted).astype("int32")
predicted_labels = le.inverse_transform(test_predicted)

In [None]:
# Assemble the submission table (recipe id + predicted cuisine) and write it
# out with the columns in that order and without the index.
test_df = pd.DataFrame({
    'id': recipes_test["id"],
    'cuisine': predicted_labels,
})
test_df.to_csv("result_xgboost.csv", columns=("id", "cuisine"), index=False)