In [20]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
import json

# Load data

In [21]:
with open('train.json') as data_file:    
    all_train_data = json.load(data_file)
    all_train_labels = np.array([d["cuisine"] for d in all_train_data])
with open('test.json') as data_file:
    all_test_data = json.load(data_file)

# Ingredient List preprocessing

A few things I've tried:
- Custom stop words. The basic idea is to use a list of words that are commonly used in ingredients but have no significance to labelling. Results are mixed. I'm able to increase the score by adding certain words, but others decrease the score. The result changes if I re-slice the data. This tells me that I'm probably overfitting. With more work we might be able to come up with a generally useful list.
- Stemming: NLTK snowball. Marginal benefit, if any.
- n-grams: Some success here with 2-grams. The idea is that creating n-grams from the structured ingredient allows us to be smarter about it than a tokenizer with a bag of words would be. In particular, it allows us to restrict n-grams to within ingredients, rather than spanning ingredients, which assigns meaning to the ingredient order. There is also some improvement from extracting n-grams from the end of the ingredient and not the beginning, since ingredients with more than two words tend to start with less useful adjectives and end with more useful adjective/noun pairs.

In [22]:
stop_words = []
#stop_words = ['chopped', 'ground', 'dark', 'large', 'Italian', 'grated', 'sliced', 'salt', 'kosher',
# 'fresh', 'whole', 'minced', 'shredded']

# A baseline ingredient processing function that does nothing (for comparison)
def proc_ingredients_base(strings):
    return strings

# The ingredient processing function
def proc_ingredients(strings):
    # An empty list for the processed ingredients
    new_strings = []
    
    # 2-grams are useful, but any more than 2 degrades performance
    ngram_max = 2
    
    # Loop through ingredients
    for s in strings:
        # If we're going to use a stemmer and stop words, don't put the stemmer here, because
        # if it stems our stop words they won't be removed
        #s = stemmer.stem(s)
        
        # split ingredient into words and add words to list
        # (here's a better place to use the stemmer)
        #tokens = [stemmer.stem(t) for t in s.split() if t not in stop_words]
        tokens = [t for t in s.split() if t not in stop_words]
        
        # Add each individual token to the list
        new_strings.extend(tokens)
        
        # simulate n-grams (by concatenating words without spaces) within each ingredient
        n_original_tokens = len(tokens)
        if n_original_tokens > 1:
            for n in xrange(2, min(ngram_max + 1, n_original_tokens + 1)):
                # This takes all valid n-grams within the ingredient...
                #for i in xrange(n_original_tokens - n + 1):
                # This just takes the n-gram at the end of the ingredient...
                for i in xrange(max(0, n_original_tokens - n), n_original_tokens - n + 1):
                    new_strings.append("".join(tokens[i: i + n]))
                    
    return new_strings

# Classification Pipeline

Combine bag-of-words with other features extracted from the ingredient lists and pipe into a single classifier. Code loosely adapted from http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html.

In [23]:
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Extract all of the features we want from the recipe data"""
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, recipes):
        features = np.recarray(shape=(len(recipes),),
                               dtype=[('ingredient_text', object),
                                      ('ingredient_count', np.int32, (1,)),
                                      ('character_count', np.int32, (1,)),
                                      ('avg_word_length', np.float64, (1,)),
                                      ('avg_ing_length', np.float64, (1,)),
                                     ])
        for i, recipe in enumerate(recipes):
            ingredients = recipe['ingredients']
            features['ingredient_count'][i] = len(ingredients)
            words = ' '.join(ingredients)
            char_count = len(''.join(words.split()))
            features['character_count'][i] = char_count
            features['avg_word_length'] = float(char_count) / len(words.split())
            features['avg_ing_length'] = float(len(words.split())) / len(ingredients)
            features['ingredient_text'][i] = ' '.join(proc_ingredients(ingredients))
        return features
    
class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key."""
    
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]    

pipeline = Pipeline([
    ('extract', FeatureExtractor()),
    
    ('union', FeatureUnion(
        transformer_list=[
            
            ('text', Pipeline([
                ('select', ItemSelector(key='ingredient_text')),
                ('tfidf', TfidfVectorizer(max_df=0.5)),
            ])),         

            ('ingredient_count', Pipeline([
                ('select', ItemSelector(key='ingredient_count')),
            ])),
                    
            ('character_count', Pipeline([
                ('select', ItemSelector(key='character_count')),
            ])),                    

            ('avg_word_length', Pipeline([
                ('select', ItemSelector(key='avg_word_length')),
            ])),
                    
            ('avg_ing_length', Pipeline([
                ('select', ItemSelector(key='avg_ing_length')),
            ])),
        ],
                
        # playing with the weights doesn't seem to help when regression is the classifier
        transformer_weights={
            'text': 1.0,
            'ingredient_count': 1.0,
            'character_count': 1.0,
            'avg_word_length': 1.0,
            'avg_ing_length': 1.0,
        },
    )),

    #('regression', GridSearchCV(LogisticRegression(), {'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]})),
    ('regression', LogisticRegression(C=10.0)),
    #('svc', SVC(kernel='linear')),
    #('svc', LinearSVC(penalty="l1", dual=False, tol=1e-3)),
])

# For testing: generate pipeline score

In [24]:
train_data, dev_data, train_labels, dev_labels = train_test_split(all_train_data, 
                                                                  all_train_labels, 
                                                                  test_size=0.33, 
                                                                  random_state=12)

pipeline.fit(train_data, train_labels)
print pipeline.score(dev_data, dev_labels)

0.793691909188


# Analyze results

Calculate R value for each prediction and have a look at the mistakes. A couple of potential insights:
- The worst mistakes are pretty hopeless. Some of theme are non-descript, but a lot of them use the "wrong" ingredients for the label, e.g. Parmesan cheese in Chinese food. These are not necessarily authentic recipes--they are (probably Western) people's contributed interpretations of some type of cuisine, so they're bound to be weird and overlap.
- Some of the near misses have ingredients with more than 2 words in which all but the last couple of words are toss-aways, e.g. "ground black pepper", "hot pepper sauce", "light corn syrop", "grated lemon zest". Including some of these words in a stop word list is one way to go, but the general structure of the ingredients seems to be more important words towards the end. Hence the modification to n-gram construction above in which we just take the 2-gram from the end.

In [25]:
def print_r(R, indexes, pred, data, labels):
    for i in indexes:
        print R[i]
        print "Predicted: %s" % pred[i]
        print "Actual: %s" % labels[i]
        print '\n',
        print data[i]['ingredients']
        print '_' * 30, '\n'         # print separator       
        
# for testing: worst R values
predicted = pipeline.predict(dev_data)
probs = pipeline.predict_proba(dev_data)

# calculate R for each prediction
R = np.empty(probs.shape[0])
for i in range(probs.shape[0]):
    R[i] = max(probs[i]) / probs[i,np.where(pipeline.classes_ == dev_labels[i])]

# Find top 30 R values, i.e. the worst 30 predictions
worst_predictions = np.argpartition(R, -30)[-30:]
print "Worst predictions"
print "###############################\n"
print_r(R, worst_predictions, predicted, dev_data, dev_labels)

# Find 30 with the lowest R values that were not matched correctly
R[R == 1] = np.inf
nearest_misses = np.argpartition(R, 30)[:30]
print "Nearest misses"
print "###############################\n"
print_r(R, nearest_misses, predicted, dev_data, dev_labels)

Worst predictions
###############################

742.00695089
Predicted: british
Actual: french

[u'sugar', u'honey', u'buckwheat honey', u'garbanzo bean flour', u'flaxseed', u'sweet rice flour', u'pectin', u'tapioca flour', u'dry yeast', u'poppy seeds', u'buckwheat flour', u'double-acting baking powder', u'sponge', u'water', u'teff', u'sea salt', u'brown rice flour', u'psyllium husks', u'sunflower seeds', u'millet', u'grapeseed oil', u'gluten-free oat', u'oil']
______________________________ 

782.778091723
Predicted: southern_us
Actual: spanish

[u'sugar', u'unsalted butter', u'milk', u'all-purpose flour', u'warm water', u'salt', u'active dry yeast', u'white cornmeal']
______________________________ 

791.508030796
Predicted: mexican
Actual: filipino

[u'onion powder', u'ground cumin', u'ground black pepper', u'salt', u'garlic powder', u'paprika', u'chili powder', u'dried oregano']
______________________________ 

1075.63220963
Predicted: chinese
Actual: japanese

[u'hoisin sauce',

# For live run: predict and export

In [10]:
pipeline.fit(all_train_data, all_train_labels)
pred = pipeline.predict(all_test_data)
ids = np.array([d["id"] for d in all_test_data])
results = np.vstack((ids, pred)).T
np.savetxt("submission.csv", results, delimiter=",", fmt="%s", header="id,cuisine", comments="")