# Feature extraction

Let us try different combinations of feature extraction techniques.

In [None]:
# Notebook setup: open a Qt console attached to this kernel so variables
# can be inspected interactively while the cells run.
%qtconsole
import os

# Move into the code directory (presumably so that `base` is importable)
# and pull in everything it exports.
# NOTE(review): later cells use pd, np, ast, copy, CountVectorizer,
# TfidfVectorizer and save_sparse_csr without importing them, so they
# presumably come from this star import -- confirm against base.py.
# '/your-path' is a placeholder and must be replaced with the local checkout.
os.chdir('/your-path/whats-cooking/code')
from base import *

# Switch to the competition root; the relative data paths used by later
# cells (e.g. './eng-data/...') are resolved from this directory.
os.chdir('/your-path/whats-cooking')

In [None]:
# Load the stemmed, joined variant of the data set.
# `path` is reused by later cells as the output directory for the features.
path = './eng-data/stemmed-joined'
train_file = path + '/train.json'
test_file = path + '/test.json'
dict_file = path + '/ing_dict.txt'

train = pd.read_json(train_file)
test = pd.read_json(test_file)

# The ingredient dictionary is stored on disk as a Python dict literal.
with open(dict_file, 'r') as handle:
    dict_source = handle.read()
ing_dict = ast.literal_eval(dict_source)

Let us test 3 different ideas: 

a) use term frequency counts as features <br>
b) tf-idf with D (space of documents) being the recipes <br>
c) tf-idf with D being the labels

### Term frequency counts

We can use the CountVectorizer class to perform this operation:

In [None]:
def do_nothing(tokens):
    """Identity analyzer: hand sklearn the stored token list unchanged."""
    return tokens


# corpus = every recipe's ingredient list, train rows first, then test rows
combi = pd.concat([train, test])
corpus = combi['ingredients']

# term-frequency counts over the fixed ingredient vocabulary
# (the vocabulary is given, so no fitting is needed before transform)
cvect = CountVectorizer(vocabulary=ing_dict, analyzer=do_nothing)
count_mat = cvect.transform(corpus)

# row boundaries of the train and test parts inside the combined matrix
end_train = len(train)
end_test = len(combi)

# persist each part separately
train_counts = count_mat[:end_train]
test_counts = count_mat[end_train:end_test]
save_sparse_csr(path + '/counts-train.npz', train_counts)
save_sparse_csr(path + '/counts-test.npz', test_counts)

### tf-idf with recipes as documents

Same as before, but using TfidfVectorizer.

In [None]:
def do_nothing(tokens):
    """Identity analyzer: hand sklearn the stored token list unchanged."""
    return tokens


# corpus = every recipe's ingredient list, train rows first, then test rows
combi = pd.concat([train, test])
corpus = combi['ingredients']

# tf-idf weighting over the fixed ingredient vocabulary; the idf statistics
# are fitted on the combined train + test corpus
tf_idf_vect = TfidfVectorizer(vocabulary=ing_dict, analyzer=do_nothing)
tfidf_mat = tf_idf_vect.fit_transform(corpus)

# row boundaries of the train and test parts inside the combined matrix
end_train = len(train)
end_test = len(combi)

# persist each part separately
train_tfidf = tfidf_mat[:end_train]
test_tfidf = tfidf_mat[end_train:end_test]
save_sparse_csr(path + '/tfidf-train.npz', train_tfidf)
save_sparse_csr(path + '/tfidf-test.npz', test_tfidf)

### tf-idf with labels as documents

Now, we want the counts for each recipe, but we want to weigh the terms with respect to their presence in the cuisines.

In [None]:
# the counts matrix built earlier is the starting point; show its first row
first_recipe_counts = count_mat[0]
print(first_recipe_counts)

How many different cuisines are there?

In [None]:
# recipes per cuisine, computed once and reused for the total
cuisine_counts = train['cuisine'].value_counts()
print(cuisine_counts)
print('number of cuisines: %d' % len(cuisine_counts))

Let us first build a dictionary that maps each ingredient to the list of cuisines in which it appears.

In [None]:
# ingredient -> list of distinct cuisines it occurs in (first-seen order)
cuisine_dict = {}
# row-wise iteration is slow in pandas, but acceptable for a data set this small
for _, recipe in train.iterrows():
    cuisine = recipe['cuisine']
    for ingredient in recipe['ingredients']:
        seen_in = cuisine_dict.setdefault(ingredient, [])
        if cuisine not in seen_in:
            seen_in.append(cuisine)

Let us now build a frequency dictionary for ingredients: In how many cuisines has the ingredient appeared?

In [None]:
import operator

# document frequency: number of distinct cuisines each ingredient occurs in
docfreq_dict = {ingredient: len(cuisines)
                for ingredient, cuisines in cuisine_dict.items()}

# (ingredient, frequency) pairs, most widespread ingredients first
by_frequency = operator.itemgetter(1)
sorted_docfreqs = sorted(docfreq_dict.items(), key=by_frequency, reverse=True)

Let us visualize these:

In [None]:
# Show both ends of the document-frequency ranking; sorted_docfreqs is
# ordered from the most widespread ingredient to the least widespread one.
print('25 ingredients who appear most throughout cuisines:')
print(sorted_docfreqs[:25])
print('25 ingredients who appear less throughout cuisines:')
# The slice must be open-ended: an end index of -1 would exclude the very
# last entry and list only 24 ingredients.
print(sorted_docfreqs[-25:])

Now, the weight of a term is idf(t) = 1 + log(N / (1 + D_t)). <br>

N = number of documents (in this case cuisines). <br>
D_t = number of documents where term t appears.

Let us weigh the counts matrix:

In [None]:
# we will need the ingredients dictionary
with open(path + '/ing_dict.txt', 'r') as f:
    ing_dict = ast.literal_eval(f.read())

from __future__ import division    
cuisinefreqs_mat = copy.deepcopy(count_mat)    
    
# each column will have the same weight - term weight
for term in ing_dict.keys():
    weight = 1 + np.log(20/(1 + docfreq_dict[term]))
    nonzero_terms = cuisinefreqs_mat[:, ing_dict[term]].nonzero()[0]
    cuisinefreqs_mat[nonzero_terms, ing_dict[term]] = weight

Save our third representation:

In [None]:
# row boundaries of the train and test parts inside the combined matrix
end_train = len(train)
end_test = len(combi)

# persist each part separately
train_cuisinefreqs = cuisinefreqs_mat[:end_train]
test_cuisinefreqs = cuisinefreqs_mat[end_train:end_test]
save_sparse_csr(path + '/cuisinefreqs-train.npz', train_cuisinefreqs)
save_sparse_csr(path + '/cuisinefreqs-test.npz', test_cuisinefreqs)

Check differences between our representations:

In [None]:
# print the first recipe's row under each representation, in the order
# counts, recipe-level tf-idf, cuisine-level weights
for representation in (count_mat, tfidf_mat, cuisinefreqs_mat):
    print(representation[0])