# Feature Engineering

Create and modify features!

In [3]:
# starting up a console attached to this kernel
# (IPython line magic: only valid inside an IPython/Jupyter session)
%qtconsole
import os

# importing base code
# NOTE: '/your-path' is a placeholder and must be replaced with the local
# project location before running.
# The working directory is switched to the code folder before the import,
# presumably so that the `base` module can be found from there.
os.chdir('/your-path/whats-cooking/code')
# NOTE(review): later cells use pd, nltk, SnowballStemmer, TfidfVectorizer and
# save_sparse_csr without importing them, so they presumably come from this
# star import -- confirm against base.py.
from base import *

# changing to competition dir
# The relative paths used by later cells ('./raw-data/...', './eng-data/...')
# are resolved against this directory.
os.chdir('/your-path/whats-cooking')

In [4]:
# loading the raw competition files: one DataFrame per JSON file,
# unpacked in the same order as the paths are listed
train, test = map(pd.read_json,
                  ('./raw-data/train.json', './raw-data/test.json'))

First, let us define the encoding for the cuisine labels.

In [None]:
# building the label encoding: cuisines are numbered 1, 2, 3, ... in the
# order returned by value_counts() (most frequent cuisine first)
y = train['cuisine']
enc_dict = {label: code
            for code, label in enumerate(y.value_counts().index, 1)}

# persisting the mapping as the plain-text repr of the dict
with open('./raw-data/enc_dict.txt', 'w') as f:
    f.write(str(enc_dict))

# sanity check: show the mapping and the first ten encoded labels
print(enc_dict)
print(y.replace(enc_dict)[:10])

### Stemming and ingredient overlapping

The raw data has too many features, rendering computation inefficient. Let us stem and process the ingredient names so we can reduce dimensionality.

#### Stemming, tokenizing and splitting

Stem all words after tokenizing: if a single ingredient has 3 words ('grilled chicken breast', for instance) it will be transformed into 3 separate stemmed tokens. 

In [None]:
# reading files
train = pd.read_json('./raw-data/train.json')
test = pd.read_json('./raw-data/test.json')

# stemmer
stm = SnowballStemmer('english').stem
tknz = nltk.word_tokenize


def stem_tokens(ingredients):
    """Split every ingredient name into words and stem each word.

    Parameters
    ----------
    ingredients : iterable of str
        Ingredient names of a single recipe.

    Returns
    -------
    list of str
        One flat list of stemmed tokens; a multi-word ingredient contributes
        one entry per word, in the original order.
    """
    return [stm(token) for ing in ingredients for token in tknz(ing)]


# Replace the whole column in a single assignment. The previous row-by-row
# `frame['ingredients'][row] = ...` was chained indexing: it may write to a
# temporary copy (SettingWithCopyWarning) and it modified the frame while
# iterrows() was still iterating over it.
# train
train['ingredients'] = train['ingredients'].apply(stem_tokens)
# one summary line instead of one progress line per recipe
print('stemmed train recipes: %d' % len(train))

# test
test['ingredients'] = test['ingredients'].apply(stem_tokens)
print('stemmed test recipes: %d' % len(test))

# saving
train.to_json('./eng-data/stemmed-tknz/train.json', orient='records')
test.to_json('./eng-data/stemmed-tknz/test.json', orient='records')

#### Stemming, tokenizing, and joining back

In this case, a 3-word ingredient would have all of its words tokenized, stemmed and then joined back together. 

In [None]:
# reading files
train = pd.read_json('./raw-data/train.json')
test = pd.read_json('./raw-data/test.json')

# stemmer
stm = SnowballStemmer('english').stem
tknz = nltk.word_tokenize


def stem_and_join(ingredients):
    """Stem every word of each ingredient name, keeping one string per ingredient.

    Parameters
    ----------
    ingredients : iterable of str
        Ingredient names of a single recipe.

    Returns
    -------
    list of str
        Same number of entries as the input; each entry is the ingredient's
        stemmed tokens joined back together with single spaces.
    """
    return [' '.join(stm(token) for token in tknz(ing))
            for ing in ingredients]


# Replace the whole column in a single assignment. The previous row-by-row
# `frame['ingredients'][row] = ...` was chained indexing: it may write to a
# temporary copy (SettingWithCopyWarning) and it modified the frame while
# iterrows() was still iterating over it.
# train
train['ingredients'] = train['ingredients'].apply(stem_and_join)
# one summary line instead of one progress line per recipe
print('stemmed train recipes: %d' % len(train))

# test
test['ingredients'] = test['ingredients'].apply(stem_and_join)
print('stemmed test recipes: %d' % len(test))

# saving
train.to_json('./eng-data/stemmed-joined/train.json', orient='records')
test.to_json('./eng-data/stemmed-joined/test.json', orient='records')

#### Stemming, tokenizing and building n-grams

First, we built 1-grams out of the ingredient names. Then we stemmed the words and joined them back into full ingredient names. 

Now, we build n-grams out of the names, and explode the number of features.

In [7]:
# loading the stemmed, tokenized recipes produced by the
# "Stemming, tokenizing and splitting" step
train = pd.read_json('./eng-data/stemmed-tknz/train.json')
test = pd.read_json('./eng-data/stemmed-tknz/test.json')

# stacking train on top of test and remembering where each part ends
combi = pd.concat([train, test])
end_train = len(train)
end_test = end_train + len(test)


def do_nothing(x):
    """Identity function, passed as the vectorizer's preprocessor."""
    return x


def string_join(x):
    """Join a recipe's list of tokens into one space-separated string."""
    return ' '.join(x)


# one document per recipe, in the same row order as `combi`
corpus = list(map(string_join, combi['ingredients']))

# word n-grams from 1 to 5 tokens, with a minimum document frequency of 8
tf_idf_vect = TfidfVectorizer(min_df=8,
                              ngram_range=(1, 5),
                              preprocessor=do_nothing)

# the vectorizer is fitted on the training rows only; the test rows are
# transformed with that fitted vocabulary
train_mat = tf_idf_vect.fit_transform(corpus[:end_train])
test_mat = tf_idf_vect.transform(corpus[end_train:end_test])

save_sparse_csr('./eng-data/stemmed-word5-min8/train.npz', train_mat)
save_sparse_csr('./eng-data/stemmed-word5-min8/test.npz', test_mat)