In [1]:
__author__ = 'satish'
import numpy as np
import pandas as pd

# Read Data
We use the pandas `read_csv` function to read the data. Since the file has no header row, we read each line in as is, into a single column, and later split it using the tab character as the delimiter.

In [2]:
# Load every raw line of the file into a single column named 'col'
# (the file has no header row), then preview the first five rows.
epic_df = pd.read_csv(
    'Project_Cuisine/epic_recipes.txt',
    header=None,
    names=['col'],
)
epic_df.head(5)

Unnamed: 0,col
0,Vietnamese\tvinegar\tcilantro\tmint\tolive_oil...
1,Vietnamese\tonion\tcayenne\tfish\tblack_pepper...
2,Vietnamese\tgarlic\tsoy_sauce\tlime_juice\ttha...
3,Vietnamese\tcilantro\tshallot\tlime_juice\tfis...
4,Vietnamese\tcoriander\tvinegar\tlemon\tlime_ju...


## Data Cleaning

Since there are 3 files to read in, we write a generic function that reads each file using pandas, cleans it up, splits every line on the tab character and builds a data frame from the result. The function returns a list of data frames, one for each file in the list passed in.

In [3]:
def clean_data(filenames):
    """Load recipe files and split each raw line into cuisine and ingredients.

    Every file is read with ``pd.read_csv`` into one column holding the raw
    line. The text before the first tab becomes ``cuisine``; the remaining
    tab-separated fields are re-joined with commas as ``ingredients``.

    Parameters
    ----------
    filenames : iterable of str
        Paths of the recipe files to load.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input file, in input order, with the columns
        ``cuisine`` and ``ingredients``.
    """
    def load_one(path):
        frame = pd.read_csv(path, names=['col'], header=None)
        # pop() removes the raw column from the frame and hands it back.
        raw_lines = frame.pop('col')
        # partition() cuts at the first tab only: [0] is the cuisine and [2]
        # the tail, whose remaining tabs separate the ingredients.
        frame['cuisine'] = raw_lines.apply(lambda line: line.partition('\t')[0])
        frame['ingredients'] = raw_lines.apply(
            lambda line: line.partition('\t')[2].replace('\t', ','))
        return frame

    return [load_one(path) for path in filenames]


## Load Data

Loading data from all three files into three different data frames.


In [4]:
# Load and clean the three recipe sources; `datas` holds one frame per file,
# in the order listed here.
datas = clean_data([
    'Project_Cuisine/epic_recipes.txt',
    'Project_Cuisine/allr_recipes.txt',
    'Project_Cuisine/menu_recipes.txt',
])
for data in datas:
    # DataFrame.info() prints its summary itself and returns None, so wrapping
    # it in print only added a stray "None" line after every summary.
    data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13408 entries, 0 to 13407
Data columns (total 2 columns):
cuisine        13408 non-null object
ingredients    13408 non-null object
dtypes: object(2)
memory usage: 314.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 41825 entries, 0 to 41824
Data columns (total 2 columns):
cuisine        41825 non-null object
ingredients    41825 non-null object
dtypes: object(2)
memory usage: 980.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2458 entries, 0 to 2457
Data columns (total 2 columns):
cuisine        2458 non-null object
ingredients    2458 non-null object
dtypes: object(2)
memory usage: 57.6+ KB
None


### Create the uber data set
Concatenate the three data frames created from the files above into a single data frame.

In [8]:
# Stack the per-source frames into one. ignore_index=True gives the combined
# frame a fresh 0..n-1 index; without it each source keeps its own 0-based
# labels, so one label would refer to several different recipes.
big_df = pd.concat(datas, ignore_index=True)
# The train/test split is done further down, once cuisine_group has been added.

In [9]:
# Raw cuisine label -> canonical cuisine name. Built once at definition time
# rather than on every call (the function is applied to every recipe row).
_CUISINE_ALIASES = {
    'italian': 'Italian',
    'asian': 'Asian',
    # Was 'Mexico', which this same table treats as a raw label that still
    # needs mapping to 'Mexican'; map straight to the canonical name.
    'mexico': 'Mexican',
    'japanese': 'Japanese',
    'chinese': 'Chinese',
    'China': 'Chinese',
    'korean': 'Korean',
    'Japan': 'Japanese',
    'Korea': 'Korean',
    'France': 'French',
    'India': 'Indian',
    'Italy': 'Italian',
    'Thailand': 'Thai',
    'Mexico': 'Mexican',
    'Scandinavia': 'Scandinavian',
    'Germany': 'German',
}


def standardize_cuisine_names(cuisine):
    """Return the canonical name for a raw cuisine label.

    Lower-case variants and country names are mapped to one capitalised
    cuisine name (e.g. ``'italian'`` and ``'Italy'`` both give ``'Italian'``).

    Parameters
    ----------
    cuisine : str
        Cuisine label as it appears in the source files.

    Returns
    -------
    str
        The canonical name, or ``cuisine`` unchanged when it has no entry in
        the alias table.
    """
    return _CUISINE_ALIASES.get(cuisine, cuisine)

In [10]:
# Replace every raw cuisine label with its canonical name.
big_df['cuisine'] = big_df['cuisine'].map(standardize_cuisine_names)

In [11]:
# Build a lookup from the whitespace-separated pairs in map.txt:
# first token -> second token. Lines with fewer than two tokens are skipped.
map_dict = {}
with open('Project_Cuisine/map.txt') as mapping_file:
    for raw_line in mapping_file:
        fields = raw_line.split()
        if len(fields) <= 1:
            continue
        source_name, group_name = fields
        map_dict[source_name] = group_name


In [13]:
# Look up each cuisine in map_dict; cuisines without an entry get None.
big_df['cuisine_group'] = big_df['cuisine'].map(map_dict.get)

In [14]:
# Preview the first three rows, including the cuisine_group column.
big_df.head(3)

Unnamed: 0,cuisine,ingredients,cuisine_group
0,Vietnamese,"vinegar,cilantro,mint,olive_oil,cayenne,fish,l...",SoutheastAsian
1,Vietnamese,"onion,cayenne,fish,black_pepper,seed,garlic",SoutheastAsian
2,Vietnamese,"garlic,soy_sauce,lime_juice,thai_pepper",SoutheastAsian


In [15]:
# Import train_test_split. It lives in sklearn.model_selection from
# scikit-learn 0.18 on; sklearn.cross_validation was removed in 0.20, so fall
# back to it only on older installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# Split the data into a 70/30 train/test split: ingredient strings are the
# inputs, cuisine groups the targets. The fixed random_state makes the split,
# and every score computed from it, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    big_df.ingredients, big_df.cuisine_group, test_size=0.3, random_state=42)

## Vectorization
The features in this data set are the ingredients of each dish. To build the feature vectors, we need to turn these words into numeric features. Since this is text, we use a count vectorizer to do that. In this case an ingredient either occurs in a dish or it does not, so the data fits the Boolean occurrence variant of the bag-of-words model. The appropriate choice is therefore a binary vectorizer, which we get by setting the `binary` parameter of `CountVectorizer` to `True`.


In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# Binary count vectorizer: each feature only records whether a token occurs
# in a recipe. The vocabulary is learned from the training split alone and
# then reused to encode the test split.
count_vect = CountVectorizer(binary=True, decode_error='ignore')
train_simple_dtm = count_vect.fit_transform(X_train)
test_simple_dtm = count_vect.transform(X_test)

## Let's see how the prediction works for the cuisine group itself
# The encoder is fitted on every group found in the full data set, so both
# splits share the same label -> integer mapping.
unique_cuisine_group = big_df.cuisine_group.unique().tolist()
le = LabelEncoder().fit(unique_cuisine_group)
y_train, y_test = le.transform(y_train), le.transform(y_test)

###Feature Vector Array
We now convert the feature vector array to a dataframe 

In [19]:
from sklearn.naive_bayes import BernoulliNB

# Build the Bernoulli naive Bayes model and train it on the binary
# document-term matrix in one step (fit() returns the estimator itself).
bnb = BernoulliNB().fit(train_simple_dtm, y_train)
# Evaluate on the held-out split; the score is shown as the cell output.
bnb.score(test_simple_dtm, y_test)

0.68679223480471463

In [20]:
from sklearn import cross_validation
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import cross_val_score
from sklearn import clone
from sklearn.datasets import load_iris
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                              AdaBoostClassifier)
from sklearn.externals.six.moves import xrange
from sklearn.tree import DecisionTreeClassifier

# Baseline random forest with default settings. The fixed random_state makes
# the forest construction repeatable, so the score does not drift between
# runs on the same split.
ensemble_r = RandomForestClassifier(random_state=42)
ensemble_r.fit(train_simple_dtm, y_train)
# Evaluate on the held-out split; the score is shown as the cell output.
ensemble_r.score(test_simple_dtm, y_test)

0.7394268546336954

In [22]:
# Compare three tree-based classifiers on the same split. Each estimator gets
# a fixed random_state so its score is repeatable, and print(...) with a
# single argument behaves the same under Python 2 and Python 3.

# Single decision tree, depth-limited. min_samples_split=2 is the smallest
# meaningful value (a node needs two samples to be split) and, unlike the
# previous value of 1, is accepted by newer scikit-learn releases.
ensemble_d = DecisionTreeClassifier(max_depth=10, min_samples_split=2,
                                    random_state=42)
ensemble_d.fit(train_simple_dtm, y_train)
print(ensemble_d.score(test_simple_dtm, y_test))


# Random forest with 10 trees (rebinds the earlier ensemble_r).
ensemble_r = RandomForestClassifier(n_estimators=10, random_state=42)
ensemble_r.fit(train_simple_dtm, y_train)
print(ensemble_r.score(test_simple_dtm, y_test))


# Extremely randomized trees with 10 trees.
ensemble_e = ExtraTreesClassifier(n_estimators=10, random_state=42)
ensemble_e.fit(train_simple_dtm, y_train)
print(ensemble_e.score(test_simple_dtm, y_test))

0.739484631384
0.737751328865
0.738964640629


#### Feature Engineering 


In [23]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Two-stage model: project the features onto 10 principal components, then
# classify the projection with an SVC using default settings.
estimators = [
    ('reduce_dim', PCA(n_components=10, copy=True, whiten=False)),
    ('svm', SVC()),
]
clf = Pipeline(estimators)


In [25]:
# Fit the PCA + SVC pipeline on the training data. .toarray() converts the
# document-term matrix to a dense array first (presumably because the PCA
# step does not accept sparse input -- TODO confirm).
clf.fit(train_simple_dtm.toarray(),y_train)

Pipeline(steps=[('reduce_dim', PCA(copy=True, n_components=10, whiten=False)), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [26]:
# Score the fitted pipeline on the held-out split, densified the same way as
# the training data; the result is shown as the cell output.
clf.score(test_simple_dtm.toarray(),y_test)

0.73388028657268312

In [32]:
from time import time
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# NOTE(review): RandomizedPCA was removed in scikit-learn 0.20; on newer
# releases PCA(svd_solver='randomized') is the replacement.
from sklearn.decomposition import RandomizedPCA


# Compute a PCA on the binary ingredient matrix
# Keep 50 principal components
n_components = 50

# The feature count is read from the sparse matrix's own shape, so no dense
# copy is built just for this message.
print("Extracting the top %d principal components from %d ingredient features" % (
    n_components, train_simple_dtm.shape[1]))
t0 = time()
# Create a RandomizedPCA with n_components and fit it to the densified
# training matrix; random_state makes the randomized decomposition repeatable.
pca = RandomizedPCA(n_components=n_components, whiten=True,
                    random_state=42).fit(train_simple_dtm.toarray())
print("done in %0.3fs" % (time() - t0))


Extracting the top 50 eigenfaces from 377 faces
done in 1.087s


In [34]:
# The message used to talk about "eigenfaces", left over from a faces example;
# what is projected here is the ingredient document-term matrix.
print("Projecting the input data on the principal components basis")
t0 = time()
# Transform the training matrix to X_train_pca
X_train_pca = pca.transform(train_simple_dtm.toarray())
# Transform the test matrix to X_test_pca with the same fitted projection
X_test_pca = pca.transform(test_simple_dtm.toarray())
print("done in %0.3fs" % (time() - t0))

Projecting the input data on the eigenfaces orthonormal basis
done in 0.261s


In [37]:
# Hyper-parameter grid for GridSearchCV: every combination of a C value and
# a gamma value listed here is a candidate.
param_grid = dict(
    C=[1e3, 5e3, 1e4, 5e4, 1e5],
    gamma=[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
)

In [None]:
# Train a SVM classification model
# GridSearchCV lives in sklearn.model_selection from scikit-learn 0.18 on;
# sklearn.grid_search was removed in 0.20, so fall back to it only on older
# installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
print("Fitting the classifier to the training set")
t0 = time()
# Create GridSearchCV over param_grid with an RBF-kernel SVC.
# NOTE(review): class_weight='auto' was deprecated in scikit-learn 0.17 in
# favour of 'balanced'; left unchanged because the two weightings are not
# identical -- confirm against the installed version before switching.
clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
# Fit the model on the PCA-projected training data (rebinds clf, which
# previously held the PCA + SVC pipeline).
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

In [None]:
# Quantitative evaluation of the model quality on the test set

print("Predicting the cuisine groups on the testing set")
t0 = time()
# Generate test predictions as y_pred
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))
# target_names and n_classes are not defined in any visible cell, so they are
# derived here from the fitted label encoder: one class per encoded cuisine
# group, named by the original group label.
n_classes = len(le.classes_)
class_ids = list(range(n_classes))
target_names = [str(name) for name in le.classes_]
# Print classification_report; labels is passed explicitly so the report rows
# line up with target_names even if a class is missing from the test split.
print(classification_report(y_test, y_pred, labels=class_ids,
                            target_names=target_names))
# Print confusion_matrix with rows/columns in encoded-label order
print(confusion_matrix(y_test, y_pred, labels=class_ids))