# bag of word

In [1]:
# read in dataset
import pandas as pd
df=pd.read_json(r'E:\what is cooking\train.json')
df2 = pd.DataFrame([x for x in df['ingredients'].apply(lambda item: dict(map(lambda x: (x,1),item))).values]).fillna(0)
data=df2.join(df[['cuisine','id']])

#word2vec

In [2]:
from gensim.models import Word2Vec
num_features = 600    # Word vector dimensionality
min_word_count = 1   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

model = Word2Vec(df['ingredients'].values, workers=num_workers, size=num_features, min_count = min_word_count,
                 window = context, sample = downsampling, seed=1)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)



In [3]:
import numpy as np
def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features,),dtype="float32")
    nwords = 0
    index2word_set = set(model.index2word)
    for word in words:
        if word in index2word_set:
            featureVec = np.add(featureVec,model[word])
            nwords+=1
    if nwords:
        featureVec = np.divide(featureVec,nwords)
        return featureVec
    else:
        return np.zeros((num_features,),dtype="float32")
def getAvgFeatureVecs(ingredients, model, num_features):
    ingredientFeatureVecs = np.zeros((len(ingredients),num_features),dtype="float32")
    for index,ingredient in enumerate(ingredients):
        ingredientFeatureVecs[index] = makeFeatureVec(ingredient, model, num_features)
    return ingredientFeatureVecs


X_word2vec=getAvgFeatureVecs(df['ingredients'].values, model, num_features)

# split training and testing data

In [4]:
# generating the ingredient to index pair
ingredients=[]
for index, ingredient in enumerate(data.columns.values):
    ingredients.append((index,ingredient))
ingredients=dict(ingredients)

In [5]:
# generating data labels, denoted by Y
d=[]
cuisines=data['cuisine'].unique()
for index,value in enumerate(cuisines):
    d.append((value,index))
d=dict(d)
label=data['cuisine'].apply(lambda x: d[x])
Y=label.values

In [6]:
# generating the feature vectors, denoted by X
from scipy import sparse
X=sparse.csr_matrix(data.iloc[:,0:6714].values)

In [7]:
# saving memory
del data,df2

In [8]:
# split dataset into training and testing
from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
# X_word2vec_train, X_word2vec_test, Y_word2vec_train, Y_word2vec_test = train_test_split(X_word2vec, Y, 
#                                                                                         test_size=0.25, random_state=42)

Prepare data for combinations

In [None]:
#Building covariance matrix for each restraurants
from sklearn.covariance import empirical_covariance 
combinations=[]
for cuisine in range(0,20):
    #print 'Cuisine:',cuisine,'\n'    
    chinese=[]
    for i in range(0,39774):
        if Y[i]==cuisine:
            chinese.append(X[i])
    C=empirical_covariance(chinese)
    one=np.amax(C)
    C=C/one
#Merge top combinations into one matrix
    for i in range(0,6714):
        for j in range(i+1,6714):
            if (C[i][j]>0.05):                
                #print 'index is [',i,',',j,']\n',ingredients[i],',',ingredients[j],'\n'
                combinations.append([i,j])
            
    #print '********************************************************\n'
#Get rid of the repeated values
combinations= [combinations[i] for i in range(len(combinations)) if combinations[i] not in combinations[:i]]
#Adding combinations for original matrix
combinedTrain=[]
for j in range(0,39774):    
    combinedTrain.append(X[j])
    num=len(combinations)
    for i in range(0,num):
        if (X[j][combinations[i][0]]==1) and (X[j][combinations[i][1]]==1) :
            combinedTrain[j]=np.append(combinedTrain[j],[1])
        else:
            combinedTrain[j]=np.append(combinedTrain[j],[0])
Ctrain=combinedTrain[0:30000]
Ctest=combinedTrain[30000:39774]

PCA

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
pca=PCA()
pca.fit(X)
plt.figure(1, figsize=(4, 3))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')


# fitting bag of word model into different estimator

In [None]:
# SVM
from sklearn import svm, grid_search
parameters = {'criterion':('gini', 'entropy'), 'C':[1e1,1e2,1e3,1e4]}
svr = svm.SVC()
clf = grid_search.GridSearchCV(svr, parameters)
clf.fit(X_train,Y_train) 
print clf.grid_scores_ , ' \n ','Best Estimator',clf.best_estimator_

#best svm
best_svr = svm.SVC(C=1000)
best_svr.fit(X_train,Y_train) 
sum(svr.predict(X_test)==Y_test) # 0.76

In [20]:
# random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, grid_search
parameters = {'criterion':('gini', 'entropy'), 'min_samples_split':[2,5,10,20],'min_samples_leaf':[1]}
randomF = RandomForestClassifier(n_estimators=100,oob_score=True,n_jobs=-1,
                                 max_features='auto')

randomF.fit(X_train,Y_train) 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [8]:
sum(randomF.predict(X_test)==Y_test) #0.716

7120

In [10]:
# best logistic regression
from sklearn.linear_model import LogisticRegression

clf1 = LogisticRegression(C=1.5,random_state=1)
clf1.fit(X_train,Y_train)  #0.7846326

LogisticRegression(C=1.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# ensemble learning(majority vote)

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn import cross_validation

clf1 = LogisticRegression(C=1.5,random_state=1)
clf2 = RandomForestClassifier(n_estimators=100,oob_score=True,n_jobs=-1,
                                 max_features='auto',random_state=1)
clf3 = svm.SVC(C=1000)

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('svm', clf3)], voting='hard')

for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'SVM', 'Ensemble']):
    scores = cross_validation.cross_val_score(clf, X_train, Y_train, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.77 (+/- 0.00) [Logistic Regression]
Accuracy: 0.70 (+/- 0.00) [Random Forest]
Accuracy: 0.75 (+/- 0.00) [SVM]
Accuracy: 0.76 (+/- 0.00) [Ensemble]


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn import cross_validation

clf1 = LogisticRegression(C=1.5,random_state=1)
clf2 = RandomForestClassifier(n_estimators=100,oob_score=True,n_jobs=-1,
                                 max_features='auto',random_state=1)
clf3 = svm.SVC(C=1000,probability=True)

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('svm', clf3)], voting='soft')

for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'SVM', 'Ensemble']):
    scores = cross_validation.cross_val_score(clf, X_train, Y_train, cv=2, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.76 (+/- 0.00) [Logistic Regression]
Accuracy: 0.68 (+/- 0.00) [Random Forest]
Accuracy: 0.73 (+/- 0.00) [SVM]
Accuracy: 0.76 (+/- 0.00) [Ensemble]


# ensemble learning(boosting)

In [62]:
from sklearn import ensemble
gbc = ensemble.GradientBoostingClassifier(learning_rate=0.001,
                                          max_depth=3,n_estimators=4000)
gbc.fit(X_train, Y_train)

GradientBoostingClassifier(init=None, learning_rate=0.001, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=4000,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [63]:
sum(gbc.predict(X_test.toarray())==Y_test)

6467

In [12]:
#best so far
sum(gbc.predict(X_test.toarray())==Y_test)#learning_rate=0.05, max_depth=3,n_estimators=1500

7542

# fitting word2vec model into different estimators

In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm, grid_search

parameters = {'weights':('uniform','distance'), 'n_neighbors':[7,9,11,13]}
neigh = KNeighborsClassifier()
neigh_word2vec = grid_search.GridSearchCV(neigh, parameters)
neigh_word2vec.fit(X_word2vec_train,Y_word2vec_train) 
print neigh_word2vec.grid_scores_ , ' \n ','Best Estimator',neigh_word2vec.best_estimator_
# best knn
best_knn= KNeighborsClassifier(n_neighbors=11,weights='distance')
best_knn.fit(X_word2vec_train,Y_word2vec_train)
sum(best_knn.predict(X_word2vec_test)==Y_word2vec_test)   #0.66

In [None]:
# SVM
from sklearn import svm, grid_search
parameters = {'C':[1e6,1e7],'kernal':('rbf','sigmoid')}
svr_word2vec = svm.SVC()
clf_word2vec = grid_search.GridSearchCV(svr_word2vec, parameters)
clf_word2vec.fit(X_word2vec_train,Y_word2vec_train) 
print clf_word2vec.grid_scores_ , ' \n ','Best Estimator',clf_word2vec.best_estimator_

# best SVM
best_knn= svm.SVC()(C=100000,kernal='rbf')
best_knn.fit(X_word2vec_train,Y_word2vec_train)
sum(best_knn.predict(X_word2vec_test)==Y_word2vec_test) #0.69779 

In [None]:
#Logistic Regression
from sklearn import linear_model, datasets,grid_search

parameters = {'penalty':('l1', 'l2'), 'C':[1,10,100,1000]}
lr = linear_model.LogisticRegression()
clr = grid_search.GridSearchCV(lr, parameters)
clr.fit(X_word2vec_train,Y_word2vec_train) 
print clr.grid_scores_ , ' \n ','Best Estimator',clr.best_estimator_

#best logistic Regression

best_lr=LogisticRegression(C=100,penalty='l1')
best_lr.fit(X_word2vec_train,Y_word2vec_train)
sum(best_lr.predict(X_word2vec_test)==Y_word2vec_test) #0.68414

#neural network

In [73]:
from pybrain.datasets import ClassificationDataSet
from pybrain.tools.shortcuts import buildNetwork
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.structure.modules import SoftmaxLayer, TanhLayer,SigmoidLayer

ds_train = ClassificationDataSet(X_word2vec_train.shape[1],1,nb_classes=20)

for i in range(len(X_word2vec_train)):
    ds_train.addSample(X_word2vec_train[i],Y_train[i])

ds_train._convertToOneOfMany()
fnn = buildNetwork(X_word2vec_train.shape[1], 160, 20, 
                   hiddenclass=SigmoidLayer,outclass=SoftmaxLayer)
trainer = BackpropTrainer(fnn, ds_train)
for i in range(3):
    error = trainer.train()
    print "Epoch: %d, Error: %7.4f" % (i, error)

Epoch: 0, Error:  0.0151
Epoch: 1, Error:  0.0134
Epoch: 2, Error:  0.0130


In [74]:
from sklearn.metrics import accuracy_score
ypreds = []
ytrues = []
for i in range(X_word2vec_test.shape[0]):
    pred = fnn.activate(X_word2vec_test[i, :])
    ypreds.append(pred.argmax())
    ytrues.append(Y_word2vec_test[i])
sum(np.array(ypreds)==np.array(ytrues))

6114

In [67]:
for i in range(10):
    error = trainer.train()
    print "Epoch: %d, Error: %7.4f" % (i, error)

Epoch: 0, Error:  0.0118
Epoch: 1, Error:  0.0118
Epoch: 2, Error:  0.0118
Epoch: 3, Error:  0.0117
Epoch: 4, Error:  0.0117
Epoch: 5, Error:  0.0117
Epoch: 6, Error:  0.0116
Epoch: 7, Error:  0.0116
Epoch: 8, Error:  0.0116
Epoch: 9, Error:  0.0116


# network in cuisines

In [23]:
X_test.toarray()[np.where((Y_test==2) | (Y_test==4))].shape
np.array(map(lambda x: clf1.coef_[x],Y_test[np.where((Y_test==cuisine1) | (Y_test==cuisine2))])).shape

(1363L, 6714L)

In [27]:
# calculating the cosine similarity matrix
import numpy
# choose two cusines to draw the network
cuisine1=17
cuisine2=4
cuisine3=15


# multiplying the bag of word models with the feature weight given by logistic regress
dataframe=X_test.toarray()[np.where((Y_test==cuisine1) | (Y_test==cuisine2) | (Y_test==cuisine3))]*np.array(map(lambda x: clf1.coef_[x],Y_test[np.where((Y_test==cuisine1) | (Y_test==cuisine2)| (Y_test==cuisine3))]))

# dataframe=X_test.toarray()[np.where((Y_test==cuisine1) | (Y_test==cuisine2) )]*np.array(map(lambda x: clf1.coef_[x],Y_test[np.where((Y_test==cuisine1) | (Y_test==cuisine2))]))

#without multiplying the coefficient
# dataframe=X_test.toarray()[np.where((Y_test==cuisine1) | (Y_test==cuisine2))]
# dot production of the matrix
similarity=dataframe.dot(dataframe.T)

# find the inverse of the square root of the diagnoal elements
square_mag = np.diag(similarity)
inv_square_mag = 1.0 / square_mag
inv_square_mag[numpy.isinf(inv_square_mag)] = 0
inv_mag = numpy.sqrt(inv_square_mag)


del dataframe, inv_square_mag
# computing the cosine similarity
cosine_similarity=similarity*inv_mag

cosine_similarity=cosine_similarity.T*inv_mag

In [28]:
#choose a threshold value, below which we assume the restaurants are not connected

# cosine_similarity[cosine_similarity<0.5]=0
# cosine_similarity[np.where((cosine_similarity>0.5))]=1

# generating the network
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx
import random


g=nx.Graph(np.array([i[:] for i in cosine_similarity[:]]))
# to calculate the connection between each restaurant with the spring model
# however, this is not feasible as the size is too big

# pos=nx.spring_layout(g,pos=None,dim=2,iterations=1000,scale=100,k=2)

new_d={}
for key,value in d.iteritems():
    new_d[value]= key

# assign different color to different cuisin
new_cuisine=[new_d[i] for i in [cuisine1,cuisine2,cuisine3]]
# new_cuisine=[new_d[i] for i in [cuisine1,cuisine2]]
val_map = dict([(c, index) for index, c in enumerate(new_cuisine)])
ColorLegend=dict([(c, index) for index, c in enumerate(new_cuisine)])
values=[val_map[new_d[Y_test[np.where((Y_test==cuisine1) | (Y_test==cuisine2)| (Y_test==cuisine3))][node]]] for node in g.nodes()]
# values=[val_map[new_d[Y_test[np.where((Y_test==cuisine1) | (Y_test==cuisine2))][node]]] for node in g.nodes()]


# Color mapping
jet =cm= plt.get_cmap('jet')
cNorm  = colors.Normalize(vmin=0, vmax=max(values))
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)

# Using a figure to use it as a parameter when calling nx.draw_networkx
f = plt.figure(1)
ax = f.add_subplot(1,1,1)
for label in ColorLegend:
    ax.plot([0],[0],color=scalarMap.to_rgba(ColorLegend[label]),label=label)

# Just fixed the color map
nx.draw_networkx(g, cmap = jet, vmin=0, vmax= max(values),node_color=values,with_labels=False,ax=ax,
                 arrows=False,node_size=30,alpha=0.5,width=0.0)

# Setting it to how it was looking before.                                                                                                              
# plt.axis('off')
f.set_facecolor('w')

plt.legend(loc='best',bbox_to_anchor=(1.3,1),borderpad=0.2)
f.tight_layout()
plt.show()
# f.savefig(r'C:\Users\zhanlong\Desktop\cuisine.png',dpi=1000)

In [42]:
new_d={}
for key,value in d.iteritems():
    new_d[value]= key

In [44]:
new_cuisine=[new_d[i] for i in [cuisine1,cuisine2]]

In [49]:
new_d[Y_test[np.where((Y_test==cuisine1) | (Y_test==cuisine2))][2]]

u'chinese'

In [53]:
d

{u'brazilian': 13,
 u'british': 9,
 u'cajun_creole': 12,
 u'chinese': 8,
 u'filipino': 2,
 u'french': 14,
 u'greek': 0,
 u'indian': 3,
 u'irish': 16,
 u'italian': 6,
 u'jamaican': 4,
 u'japanese': 15,
 u'korean': 17,
 u'mexican': 7,
 u'moroccan': 18,
 u'russian': 19,
 u'southern_us': 1,
 u'spanish': 5,
 u'thai': 10,
 u'vietnamese': 11}