In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.offline as pyo

In [4]:
train = pd.read_json("train.json")
test = pd.read_json("test.json")
test.head()

Unnamed: 0,id,ingredients
0,10210,"[barbecue sauce, yellow onion, prepared pizza ..."
1,2310,"[olive oil, stewed tomatoes, hot sauce, chicke..."
2,33213,"[chopped bell pepper, cilantro leaves, white o..."
3,16902,"[tomatoes, purple onion, cumin seed, red lenti..."
4,9056,"[chile powder, lime, blackpepper, ground clove..."


In [5]:
# onehot encoder for ingredients
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
expandedLabelData_train = mlb.fit_transform(train["ingredients"])
labelClasses = mlb.classes_
expandedLabelData_test = mlb.fit_transform(test["ingredients"])
labelClasses_2 = mlb.classes_

expandedLabels_train = pd.DataFrame(expandedLabelData_train, columns=labelClasses)
expandedLabels_test = pd.DataFrame(expandedLabelData_test, columns=labelClasses_2)
print(expandedLabels_train)
print()
print(expandedLabels_test)

       (    oz.) tomato sauce  (   oz.) tomato paste  \
0                           0                      0   
1                           0                      0   
2                           0                      0   
3                           0                      0   
4                           0                      0   
...                       ...                    ...   
29769                       0                      0   
29770                       0                      0   
29771                       0                      0   
29772                       0                      0   
29773                       0                      0   

       (10 oz.) frozen chopped spinach  \
0                                    0   
1                                    0   
2                                    0   
3                                    0   
4                                    0   
...                                ...   
29769                            

In [9]:
# shuffle data
from numpy import nan

def shuffle(df_origin): 
    df = df_origin.sample(frac=1,random_state=113)#,random_state=113
    df2= df.reset_index(drop=True)
    return df2

df = pd.DataFrame(train)
df = pd.concat([df,expandedLabels_train], axis = 1)
df_shuffle_train = shuffle(df)
print(df_shuffle_train)

df = pd.DataFrame(test)
df = pd.concat([df,expandedLabels_test], axis = 1)
df_shuffle_test = shuffle(df)
print(df_shuffle_test)
df_shuffle_test.to_csv("shuffle_test.csv")

df_mix = pd.concat([df_shuffle_train, df_shuffle_test])
df_mix= df_mix.replace(nan, 0)
print(df_mix)


          id   cuisine                                        ingredients  \
0      45758   italian  [bread crumbs, ricotta cheese, fresh parsley, ...   
1       2065     greek  [large eggs, salt, dried oregano, plain yogurt...   
2        968    korean  [light brown sugar, dijon mustard, paprika, Go...   
3      30592   mexican  [shredded cheddar cheese, red bell pepper, gre...   
4      44223    indian  [jasmine rice, unsalted butter, grapeseed oil,...   
...      ...       ...                                                ...   
29769   3997    french  [cherries, all-purpose flour, sugar, refrigera...   
29770   4245  japanese  [avocado, crabmeat, sushi rice, nori, soy sauc...   
29771  36461   italian  [parmigiano reggiano cheese, crusty bread, par...   
29772  44094  japanese  [boneless chicken breast, salt, soy sauce, mus...   
29773  14160   italian  [tomato sauce, bacon, onions, butter, bow-tie ...   

       (    oz.) tomato sauce  (   oz.) tomato paste  \
0                  

In [10]:
#PCA for lower down dimensionality
from sklearn.decomposition import PCA
N_COMPONENTS = 512
pca = PCA(N_COMPONENTS)
pca.fit(df_mix[df_mix.columns[3:]])
df_mix_pca = pd.DataFrame(pca.transform(df_mix[df_mix.columns[3:]]))
df_mix_pca.head()

print(df_mix_pca)

            0         1         2         3         4         5         6    \
0      0.510982  0.088518 -0.672753 -0.158376  0.414495  0.268091 -0.137284   
1      0.274644  0.375971 -0.176782  0.065640  0.075968 -0.475731  0.255792   
2     -0.307932  0.051774  0.877637 -0.059074  0.309773  0.580567 -0.154620   
3     -0.559968 -0.159165 -0.164936 -0.188891 -0.156836 -0.321363 -0.230882   
4     -0.312332 -0.461955  0.375438 -0.446578  0.049873 -0.273498  0.153156   
...         ...       ...       ...       ...       ...       ...       ...   
39769 -0.794928  0.016763  0.615479  0.982461  0.703543  0.028578  0.929613   
39770  0.417789  1.046598  0.052702  0.086276 -0.335018  0.499626 -0.514245   
39771  0.446145  0.307237 -0.438216  0.144716  0.617088  0.304539  0.038731   
39772 -0.515761 -0.015798  0.253565  0.321376 -0.101324  0.153866 -0.802781   
39773 -0.600770 -0.027741 -0.127006 -0.146289 -0.086928 -0.206495 -0.233840   

            7         8         9    ...       502 

In [11]:
#transform cuisine
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(df_shuffle_train.cuisine)

df_shuffle_train.cuisine = le.transform(df_shuffle_train.cuisine)
print(df_shuffle_train.cuisine)
print(le.classes_)

0         9
1         6
2        12
3        13
4         7
         ..
29769     5
29770    11
29771     9
29772    11
29773     9
Name: cuisine, Length: 29774, dtype: int32
['brazilian' 'british' 'cajun_creole' 'chinese' 'filipino' 'french'
 'greek' 'indian' 'irish' 'italian' 'jamaican' 'japanese' 'korean'
 'mexican' 'moroccan' 'russian' 'southern_us' 'spanish' 'thai'
 'vietnamese']


In [12]:
# train_test_split
from sklearn.model_selection import train_test_split

"""df_shuffle_train_pca = df_mix_pca.iloc[:29774, :]
df_shuffle_test_pca = df_mix_pca.iloc[29774:39774, :]
print(df_shuffle_train_pca)
print(df_shuffle_test_pca)

train_x,test_x,train_y,test_y = train_test_split(df_shuffle_train_pca, df_shuffle_train.cuisine, random_state=42, test_size=0.30)
print(len(train_x))
print()
print(len(test_x))"""

df_shuffle_train_pca = df_mix.iloc[:29774, :].drop(['ingredients', 'id', 'cuisine'], axis=1)
df_shuffle_test_pca = df_mix.iloc[29774:39774, :].drop(['ingredients', 'id', 'cuisine'],axis=1)
print(df_shuffle_train_pca)
print(df_shuffle_test_pca)

train_x,test_x,train_y,test_y = train_test_split(df_shuffle_train_pca, df_shuffle_train.cuisine, random_state=42, test_size=0.30)
print(len(train_x))
print()
print(len(test_x))

       (    oz.) tomato sauce  (   oz.) tomato paste  \
0                           0                    0.0   
1                           0                    0.0   
2                           0                    0.0   
3                           0                    0.0   
4                           0                    0.0   
...                       ...                    ...   
29769                       0                    0.0   
29770                       0                    0.0   
29771                       0                    0.0   
29772                       0                    0.0   
29773                       0                    0.0   

       (10 oz.) frozen chopped spinach  \
0                                  0.0   
1                                  0.0   
2                                  0.0   
3                                  0.0   
4                                  0.0   
...                                ...   
29769                            

In [11]:
# validation--> SVM
from sklearn.svm import SVC

svm_clf_lin = SVC(random_state=42,kernel='linear')
svm_clf_poly = SVC(random_state=42,kernel='poly')
svm_clf_rbf = SVC(random_state=42,kernel='rbf')
svm_clf_sigmoid = SVC(random_state=42,kernel='sigmoid')

svm_clf_lin = svm_clf_lin.fit(train_x,train_y)
svm_clf_poly = svm_clf_poly.fit(train_x,train_y)
svm_clf_rbf = svm_clf_rbf.fit(train_x,train_y)
svm_clf_sigmoid = svm_clf_sigmoid.fit(train_x,train_y)

In [None]:
pred = []
pred.append(svm_clf_lin.predict(test_x))
pred.append(svm_clf_poly.predict(test_x))
pred.append(svm_clf_rbf.predict(test_x))
pred.append(svm_clf_sigmoid.predict(test_x))

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

for i in range(len(pred)):
    print(i)
    fig, ax = plt.subplots(figsize=(15,15))
    ax = sns.heatmap(confusion_matrix(test_y, pred[i]),annot=True,cmap='coolwarm',fmt='d')
    print("recall: ", recall_score(test_y, pred[i], average='micro'))
    print("precision: ", precision_score(test_y, pred[i], average='macro'))
    print("accuracy: ", accuracy_score(test_y, pred[i]))
    print()
    plt.savefig(str(i))
    

In [36]:
sub = svm_clf_rbf.predict(df_shuffle_test_pca)

In [37]:
submit = le.inverse_transform(pd.Series(sub))
print(le.inverse_transform([3, 5, 1, 2]))

submit = pd.concat([pd.Series(df_shuffle_test.id, name='Id'), pd.Series(submit, name='Category')], axis=1)
print(submit.to_csv(index=False))
submit.to_csv("submission.csv", index = False)


['chinese' 'french' 'british' 'cajun_creole']
Id,Category
9356,italian
31575,italian
3603,southern_us
5099,mexican
13393,mexican
33105,vietnamese
18363,italian
10466,southern_us
25480,italian
4444,southern_us
11715,mexican
5766,moroccan
39856,filipino
6548,southern_us
23756,french
7334,southern_us
42644,chinese
47978,mexican
10103,italian
30801,mexican
6729,irish
42628,mexican
20048,cajun_creole
8633,italian
20666,mexican
30385,southern_us
27628,italian
24392,southern_us
37299,mexican
31384,japanese
35328,southern_us
40134,vietnamese
3808,mexican
10098,southern_us
31743,southern_us
30460,southern_us
13492,chinese
10293,chinese
48993,mexican
26671,italian
13373,italian
18795,italian
17332,italian
42217,southern_us
18034,southern_us
39009,chinese
44699,french
11109,mexican
614,mexican
2930,southern_us
35566,italian
13585,chinese
1528,mexican
15593,thai
32751,mexican
40295,chinese
478,cajun_creole
23240,mexican
10355,french
31805,indian
5047,southern_us
27299,french
28988,mexican
48956,fr

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

train_x = train_x.astype("float32")
train_y = train_y.astype("float32")
test_x = test_x.astype("float32")
test_y = test_y.astype("float32")
# Reserve 8933 samples for validation
val_x = train_x[-8933:]
val_y = train_y[-8933:]
train_x = train_x[:-8933]
train_y = train_y[:-8933]

In [None]:
n = ["Adadelta","Adagrad", "Adam", "Ftrl", "SGD", "RMSprop", "Nadam"]
m = [tf.keras.optimizers.Adadelta(0.01), tf.keras.optimizers.Adagrad(0.01), tf.keras.optimizers.Adam(0.01), tf.keras.optimizers.Ftrl(0.01), tf.keras.optimizers.SGD(0.01), tf.keras.optimizers.RMSprop(0.01), tf.keras.optimizers.Nadam(0.01)]
model_all = []

for i in range(len(m)):
    model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(512),
    tf.keras.layers.Dense(128,activation='relu'),
    tf.keras.layers.Dense(20)
    ])

    model.compile(
        optimizer=m[i],
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )

    model.fit(
        train_x.values, train_y.values,
        epochs=20,
        validation_data=(val_x.values, val_y.values)
    )
    model_all.append(model)
    # model.summary()
    y = model.predict(test_x.values)
    pred = []
    for j in range(len(y)):
        pred.append(np.argmax(y[j]))
    # print(pred)

    fig, ax = plt.subplots(figsize=(15,15))
    ax = sns.heatmap(confusion_matrix(test_y, pred),annot=True,cmap='coolwarm',fmt='d')
    print(n[i])
    print("recall: ", recall_score(test_y, pred, average='micro'))
    print("precision: ", precision_score(test_y, pred, average='macro'))
    print("accuracy: ", accuracy_score(test_y, pred))
    print()
    plt.savefig('EMG {0}.jpg'.format(i)) 


In [48]:
y = model_all[1].predict(df_shuffle_test_pca)
pred = []
for j in range(len(y)):
    pred.append(np.argmax(y[j]))

print(pred)


[9, 9, 16, 16, 13, 19, 9, 2, 9, 16, 13, 14, 3, 16, 5, 16, 3, 4, 17, 13, 6, 13, 2, 9, 13, 7, 5, 14, 13, 11, 16, 19, 13, 16, 16, 7, 19, 11, 13, 9, 2, 9, 9, 16, 16, 3, 5, 9, 13, 9, 9, 7, 13, 18, 13, 11, 2, 13, 5, 7, 15, 5, 13, 16, 13, 13, 2, 15, 3, 3, 16, 10, 1, 9, 18, 13, 9, 5, 9, 7, 16, 11, 16, 5, 6, 6, 10, 9, 13, 13, 18, 6, 13, 11, 13, 12, 7, 9, 2, 5, 13, 7, 9, 17, 13, 11, 16, 13, 13, 12, 13, 17, 15, 13, 17, 12, 13, 9, 16, 5, 3, 14, 5, 9, 16, 6, 9, 9, 7, 2, 7, 9, 13, 7, 13, 16, 13, 13, 9, 13, 16, 19, 14, 13, 8, 14, 9, 9, 9, 2, 11, 14, 15, 3, 3, 14, 11, 9, 9, 9, 12, 13, 16, 9, 9, 3, 16, 9, 18, 11, 8, 3, 7, 16, 18, 13, 6, 3, 1, 7, 13, 8, 14, 12, 16, 9, 9, 9, 12, 3, 9, 13, 2, 9, 13, 13, 19, 2, 2, 3, 5, 13, 13, 9, 7, 9, 6, 2, 13, 16, 6, 16, 9, 11, 16, 13, 3, 9, 16, 2, 7, 13, 16, 2, 3, 18, 3, 13, 18, 3, 16, 13, 9, 9, 1, 3, 3, 7, 9, 17, 9, 5, 13, 9, 9, 9, 9, 11, 9, 9, 9, 2, 1, 9, 6, 16, 9, 3, 17, 16, 9, 17, 19, 9, 6, 16, 16, 1, 9, 16, 13, 5, 2, 9, 17, 9, 9, 13, 7, 3, 13, 3, 6, 8, 3, 5, 9, 18

In [50]:
submit_nn = le.inverse_transform(pd.Series(pred))
submit_nn = pd.concat([pd.Series(df_shuffle_test.id, name='Id'), pd.Series(submit_nn, name='Category')], axis=1)
print(submit_nn)
submit_nn.to_csv("submission_nn.csv", index = False)

         Id     Category
0      9356      italian
1     31575      italian
2      3603  southern_us
3      5099  southern_us
4     13393      mexican
...     ...          ...
9995  24775      chinese
9996   7449  southern_us
9997  19211      mexican
9998   8966   vietnamese
9999  29738      italian

[10000 rows x 2 columns]
