In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.offline as pyo

In [3]:
train = pd.read_json("train.json")
test = pd.read_json("test.json")
# test.describe()

In [4]:
# onehot encoder for ingredients
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
expandedLabelData_train = mlb.fit_transform(train["ingredients"])
labelClasses = mlb.classes_
expandedLabelData_test = mlb.fit_transform(test["ingredients"])
labelClasses_2 = mlb.classes_

expandedLabels_train = pd.DataFrame(expandedLabelData_train, columns=labelClasses)
expandedLabels_test = pd.DataFrame(expandedLabelData_test, columns=labelClasses_2)
print(expandedLabels_train)
print()
print(expandedLabels_test)

       (    oz.) tomato sauce  (   oz.) tomato paste  \
0                           0                      0   
1                           0                      0   
2                           0                      0   
3                           0                      0   
4                           0                      0   
...                       ...                    ...   
29769                       0                      0   
29770                       0                      0   
29771                       0                      0   
29772                       0                      0   
29773                       0                      0   

       (10 oz.) frozen chopped spinach  \
0                                    0   
1                                    0   
2                                    0   
3                                    0   
4                                    0   
...                                ...   
29769                            

In [15]:
# shuffle data
from numpy import nan

def shuffle(df_origin): 
    df = df_origin.sample(frac=1,random_state=113)#,random_state=113
    df2= df.reset_index(drop=True)
    return df2

df = pd.DataFrame(train.cuisine)
df = pd.concat([df,expandedLabels_train], axis = 1)
df_shuffle_train = shuffle(df)
# print(df_shuffle_train)

df = pd.DataFrame(test.id)
df = pd.concat([df,expandedLabels_test], axis = 1)
df_shuffle_test = shuffle(df)
# print(df_shuffle_test)

df_mix = pd.concat([df_shuffle_train, expandedLabels_test])
df_mix= df_mix.replace(nan, 0)
print(df_mix)


      cuisine  (    oz.) tomato sauce  (   oz.) tomato paste  \
0     italian                       0                    0.0   
1       greek                       0                    0.0   
2      korean                       0                    0.0   
3     mexican                       0                    0.0   
4      indian                       0                    0.0   
...       ...                     ...                    ...   
9995        0                       0                    0.0   
9996        0                       0                    0.0   
9997        0                       0                    0.0   
9998        0                       0                    0.0   
9999        0                       0                    0.0   

      (10 oz.) frozen chopped spinach  \
0                                 0.0   
1                                 0.0   
2                                 0.0   
3                                 0.0   
4                         

In [17]:
#PCA for lower down dimensionality
from sklearn.decomposition import PCA
N_COMPONENTS = 512
pca = PCA(N_COMPONENTS)
pca.fit(df_mix[df_mix.columns[1:]])
df_mix_pca = pd.DataFrame(pca.transform(df_mix[df_mix.columns[1:]]))
df_mix_pca.head()

print(df_mix_pca)

            0         1         2         3         4         5         6    \
0      0.510982  0.088518 -0.672753 -0.158376  0.414495  0.268091 -0.137284   
1      0.274644  0.375971 -0.176782  0.065640  0.075968 -0.475731  0.255792   
2     -0.307932  0.051774  0.877637 -0.059074  0.309773  0.580567 -0.154620   
3     -0.559968 -0.159165 -0.164936 -0.188891 -0.156836 -0.321363 -0.230882   
4     -0.312332 -0.461955  0.375438 -0.446578  0.049873 -0.273498  0.153156   
...         ...       ...       ...       ...       ...       ...       ...   
39769  1.127618 -0.224268  0.321665 -0.629662  0.337780  0.692904  0.198732   
39770 -0.572114  0.109697  0.257995  1.117852  0.099366  0.394349 -0.163617   
39771 -0.604680  0.307465  0.433276  0.497503  0.069914  0.518798 -0.510497   
39772  0.352936  1.131235  0.360329 -0.672114  0.275627  0.570124  0.518016   
39773  0.765485  0.219677  0.161879 -0.892248  0.255787 -0.174911 -0.323245   

            7         8         9    ...       502 

In [18]:
#transform cuisine
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(df_shuffle_train.cuisine)

df_shuffle_train.cuisine = le.transform(df_shuffle_train.cuisine)
print(df_shuffle_train.cuisine)

0         9
1         6
2        12
3        13
4         7
         ..
29769     5
29770    11
29771     9
29772    11
29773     9
Name: cuisine, Length: 29774, dtype: int32


In [21]:
# train_test_split
from sklearn.model_selection import train_test_split

df_shuffle_train_pca = df_mix.iloc[:29774, :]
df_shuffle_test_pca = df_mix.iloc[29773:-1, :]
print(df_shuffle_train_pca)
print(df_shuffle_test_pca)

train_x,test_x,train_y,test_y = train_test_split(df_shuffle_train_pca.drop(['cuisine'],axis = 1), df_shuffle_train.cuisine, random_state=42, test_size=0.30)
print(len(train_x))
print()
print(len(test_x))

        cuisine  (    oz.) tomato sauce  (   oz.) tomato paste  \
0       italian                       0                    0.0   
1         greek                       0                    0.0   
2        korean                       0                    0.0   
3       mexican                       0                    0.0   
4        indian                       0                    0.0   
...         ...                     ...                    ...   
29769    french                       0                    0.0   
29770  japanese                       0                    0.0   
29771   italian                       0                    0.0   
29772  japanese                       0                    0.0   
29773   italian                       0                    0.0   

       (10 oz.) frozen chopped spinach  \
0                                  0.0   
1                                  0.0   
2                                  0.0   
3                                  0.0 

In [8]:
# validation--> SVM
from sklearn.svm import SVC

svm_clf_lin = SVC(random_state=42,kernel='linear')
svm_clf_poly = SVC(random_state=42,kernel='poly')
svm_clf_rbf = SVC(random_state=42,kernel='rbf')
svm_clf_sigmoid = SVC(random_state=42,kernel='sigmoid')

svm_clf_lin = svm_clf_lin.fit(train_x,train_y)
svm_clf_poly = svm_clf_poly.fit(train_x,train_y)
svm_clf_rbf = svm_clf_rbf.fit(train_x,train_y)
svm_clf_sigmoid = svm_clf_sigmoid.fit(train_x,train_y)

In [None]:
pred = []
pred.append(svm_clf_lin.predict(test_x))
pred.append(svm_clf_poly.predict(test_x))
pred.append(svm_clf_rbf.predict(test_x))
pred.append(svm_clf_sigmoid.predict(test_x))

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

for i in range(len(pred)):
    print(i)
    fig, ax = plt.subplots(figsize=(15,15))
    ax = sns.heatmap(confusion_matrix(test_y, pred[i]),annot=True,cmap='coolwarm',fmt='d')
    print("recall: ", recall_score(test_y, pred[i], average='micro'))
    print("precision: ", precision_score(test_y, pred[i], average='macro'))
    print("accuracy: ", accuracy_score(test_y, pred[i]))
    print()
    plt.savefig(str(i))
    

In [None]:
submit = le.inverse_transform(pd.Series(pred[2]))
submit = pd.concat([pd.Series(df_shuffle_test.id, name='Id'), pd.Series(submit, name='Category')], axis=1)
print(submit.to_csv(index=False))
submit.to_csv("submission.csv", index = False)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

train_x = train_x.astype("float32")
train_y = train_y.astype("float32")
test_x = test_x.astype("float32")
test_y = test_y.astype("float32")
# Reserve 8933 samples for validation
val_x = train_x[-8933:]
val_y = train_y[-8933:]
train_x = train_x[:-8933]
train_y = train_y[:-8933]

In [None]:
n = ["Adadelta","Adagrad", "Adam", "Ftrl", "SGD", "RMSprop", "Nadam"]
m = [tf.keras.optimizers.Adadelta(0.01), tf.keras.optimizers.Adagrad(0.01), tf.keras.optimizers.Adam(0.001), tf.keras.optimizers.Ftrl(0.01), tf.keras.optimizers.SGD(0.01), tf.keras.optimizers.RMSprop(0.01), tf.keras.optimizers.Nadam(0.01)]

for i in range(len(m)):
    model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(512),
    tf.keras.layers.Dense(128,activation='relu'),
    tf.keras.layers.Dense(20)
    ])

    model.compile(
        optimizer=m[i],
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )

    model.fit(
        train_x.values, train_y.values,
        epochs=20,
        validation_data=(val_x.values, val_y.values)
    )

    # model.summary()
    y = model.predict(test_x.values)
    pred = []
    for j in range(len(y)):
        pred.append(np.argmax(y[j]))
    # print(pred)

    fig, ax = plt.subplots(figsize=(15,15))
    ax = sns.heatmap(confusion_matrix(test_y, pred),annot=True,cmap='coolwarm',fmt='d')
    print(n[i])
    print("recall: ", recall_score(test_y, pred, average='micro'))
    print("precision: ", precision_score(test_y, pred, average='macro'))
    print("accuracy: ", accuracy_score(test_y, pred))
    print()
    plt.savefig('EMG {0}.jpg'.format(i)) 


In [None]:
submit_nn = le.inverse_transform(pd.Series(pred[2]))
submit_nn = pd.concat([pd.Series(df_shuffle_test.id, name='Id'), pd.Series(submit_nn, name='Category')], axis=1)
print(submit_nn)
submit_nn.to_csv("submission_nn.csv", index = False)