In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.offline as pyo

In [2]:
train = pd.read_json("train.json")
test = pd.read_json("test.json")
test.head()

Unnamed: 0,id,ingredients
0,10210,"[barbecue sauce, yellow onion, prepared pizza ..."
1,2310,"[olive oil, stewed tomatoes, hot sauce, chicke..."
2,33213,"[chopped bell pepper, cilantro leaves, white o..."
3,16902,"[tomatoes, purple onion, cumin seed, red lenti..."
4,9056,"[chile powder, lime, blackpepper, ground clove..."


In [3]:
# onehot encoder for ingredients
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
expandedLabelData_train = mlb.fit_transform(train["ingredients"])
labelClasses = mlb.classes_
expandedLabelData_test = mlb.fit_transform(test["ingredients"])
labelClasses_2 = mlb.classes_

expandedLabels_train = pd.DataFrame(expandedLabelData_train, columns=labelClasses)
expandedLabels_test = pd.DataFrame(expandedLabelData_test, columns=labelClasses_2)
print(expandedLabels_train)
print()
print(expandedLabels_test)

       (    oz.) tomato sauce  (   oz.) tomato paste  \
0                           0                      0   
1                           0                      0   
2                           0                      0   
3                           0                      0   
4                           0                      0   
...                       ...                    ...   
29769                       0                      0   
29770                       0                      0   
29771                       0                      0   
29772                       0                      0   
29773                       0                      0   

       (10 oz.) frozen chopped spinach  \
0                                    0   
1                                    0   
2                                    0   
3                                    0   
4                                    0   
...                                ...   
29769                            

In [4]:
# shuffle data
from numpy import nan

def shuffle(df_origin): 
    df = df_origin.sample(frac=1,random_state=113)#,random_state=113
    df2= df.reset_index(drop=True)
    return df2

df = pd.DataFrame(train)
df = pd.concat([df,expandedLabels_train], axis = 1)
df_shuffle_train = shuffle(df)
print(df_shuffle_train)

df = pd.DataFrame(test)
df = pd.concat([df,expandedLabels_test], axis = 1)
df_shuffle_test = shuffle(df)
print(df_shuffle_test)

df_mix = pd.concat([df_shuffle_train, df_shuffle_test])
df_mix= df_mix.replace(nan, 0)
print(df_mix)


          id   cuisine                                        ingredients  \
0      45758   italian  [bread crumbs, ricotta cheese, fresh parsley, ...   
1       2065     greek  [large eggs, salt, dried oregano, plain yogurt...   
2        968    korean  [light brown sugar, dijon mustard, paprika, Go...   
3      30592   mexican  [shredded cheddar cheese, red bell pepper, gre...   
4      44223    indian  [jasmine rice, unsalted butter, grapeseed oil,...   
...      ...       ...                                                ...   
29769   3997    french  [cherries, all-purpose flour, sugar, refrigera...   
29770   4245  japanese  [avocado, crabmeat, sushi rice, nori, soy sauc...   
29771  36461   italian  [parmigiano reggiano cheese, crusty bread, par...   
29772  44094  japanese  [boneless chicken breast, salt, soy sauce, mus...   
29773  14160   italian  [tomato sauce, bacon, onions, butter, bow-tie ...   

       (    oz.) tomato sauce  (   oz.) tomato paste  \
0                  

In [5]:
#PCA for lower down dimensionality
from sklearn.decomposition import PCA
N_COMPONENTS = 2048
pca = PCA(N_COMPONENTS)
pca.fit(df_mix[df_mix.columns[3:]])
df_mix_pca = pd.DataFrame(pca.transform(df_mix[df_mix.columns[3:]]))
df_mix_pca.head()

print(df_mix_pca)

           0         1         2         3         4         5         6     \
0      0.510982  0.088518 -0.672753 -0.158376  0.414495  0.268091 -0.137284   
1      0.274644  0.375971 -0.176782  0.065640  0.075968 -0.475731  0.255792   
2     -0.307932  0.051774  0.877637 -0.059074  0.309773  0.580567 -0.154620   
3     -0.559968 -0.159165 -0.164936 -0.188891 -0.156836 -0.321363 -0.230882   
4     -0.312332 -0.461955  0.375438 -0.446578  0.049873 -0.273498  0.153156   
...         ...       ...       ...       ...       ...       ...       ...   
39769 -0.794928  0.016763  0.615479  0.982461  0.703543  0.028578  0.929613   
39770  0.417789  1.046598  0.052702  0.086276 -0.335018  0.499626 -0.514245   
39771  0.446145  0.307237 -0.438216  0.144716  0.617088  0.304539  0.038731   
39772 -0.515761 -0.015798  0.253565  0.321376 -0.101324  0.153866 -0.802781   
39773 -0.600770 -0.027741 -0.127006 -0.146289 -0.086928 -0.206495 -0.233840   

           7         8         9     ...      2038 

In [6]:
#transform cuisine
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(df_shuffle_train.cuisine)

df_shuffle_train.cuisine = le.transform(df_shuffle_train.cuisine)
print(df_shuffle_train.cuisine)
print(le.classes_)

0         9
1         6
2        12
3        13
4         7
         ..
29769     5
29770    11
29771     9
29772    11
29773     9
Name: cuisine, Length: 29774, dtype: int32
['brazilian' 'british' 'cajun_creole' 'chinese' 'filipino' 'french'
 'greek' 'indian' 'irish' 'italian' 'jamaican' 'japanese' 'korean'
 'mexican' 'moroccan' 'russian' 'southern_us' 'spanish' 'thai'
 'vietnamese']


In [7]:
# train_test_split
from sklearn.model_selection import train_test_split

df_shuffle_train_pca = df_mix_pca.iloc[:29774, :]
df_shuffle_test_pca = df_mix_pca.iloc[29774:39774, :]
print(df_shuffle_train_pca)
print(df_shuffle_test_pca)

train_x,test_x,train_y,test_y = train_test_split(df_shuffle_train_pca, df_shuffle_train.cuisine, random_state=42, test_size=0.30)
print(len(train_x))
print()
print(len(test_x))
"""

df_shuffle_train_pca = df_mix.iloc[:29774, :].drop(['ingredients', 'id', 'cuisine'], axis=1)
df_shuffle_test_pca = df_mix.iloc[29774:39774, :].drop(['ingredients', 'id', 'cuisine'],axis=1)
print(df_shuffle_train_pca)
print(df_shuffle_test_pca)

train_x,test_x,train_y,test_y = train_test_split(df_shuffle_train_pca, df_shuffle_train.cuisine, random_state=42, test_size=0.30)
print(len(train_x))
print()
print(len(test_x))"""

           0         1         2         3         4         5         6     \
0      0.510982  0.088518 -0.672753 -0.158376  0.414495  0.268091 -0.137284   
1      0.274644  0.375971 -0.176782  0.065640  0.075968 -0.475731  0.255792   
2     -0.307932  0.051774  0.877637 -0.059074  0.309773  0.580567 -0.154620   
3     -0.559968 -0.159165 -0.164936 -0.188891 -0.156836 -0.321363 -0.230882   
4     -0.312332 -0.461955  0.375438 -0.446578  0.049873 -0.273498  0.153156   
...         ...       ...       ...       ...       ...       ...       ...   
29769 -0.665545  0.920214 -0.063421 -0.161680 -0.131944  0.567556  0.401429   
29770 -0.671867 -0.091928  0.172037 -0.045338  0.193025 -0.299653 -0.056879   
29771 -0.565840  0.085216 -0.128558 -0.272880 -0.135283 -0.122996 -0.250916   
29772  0.495289  0.003097  0.264597  0.721648 -0.450540 -0.299760  0.982927   
29773  0.664425  0.181223 -0.013030 -0.299707 -0.744751 -0.018418 -0.042312   

           7         8         9     ...      2038 

"\n\ndf_shuffle_train_pca = df_mix.iloc[:29774, :].drop(['ingredients', 'id', 'cuisine'], axis=1)\ndf_shuffle_test_pca = df_mix.iloc[29774:39774, :].drop(['ingredients', 'id', 'cuisine'],axis=1)\nprint(df_shuffle_train_pca)\nprint(df_shuffle_test_pca)\n\ntrain_x,test_x,train_y,test_y = train_test_split(df_shuffle_train_pca, df_shuffle_train.cuisine, random_state=42, test_size=0.30)\nprint(len(train_x))\nprint()\nprint(len(test_x))"

In [28]:
# validation--> SVM
from sklearn.svm import SVC

svm_clf_lin = SVC(random_state=42,kernel='linear')
svm_clf_poly = SVC(random_state=42,kernel='poly')
svm_clf_rbf = SVC(random_state=42,kernel='rbf')
svm_clf_sigmoid = SVC(random_state=42,kernel='sigmoid')

svm_clf_lin = svm_clf_lin.fit(train_x,train_y)
print('1')
svm_clf_poly = svm_clf_poly.fit(train_x,train_y)
print('2')
svm_clf_rbf = svm_clf_rbf.fit(train_x,train_y)
print('3')
svm_clf_sigmoid = svm_clf_sigmoid.fit(train_x,train_y)

1
2
3


In [20]:
pred = []
pred.append(svm_clf_lin.predict(test_x))
print('1')
pred.append(svm_clf_poly.predict(test_x))
print('2')
pred.append(svm_clf_rbf.predict(test_x))
print('3')
pred.append(svm_clf_sigmoid.predict(test_x))

1
2
3


In [11]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

for i in range(len(pred)):
    print(i)
    fig, ax = plt.subplots(figsize=(15,15))
    ax = sns.heatmap(confusion_matrix(test_y, pred[i]),annot=True,cmap='coolwarm',fmt='d')
    print("recall: ", recall_score(test_y, pred[i], average='micro'))
    print("precision: ", precision_score(test_y, pred[i], average='macro'))
    print("accuracy: ", accuracy_score(test_y, pred[i]))
    print()
    plt.savefig(str(i))
    

0


TypeError: Singleton array 13 cannot be considered a valid collection.

In [18]:
sub = svm_clf_lin.predict(df_shuffle_test_pca)

In [19]:
submit = le.inverse_transform(pd.Series(sub))
submit = pd.concat([pd.Series(df_shuffle_test.id, name='Id'), pd.Series(submit, name='Category')], axis=1)
print(submit.to_csv(index=False))
submit.to_csv("submission.csv", index = False)


Id,Category
9356,italian
31575,southern_us
3603,southern_us
5099,brazilian
13393,mexican
33105,thai
18363,italian
10466,southern_us
25480,italian
4444,southern_us
11715,mexican
5766,moroccan
39856,filipino
6548,cajun_creole
23756,french
7334,southern_us
42644,chinese
47978,jamaican
10103,spanish
30801,mexican
6729,irish
42628,mexican
20048,cajun_creole
8633,italian
20666,mexican
30385,southern_us
27628,italian
24392,southern_us
37299,mexican
31384,japanese
35328,southern_us
40134,filipino
3808,mexican
10098,southern_us
31743,southern_us
30460,russian
13492,korean
10293,chinese
48993,mexican
26671,italian
13373,southern_us
18795,italian
17332,italian
42217,mexican
18034,southern_us
39009,chinese
44699,cajun_creole
11109,italian
614,mexican
2930,southern_us
35566,italian
13585,thai
1528,mexican
15593,thai
32751,mexican
40295,chinese
478,cajun_creole
23240,mexican
10355,french
31805,indian
5047,southern_us
27299,french
28988,mexican
48956,southern_us
15679,mexican
32291,southern_us
34061,

In [8]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

train_x = train_x.astype("float32")
train_y = train_y.astype("float32")
test_x = test_x.astype("float32")
test_y = test_y.astype("float32")
# Reserve 8933 samples for validation
val_x = train_x[-8933:]
val_y = train_y[-8933:]
train_x = train_x[:-8933]
train_y = train_y[:-8933]

In [9]:
n = ["Adadelta", "Adagrad", "Adam", "Nadam"]
m = [tf.keras.optimizers.Adadelta(0.01), tf.keras.optimizers.Adagrad(0.01), tf.keras.optimizers.Adam(0.01), tf.keras.optimizers.Nadam(0.01)]
model_all = []

for i in range(len(m)):
    model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(2048),
    tf.keras.layers.Dense(128,activation='relu'),
    tf.keras.layers.Dense(20)
    ])

    model.compile(
        optimizer=m[i],
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )

    model.fit(
        train_x.values, train_y.values,
        epochs=60,
        validation_data=(val_x.values, val_y.values)
    )
    model_all.append(model)
    # model.summary()
    y = model.predict(test_x.values)
    pred = []
    for j in range(len(y)):
        pred.append(np.argmax(y[j]))
    # print(pred)

    fig, ax = plt.subplots(figsize=(15,15))
    ax = sns.heatmap(confusion_matrix(test_y, pred),annot=True,cmap='coolwarm',fmt='d')
    print(n[i])
    print("recall: ", recall_score(test_y, pred, average='micro'))
    print("precision: ", precision_score(test_y, pred, average='macro'))
    print("accuracy: ", accuracy_score(test_y, pred))
    print()
    plt.savefig('EMG {0}.jpg'.format(i)) 


Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


NameError: name 'confusion_matrix' is not defined

In [21]:
y = model_all[1].predict(df_shuffle_test_pca)
pred = []
for j in range(len(y)):
    pred.append(np.argmax(y[j]))

print(pred)


[9, 16, 16, 0, 13, 19, 9, 2, 9, 16, 13, 14, 4, 2, 5, 16, 3, 4, 17, 13, 8, 13, 2, 9, 13, 5, 9, 9, 4, 11, 16, 19, 13, 16, 16, 15, 19, 3, 13, 9, 16, 9, 9, 16, 16, 3, 15, 9, 13, 16, 9, 18, 13, 18, 13, 11, 2, 13, 5, 7, 8, 5, 13, 16, 13, 16, 2, 15, 3, 3, 16, 10, 16, 9, 18, 13, 9, 5, 6, 7, 16, 1, 16, 5, 9, 6, 10, 9, 13, 13, 19, 6, 13, 11, 13, 12, 7, 9, 2, 1, 13, 7, 9, 17, 13, 11, 16, 13, 0, 12, 13, 17, 15, 13, 9, 12, 13, 16, 16, 5, 3, 18, 16, 9, 16, 6, 9, 9, 7, 2, 11, 9, 13, 7, 13, 15, 13, 13, 9, 13, 13, 19, 14, 13, 8, 14, 9, 9, 9, 2, 5, 14, 5, 3, 3, 14, 11, 9, 9, 9, 12, 13, 16, 9, 9, 3, 2, 9, 18, 11, 8, 3, 7, 16, 3, 13, 6, 3, 1, 7, 0, 8, 6, 12, 16, 9, 9, 9, 12, 3, 9, 13, 2, 9, 13, 13, 18, 16, 2, 3, 5, 13, 13, 9, 7, 9, 15, 9, 13, 16, 9, 16, 9, 11, 16, 13, 3, 9, 8, 2, 13, 13, 9, 2, 3, 18, 3, 13, 18, 3, 16, 16, 9, 9, 15, 12, 18, 7, 9, 6, 9, 5, 13, 9, 9, 9, 9, 11, 9, 17, 9, 2, 16, 16, 9, 16, 9, 3, 17, 16, 9, 17, 18, 9, 6, 16, 16, 1, 9, 16, 7, 5, 2, 9, 17, 9, 9, 13, 7, 3, 13, 3, 6, 8, 3, 5, 9, 18

In [22]:
submit_nn = le.inverse_transform(pd.Series(pred))
submit_nn = pd.concat([pd.Series(df_shuffle_test.id, name='Id'), pd.Series(submit_nn, name='Category')], axis=1)
print(submit_nn)
submit_nn.to_csv("submission_nn.csv", index = False)

         Id     Category
0      9356      italian
1     31575  southern_us
2      3603  southern_us
3      5099    brazilian
4     13393      mexican
...     ...          ...
9995  24775      chinese
9996   7449      italian
9997  19211      mexican
9998   8966  southern_us
9999  29738      italian

[10000 rows x 2 columns]


In [8]:
# use talos for hyperparameters optimization
import talos
from talos.utils import lr_normalizer

In [49]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Dense

def cuisine_model(x_train, y_train, x_val, y_val, params):
    
    model = Sequential()                            
    model.add(Dense(params['first_neuron'],
                    input_shape=(2048,),
                    activation='relu'))
    
    model.add(Dropout(params['dropout']))
    model.add(Dense(20, activation=params['last_activation']))
    model.summary()
    model.compile(optimizer=params['optimizer'](lr=lr_normalizer(params['lr'], params['optimizer'])),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

    out = model.fit(x_train, y_train,
                    batch_size=params['batch_size'],
                    epochs=params['epochs'],
                    verbose=0,
                    validation_data=(x_val, y_val))
    
    return out, model

In [47]:
# parameter tester
from tensorflow.keras.optimizers import Adadelta, Nadam, Adagrad
from tensorflow.keras.activations import softmax, hard_sigmoid
from tensorflow.keras.losses import categorical_crossentropy, logcosh
import tensorflow as tf
p = {'lr': (0.01, 10, 10),
     'first_neuron':[256, 128, 64, 32],
     'batch_size': [2, 3],
     'epochs': [40],
     'dropout': (0, 0.40, 10),
     'optimizer': [Adadelta, Nadam, Adagrad],
     'loss': ['categorical_crossentropy'],
     'last_activation': ['softmax', 'hard_sigmoid'],
     'weight_regulizer': [None]}

In [48]:
# scan
scan_object = talos.Scan(train_x.values,
                         train_y.values, 
                         params=p,
                         model=cuisine_model,
                         experiment_name='cuisine',
                         fraction_limit=.001)









  0%|          | 0/4 [00:00<?, ?it/s][A[A[A[A[A[AModel: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_11 (Dense)             (None, 256)               524544    
_________________________________________________________________
dropout_5 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 20)                5140      
Total params: 529,684
Trainable params: 529,684
Non-trainable params: 0
_________________________________________________________________


ValueError: in user code:

    C:\Users\amy58\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:1224 test_function  *
        return step_function(self, iterator)
    C:\Users\amy58\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:1215 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\amy58\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\amy58\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\amy58\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\amy58\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:1208 run_step  **
        outputs = model.test_step(data)
    C:\Users\amy58\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:1174 test_step
        y_pred = self(x, training=False)
    C:\Users\amy58\anaconda3\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:976 __call__
        self.name)
    C:\Users\amy58\anaconda3\lib\site-packages\tensorflow\python\keras\engine\input_spec.py:158 assert_input_compatibility
        ' input tensors. Inputs received: ' + str(inputs))

    ValueError: Layer sequential_6 expects 1 inputs, but it received 2 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 2048) dtype=float32>, <tf.Tensor 'ExpandDims:0' shape=(None, 1) dtype=int32>]


In [None]:
# accessing the results data frame
scan_object.data.head()

# accessing epoch entropy values for each round
scan_object.learning_entropy

# access the summary details
scan_object.details

In [None]:
# accessing the saved models
scan_object.saved_models

# accessing the saved weights for models
scan_object.saved_weights

In [None]:
# use Scan object as input
analyze_object = talos.Analyze(scan_object)

In [None]:
# access the dataframe with the results
analyze_object.data

In [None]:
# get the number of rounds in the Scan
analyze_object.rounds()

# get the highest result for any metric
analyze_object.high('val_acc')

# get the round with the best result
analyze_object.rounds2high('val_acc')

# get the best paramaters
analyze_object.best_params('val_acc', ['acc', 'loss', 'val_loss'])

# get correlation for hyperparameters against a metric
analyze_object.correlate('val_loss', ['acc', 'loss', 'val_loss'])

In [None]:
# a regression plot for two dimensions 
analyze_object.plot_regs('val_acc', 'val_loss')

# line plot
analyze_object.plot_line('val_acc')

# up to two dimensional kernel density estimator
analyze_object.plot_kde('val_acc')

# a simple histogram
analyze_object.plot_hist('val_acc', bins=50)

# heatmap correlation
analyze_object.plot_corr('val_loss', ['acc', 'loss', 'val_loss'])

# a four dimensional bar grid
analyze_object.plot_bars('batch_size', 'val_acc', 'first_neuron', 'lr')

In [None]:
talos.Deploy(scan_object=scan_object, model_name='cuisine_deploy', metric='val_acc')
iris = talos.Restore('cuisine_deploy.zip')

In [None]:
# make predictions with the model
iris.model.predict(test_x)
# get the meta-data for the experiment
iris.details

In [None]:
# get the hyperparameter space boundary
iris.params

In [None]:
# the results dataframe
iris.results