# Model Analysis

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import ast
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, concatenate, Embedding, Flatten
from tensorflow.keras.models import Model, load_model
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split 

2024-10-23 11:24:45.742869: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


#### Import the model

In [2]:
# Load the previously trained network saved as model_binary_crossentropy1
binary_model_path = '../Models/model_binary_crossentropy1.keras'
model = load_model(binary_model_path)

#### Import the data and prepare the test data

In [3]:
# Read the preprocessed network input table; the first CSV column becomes the index
input_csv_path = '../Data/Preprocessed/NN_Input_Data.csv'
merged_df = pd.read_csv(input_csv_path, index_col=0)

In [4]:
# Preview the first five rows as a quick sanity check on the import
merged_df.head(n=5)

Unnamed: 0,user_id,recipe_id,rating,minutes,contributor_id,tags,n_steps,ingredients,n_ingredients,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,38094,40893,4,495,1533,weeknight time-to-make course main-ingredient ...,4,great northern beans yellow onion diced green ...,9,204.8,5.0,9.0,26.0,24.0,2.0,10.0
1,1293707,40893,5,495,1533,weeknight time-to-make course main-ingredient ...,4,great northern beans yellow onion diced green ...,9,204.8,5.0,9.0,26.0,24.0,2.0,10.0
2,8937,44394,4,20,56824,30-minutes-or-less time-to-make course main-in...,5,devil's food cake mix vegetable oil eggs reese...,4,132.3,11.0,39.0,5.0,4.0,11.0,5.0
3,126440,85009,5,10,64342,15-minutes-or-less time-to-make course main-in...,3,mayonnaise salsa cheddar cheese refried beans ...,13,2786.2,342.0,134.0,290.0,161.0,301.0,42.0
4,57222,85009,5,10,64342,15-minutes-or-less time-to-make course main-in...,3,mayonnaise salsa cheddar cheese refried beans ...,13,2786.2,342.0,134.0,290.0,161.0,301.0,42.0


In [27]:
# Baseline to beat: the accuracy obtained by predicting a rating of 5 for every row,
# i.e. the fraction of rows whose rating equals 5.
(merged_df['rating'] == 5).mean()

0.7209358803285507

In [5]:
# Input feature columns passed on to the split below
feature_columns = [
    'user_id', 'recipe_id', 'minutes', 'contributor_id', 'tags',
    'n_steps', 'ingredients', 'n_ingredients', 'calories', 'total_fat',
    'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates',
]
X = merged_df[feature_columns]

# Target: the raw rating column
y = merged_df['rating']

# Hold out 30% of the rows as the test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

In [6]:
# Derive a binary target: 1 when the rating is exactly 5, otherwise 0
is_five_star = merged_df['rating'].eq(5)
merged_df['binary_rating'] = is_five_star.astype('int')

y_binary = merged_df['binary_rating']

# Split again with the binary target, using the same test_size and random_state
# as the split on the raw rating. X_train / X_test are re-assigned here.
X_train, X_test, y_train_b, y_test_b = train_test_split(
    X, y_binary, test_size=0.3, random_state=123
)

In [7]:
# Force the two free-text columns to str in both splits before tokenizing
for frame in (X_train, X_test):
    for text_col in ('tags', 'ingredients'):
        frame[text_col] = frame[text_col].astype(str)

In [8]:
# Turn the tags and ingredients text columns into padded integer sequences.
# Each tokenizer is fitted on the training split only and then applied to both splits.

# --- tags ---
tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(X_train['tags'])
tag_train_sequences = tag_tokenizer.texts_to_sequences(X_train['tags'])
tag_test_sequences = tag_tokenizer.texts_to_sequences(X_test['tags'])

# --- ingredients ---
ingredient_tokenizer = Tokenizer()
ingredient_tokenizer.fit_on_texts(X_train['ingredients'])
ingredient_train_sequences = ingredient_tokenizer.texts_to_sequences(X_train['ingredients'])
ingredient_test_sequences = ingredient_tokenizer.texts_to_sequences(X_test['ingredients'])

# Pad every sequence (at the end) to the longest sequence seen in the training split,
# so that train and test arrays share one width per text column.
max_tag_length = max(map(len, tag_train_sequences))
max_ingredient_length = max(map(len, ingredient_train_sequences))

tag_train_padded = pad_sequences(
    tag_train_sequences, maxlen=max_tag_length, padding='post'
)
tag_test_padded = pad_sequences(
    tag_test_sequences, maxlen=max_tag_length, padding='post'
)

ingredient_train_padded = pad_sequences(
    ingredient_train_sequences, maxlen=max_ingredient_length, padding='post'
)
ingredient_test_padded = pad_sequences(
    ingredient_test_sequences, maxlen=max_ingredient_length, padding='post'
)

In [9]:
# Renumber user_id, recipe_id and contributor_id as consecutive integers starting at 0,
# in order of first appearance in X, to reduce the overhead of the following steps.
# NOTE: the ID columns of X_train / X_test are overwritten in place, so re-running this
# cell without re-running the split first would push already-renumbered IDs through the
# original-ID lookups.
unique_users = X['user_id'].unique()
user_id_mapping = dict(zip(unique_users, range(len(unique_users))))
X_train['user_id'] = X_train['user_id'].map(user_id_mapping)
X_test['user_id'] = X_test['user_id'].map(user_id_mapping)

unique_recipes = X['recipe_id'].unique()
recipe_id_mapping = dict(zip(unique_recipes, range(len(unique_recipes))))
X_train['recipe_id'] = X_train['recipe_id'].map(recipe_id_mapping)
X_test['recipe_id'] = X_test['recipe_id'].map(recipe_id_mapping)

unique_contributors = X['contributor_id'].unique()
contributor_id_mapping = dict(zip(unique_contributors, range(len(unique_contributors))))
X_train['contributor_id'] = X_train['contributor_id'].map(contributor_id_mapping)
X_test['contributor_id'] = X_test['contributor_id'].map(contributor_id_mapping)

In [10]:
# Pull each model input out of the test split as a NumPy array:
# the three renumbered ID columns, the nutrition block, and the prep block.
nutrition_columns = ['calories', 'total_fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates']
prep_columns = ['n_steps', 'n_ingredients']

X_user_test = X_test['user_id'].to_numpy()
X_recipe_test = X_test['recipe_id'].to_numpy()
X_contributor_test = X_test['contributor_id'].to_numpy()
X_nutrition_test = X_test[nutrition_columns].to_numpy()
X_prep_test = X_test[prep_columns].to_numpy()

#### Evaluate the model

In [11]:
# Score the binary model on the held-out split against the binary target
binary_model_inputs = [
    X_user_test,
    X_recipe_test,
    X_contributor_test,
    X_nutrition_test,
    X_prep_test,
    tag_test_padded,
    ingredient_test_padded,
]
model.evaluate(binary_model_inputs, y_test_b)

[1m10616/10616[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.7089 - loss: 0.5794


[0.5786146521568298, 0.7097886204719543]

#### Import the other models and evaluate them

In [12]:
# Load the two additional saved models that are evaluated against the raw rating
model2_path = '../Models/model_crossentropy4.keras'
model3_path = '../Models/model_crossentropy6.keras'

model2 = load_model(model2_path)
model3 = load_model(model3_path)

In [13]:
# Score model2 on the held-out split against the raw rating target
model2_inputs = [
    X_user_test,
    X_recipe_test,
    X_contributor_test,
    X_nutrition_test,
    X_prep_test,
    tag_test_padded,
    ingredient_test_padded,
]
model2.evaluate(model2_inputs, y_test)

[1m10616/10616[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.6916 - loss: 0.8968


[0.8947456479072571, 0.6924238801002502]

In [14]:
# Score model3 on the held-out split against the raw rating target.
# Unlike the other two models, this call passes no nutrition input
# (presumably model3 was built without it -- verify against the training notebook).
model3_inputs = [
    X_user_test,
    X_recipe_test,
    X_contributor_test,
    X_prep_test,
    tag_test_padded,
    ingredient_test_padded,
]
model3.evaluate(model3_inputs, y_test)

[1m10616/10616[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.6636 - loss: 0.9174


[0.9162075519561768, 0.6645030379295349]

#### Plot loss and accuracy metrics from each model

In [15]:
# # Import the history from each model
# NOTE(review): this cell is disabled. The history files below are named
# crossentropy1-3, while the models loaded in this notebook are
# model_binary_crossentropy1, model_crossentropy4 and model_crossentropy6 --
# confirm the file names before re-enabling.
# crossentropy1_df = pd.read_csv('../Models/Model_Metrics/model_crossentropy1_history.csv', index_col=0)
# crossentropy2_df = pd.read_csv('../Models/Model_Metrics/model_crossentropy2_history.csv', index_col=0)
# crossentropy3_df = pd.read_csv('../Models/Model_Metrics/model_crossentropy3_history.csv', index_col=0)

In [16]:
# # Merge into a single df
# merged_history = pd.concat([crossentropy1_df, crossentropy2_df, crossentropy3_df], keys = ['model_crossentropy1', 'model_crossentropy2', 'model_crossentropy3'])
# merged_history.head()

In [17]:
# NOTE(review): this cell is disabled. If it is re-enabled, the `model = [...]` line
# below rebinds `model`, the name that holds the loaded Keras model earlier in this
# notebook; rename the list (e.g. `model_names`) first.
# import matplotlib.lines as mlines

# # Plot accuracy vs. epoch for each model
# epochs = [merged_history.index[i][1] + 1 for i in range(0,len(merged_history))]
# model = [merged_history.index[i][0] for i in range(0,len(merged_history))]
# _ = sns.lineplot(data= merged_history, x = epochs, y = 'accuracy', hue = model)
# _ = sns.lineplot(data= merged_history, x = epochs, y = 'val_accuracy', hue = model, linestyle = '--', legend = False)
# plt.title('Training and Validation Accuracy')
# plt.xlabel('Epoch')
# plt.ylabel('Accuracy')

# # Create lines to add to the legend
# training_line = mlines.Line2D([], [], color='black', linestyle='-', label='Training')
# validation_line = mlines.Line2D([], [], color='black', linestyle='--', label='Validation')

# # Add labels to the legend
# handles, labels = plt.gca().get_legend_handles_labels()

# # Append the custom training and validation handles
# handles.extend([training_line, validation_line])
# labels.extend(['Training', 'Validation'])
# plt.legend(handles = handles, labels = labels)

# plt.show()

In [18]:
# # Plot loss function vs. epoch for each model
# NOTE(review): this cell is disabled. Before re-enabling: (1) `model = [...]` below
# rebinds `model`, the name that holds the loaded Keras model earlier in this notebook;
# (2) `mlines` is only imported in the accuracy-plot cell, so that import must run first;
# (3) the y-axis label previously read 'Accuracy' although this plot shows loss.
# epochs = [merged_history.index[i][1] + 1 for i in range(0,len(merged_history))]
# model = [merged_history.index[i][0] for i in range(0,len(merged_history))]

# _ = sns.lineplot(data= merged_history, x = epochs, y = 'loss', hue = model)
# _ = sns.lineplot(data= merged_history, x = epochs, y = 'val_loss', hue = model, linestyle = '--', legend = False)
# plt.title('Training and Validation Loss')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')

# # Create lines to add to the legend
# training_line = mlines.Line2D([], [], color='black', linestyle='-', label='Training')
# validation_line = mlines.Line2D([], [], color='black', linestyle='--', label='Validation')

# # Add labels to the legend
# handles, labels = plt.gca().get_legend_handles_labels()

# # Append the custom training and validation handles
# handles.extend([training_line, validation_line])
# labels.extend(['Training', 'Validation'])
# plt.legend(handles = handles, labels = labels)

# plt.show()

#### Plot the predicted ratings vs. actual ratings

In [19]:
# NOTE(review): this cell is disabled. The input list below omits X_prep_test, whereas the
# model.evaluate call earlier in this notebook passes it -- confirm the expected inputs
# before re-enabling.
# y_pred = model.predict([X_user_test, X_recipe_test, X_contributor_test, X_nutrition_test, tag_test_padded, ingredient_test_padded])

In [20]:
# y_pred_rounded = np.round(y_pred).clip(1, 5)

In [21]:
# plt.scatter(y_test, y_pred, alpha=0.5)
# plt.xlabel('True Ratings')
# plt.ylabel('Predicted Ratings')
# plt.title('True vs. Predicted Ratings')
# plt.show()