![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png)

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, OrdinalEncoder
import scipy.cluster.hierarchy as sch
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Show every column when a DataFrame is displayed (None removes the column cap).
# The previous get_option() call only read the setting and discarded the result.
pd.set_option("display.max_columns", None)
from sklearn.impute import SimpleImputer

# Loading CSV File

In [None]:
# Read the recipe table from the raw-data folder into a DataFrame.
recipes = pd.read_csv(filepath_or_buffer='../data/raw/recipes_one_clean.csv')

In [None]:
# Bare expression: in a notebook this renders the first rows of `recipes` as the cell output.
recipes.head()

# Clustering

## Clustering Time + Difficulty + Cost

In [None]:
# Integer-encode the raw columns into their '<name>_encoded' counterparts.
# NOTE(review): one LabelEncoder instance is re-fitted for every column, so
# after the loop it only holds the classes of the last column ('meal_class').
label_encoder = LabelEncoder()
encoded_from_raw = {
    'cost_encoded': 'cost',
    'time_encoded': 'time(min)',
    'difficulty_encoded': 'difficulty',
    'meal_class_encoded': 'meal_class',
}
for encoded_name, raw_name in encoded_from_raw.items():
    recipes[encoded_name] = label_encoder.fit_transform(recipes[raw_name])

# Standardize the four encoded columns in place.
scaler = StandardScaler()
encoded_columns = list(encoded_from_raw)
recipes[encoded_columns] = scaler.fit_transform(recipes[encoded_columns])

# Persist the fitted scaler.
with open('../scalers/cluster_scaler.pkl', 'wb') as scaler_out:
    pickle.dump(scaler, scaler_out)

# Persist the encoder in its final state (fitted on 'meal_class').
with open('../encoders/encoder.pkl', 'wb') as encoder_out:
    pickle.dump(label_encoder, encoder_out)

In [None]:
# Encoded/scaled columns to evaluate; each one is clustered on its own.
features = ['time_encoded', 'difficulty_encoded', 'cost_encoded', 'meal_class_encoded']

# Candidate numbers of clusters.
cluster_range = range(2, 20)

# feature name -> list of silhouette scores, one per entry of cluster_range
silhouette_scores = {}

for feature in features:
    # Single-column frame for this feature; it is the same for every k.
    feature_values = recipes[[feature]]
    silhouette_scores[feature] = []

    for num_clusters in cluster_range:
        kmeans = KMeans(n_clusters=num_clusters, random_state=24, n_init=50)
        # NOTE(review): 'cluster' is overwritten on every fit, so only the
        # labels of the very last fit remain in `recipes` after this cell.
        recipes['cluster'] = kmeans.fit_predict(feature_values)
        silhouette = silhouette_score(feature_values, recipes['cluster'])
        silhouette_scores[feature].append(silhouette)

# Plot the silhouette curve of every feature on one figure.
plt.figure(figsize=(8, 6))

for feature in features:
    plt.plot(cluster_range, silhouette_scores[feature], marker='o', linestyle='-', label=feature)

plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score for Optimal Cluster Number')
plt.legend()
plt.grid(True)
plt.show()

# For each feature pick the cluster count with the highest silhouette score.
# The best position is looked up in cluster_range itself, so the result stays
# correct if the range's start or step is ever changed.
optimal_num_clusters = {
    feature: cluster_range[int(np.argmax(silhouette_scores[feature]))]
    for feature in features
}

# Print the optimal number of clusters for each feature
print("Optimal Number of Clusters (Time Labels):", optimal_num_clusters['time_encoded'])
print("Optimal Number of Clusters (Difficulty):", optimal_num_clusters['difficulty_encoded'])
print("Optimal Number of Clusters (Cost):", optimal_num_clusters['cost_encoded'])
print("Optimal Number of Clusters (Meal Class):", optimal_num_clusters['meal_class_encoded'])

## Agglomerative Clustering

In [None]:
# One-hot encode 'meal_class' into a dense array.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed the
# old keyword, so try the new name first and fall back for older releases.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
meal_class_encoded = encoder.fit_transform(recipes[['meal_class']])

# Hierarchical (Ward) clustering of the one-hot rows, using the cluster count
# chosen for 'meal_class_encoded' by the silhouette search.
num_clusters = optimal_num_clusters['meal_class_encoded']
clustering = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
meal_class_clusters = clustering.fit_predict(meal_class_encoded)

# Build one string label per recipe:
# "<time>_<difficulty>_<cost>_<meal-class cluster>".
recipes['combined_clusters'] = (
    recipes['time_encoded'].astype(str)
    + "_" + recipes['difficulty_encoded'].astype(str)
    + "_" + recipes['cost_encoded'].astype(str)
    + "_" + meal_class_clusters.astype(str)
)

# Give every distinct label an integer id, numbered in order of first appearance.
cluster_label_to_int = {label: idx for idx, label in enumerate(recipes['combined_clusters'].unique())}

# Map the combined cluster labels to integers and create a new column
recipes['combined_clusters_int'] = recipes['combined_clusters'].map(cluster_label_to_int)

# Save the fitted one-hot encoder to a file using pickle
encoder_filename = '../encoders/onehot_encoder_agglomerative.pkl'
with open(encoder_filename, 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

In [None]:
# Bare expression: in a notebook this renders `recipes` as the cell output.
recipes

In [None]:
# Count how many distinct integer cluster ids were assigned and report it.
combined_ids = recipes['combined_clusters_int']
num_unique_clusters = combined_ids.nunique()
print("Number of unique combined integer clusters:", num_unique_clusters)

# Encoding for Cosine Similarity Inside Each Meal Class

## Vectorizing and Finding Cosine Similarity in Ingredients Combined

In [None]:
# Represent every recipe's ingredient text as a TF-IDF vector (sparse matrix,
# one row per recipe in the same order as `recipes`).
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(recipes['ingredients_combined'])

# Start from an all-NaN float column; each meal class fills in its rows below.
recipes['ingredients_cosine'] = np.nan

# For each meal class, compare every recipe of the class with the class's
# mean TF-IDF vector.
for meal_class in recipes['meal_class'].unique():
    in_class = (recipes['meal_class'] == meal_class).to_numpy()
    # Row *positions* (not index labels) so the lookup into the matrix is
    # correct whatever index `recipes` carries.
    positions = np.flatnonzero(in_class)
    class_vectors = tfidf_matrix[positions]

    # The sparse mean comes back as a matrix; turn it into a (1, n_terms) array.
    centroid = np.asarray(class_vectors.mean(axis=0)).reshape(1, -1)

    # The sparse rows are passed straight to cosine_similarity, so the full
    # matrix never has to be converted to a dense array.
    similarities = cosine_similarity(class_vectors, centroid)
    recipes.loc[in_class, 'ingredients_cosine'] = similarities.ravel()

recipes.columns

## Vectorizing and Finding Cosine Similarity in Preparations

In [None]:
# Re-fit the existing TF-IDF vectorizer on the preparation text (sparse
# matrix, one row per recipe in the same order as `recipes`).
tfidf_matrix = vectorizer.fit_transform(recipes['preparations'])

# Start from an all-NaN float column; each meal class fills in its rows below.
recipes['preparations_cosine'] = np.nan

# For each meal class, compare every recipe of the class with the class's
# mean TF-IDF vector.
for meal_class in recipes['meal_class'].unique():
    in_class = (recipes['meal_class'] == meal_class).to_numpy()
    # Row *positions* (not index labels) so the lookup into the matrix is
    # correct whatever index `recipes` carries.
    positions = np.flatnonzero(in_class)
    class_vectors = tfidf_matrix[positions]

    # The sparse mean comes back as a matrix; turn it into a (1, n_terms) array.
    centroid = np.asarray(class_vectors.mean(axis=0)).reshape(1, -1)

    # The sparse rows are passed straight to cosine_similarity, so the full
    # matrix never has to be converted to a dense array.
    similarities = cosine_similarity(class_vectors, centroid)
    recipes.loc[in_class, 'preparations_cosine'] = similarities.ravel()

recipes.columns

## Scaling and Finding Cosine Similarity in Time(Min)

In [None]:
# Step 1: Standardize 'time(min)' across all recipes using StandardScaler
scaler = StandardScaler()
recipes['time(min)_scaled'] = scaler.fit_transform(recipes[['time(min)']])

# Step 2: Average *standardized* time per meal class. Taking the mean of the
# scaled column keeps both sides of the comparison in the same units; the
# raw-minutes average must not be compared against a standardized value.
avg_time_by_class = recipes.groupby('meal_class')['time(min)_scaled'].mean()

# Broadcast each recipe's class average back onto its row.
class_avg_scaled = recipes['meal_class'].map(avg_time_by_class)

# Step 3: Cosine similarity of each recipe against its meal class average.
# NOTE: for two one-element vectors, cosine similarity a*b / (|a|*|b|) reduces
# to whether the two values share the same sign.
recipes['time_cosine'] = [
    cosine_similarity([[scaled_time]], [[class_avg]])[0][0]
    for scaled_time, class_avg in zip(recipes['time(min)_scaled'], class_avg_scaled)
]

# The scaled column was only needed for the calculation above.
recipes.drop(columns=['time(min)_scaled'], inplace=True)

# Save the scaler using pickle
with open('../scalers/scalers_time.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

recipes.columns

## Scaling and Finding Cosine Similarity in Cost

In [None]:
# Numeric codes for the cost labels (presumably ordered cheapest to most
# expensive -- confirm against the data).
cost_mapping = {'Económico': 1, 'Médio': 2, 'Dispendioso': 3}

# Create the 'cost_mapped' column. The int cast fails loudly if a label is
# missing from cost_mapping, because the resulting NaN cannot be cast to int.
recipes['cost_mapped'] = recipes['cost'].map(cost_mapping)
recipes['cost_mapped'] = recipes['cost_mapped'].astype(int)

# Step 1: Standardize 'cost_mapped' using StandardScaler
scaler = StandardScaler()
recipes['cost_scaled'] = scaler.fit_transform(recipes[['cost_mapped']])

# Step 2: Average *standardized* cost per meal class (Series indexed by meal
# class), so the class average and the row value are in the same units.
avg_cost_by_class = recipes.groupby('meal_class')['cost_scaled'].mean()


def cosine_similarity_to_avg(row):
    """Return the cosine similarity between a recipe's standardized cost and
    the average standardized cost of its meal class.

    `row` is one row of `recipes`; it must provide 'meal_class' and
    'cost_scaled'. For two one-element vectors the similarity reduces to
    whether the two values share the same sign.
    """
    avg_cost = avg_cost_by_class[row['meal_class']]
    return cosine_similarity([[row['cost_scaled']]], [[avg_cost]])[0][0]


# Step 3: Calculate cosine similarity for each recipe within its meal class
recipes['cost_cosine'] = recipes.apply(cosine_similarity_to_avg, axis=1)

# Drop the columns that were used for calculations but are no longer needed
columns_to_drop = ['cost_mapped', 'cost_scaled']
recipes.drop(columns=columns_to_drop, inplace=True)

# Save the scaler using pickle
with open('../scalers/scalers_cost.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

recipes.columns

## Scaling and Finding Cosine Similarity in Servings

In [None]:
# Step 1: Standardize 'servings' using StandardScaler
scaler = StandardScaler()
recipes['servings_scaled'] = scaler.fit_transform(recipes[['servings']])

# Step 2: Average *standardized* servings per meal class (Series indexed by
# meal class), so the class average and the row value are in the same units.
avg_servings_by_class = recipes.groupby('meal_class')['servings_scaled'].mean()


def cosine_similarity_to_avg_servings(row):
    """Return the cosine similarity between a recipe's standardized servings
    and the average standardized servings of its meal class.

    `row` is one row of `recipes`; it must provide 'meal_class' and
    'servings_scaled'. The scaled column is read here so that the
    standardization computed above is actually used. For two one-element
    vectors the similarity reduces to whether the values share the same sign.
    """
    avg_servings = avg_servings_by_class[row['meal_class']]
    return cosine_similarity([[row['servings_scaled']]], [[avg_servings]])[0][0]


# Step 3: Calculate cosine similarity for each recipe within its meal class
recipes['servings_cosine'] = recipes.apply(cosine_similarity_to_avg_servings, axis=1)

# Drop the 'servings_scaled' column as it's no longer needed
recipes.drop(columns=['servings_scaled'], inplace=True)

# Save the scaler using pickle
with open('../scalers/scalers_servings.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

recipes.columns

## Scaling and Finding Cosine Similarity in Rating

In [None]:
# Step 1: Standardize 'rating' using StandardScaler
scaler = StandardScaler()
recipes['rating_scaled'] = scaler.fit_transform(recipes[['rating']])

# Step 2: Average *standardized* rating per meal class (Series indexed by
# meal class), so the class average and the row value are in the same units.
avg_rating_by_class = recipes.groupby('meal_class')['rating_scaled'].mean()


def cosine_similarity_to_avg_rating(row):
    """Return the cosine similarity between a recipe's standardized rating
    and the average standardized rating of its meal class.

    `row` is one row of `recipes`; it must provide 'meal_class' and
    'rating_scaled'. The scaled column is read here so that the
    standardization computed above is actually used. For two one-element
    vectors the similarity reduces to whether the values share the same sign.
    """
    avg_rating = avg_rating_by_class[row['meal_class']]
    return cosine_similarity([[row['rating_scaled']]], [[avg_rating]])[0][0]


# Step 3: Calculate cosine similarity for each recipe within its meal class
recipes['rating_cosine'] = recipes.apply(cosine_similarity_to_avg_rating, axis=1)

# Drop the 'rating_scaled' column as it's no longer needed
recipes.drop(columns=['rating_scaled'], inplace=True)

# Save the scaler using pickle
with open('../scalers/scalers_rating.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

## Scaling and Finding Cosine Similarity in Difficulty

In [None]:
# Numeric codes for the difficulty labels (presumably ordered easiest to
# hardest -- confirm against the data).
difficulty_mapping = {'Fácil': 1, 'Médio': 2, 'Difícil': 3}

# Use the map function to create the 'difficulty_mapped' column
recipes['difficulty_mapped'] = recipes['difficulty'].map(difficulty_mapping)

# Step 1: Standardize 'difficulty_mapped' using StandardScaler
scaler = StandardScaler()
recipes['difficulty_scaled'] = scaler.fit_transform(recipes[['difficulty_mapped']])

# Step 2: Average *standardized* difficulty per meal class (Series indexed by
# meal class), so the class average and the row value are in the same units.
avg_difficulty_by_class = recipes.groupby('meal_class')['difficulty_scaled'].mean()


def cosine_similarity_to_avg(row):
    """Return the cosine similarity between a recipe's standardized
    difficulty and the average standardized difficulty of its meal class.

    `row` is one row of `recipes`; it must provide 'meal_class' and
    'difficulty_scaled'. For two one-element vectors the similarity reduces
    to whether the two values share the same sign.
    """
    avg_difficulty = avg_difficulty_by_class[row['meal_class']]
    return cosine_similarity([[row['difficulty_scaled']]], [[avg_difficulty]])[0][0]


# Step 3: Calculate cosine similarity for each recipe within its meal class
recipes['difficulty_cosine'] = recipes.apply(cosine_similarity_to_avg, axis=1)

# Drop the columns that were used for calculations but are no longer needed
columns_to_drop = ['difficulty_mapped', 'difficulty_scaled']
recipes.drop(columns=columns_to_drop, inplace=True)

# Save the scaler using pickle
with open('../scalers/scalers_difficulty.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

recipes.columns

# Creating a Column with the Average of Each Row's Cosine Similarities against the Average of Its Meal Class

In [None]:
# Bare expression: in a notebook this renders `recipes` as the cell output.
recipes

In [None]:
# Columns holding each recipe's similarity to its meal-class average. Every
# feature appears exactly once so that all of them weigh equally in the mean.
cosine_columns = [
    'difficulty_cosine',
    'rating_cosine',
    'servings_cosine',
    'cost_cosine',
    'time_cosine',
    'preparations_cosine',
    'ingredients_cosine',
]

# Row-wise mean of the similarity columns.
recipes['avg_cosine'] = recipes[cosine_columns].mean(axis=1)

# Saving File

In [None]:
# Write the enriched table to the clean-data folder, leaving out the index column.
recipes.to_csv(path_or_buf='../data/clean/recipes.csv', index=False)

In [None]:
# Bare expression: in a notebook this shows the column labels of `recipes`.
recipes.columns

In [None]:
# Bare expression: in a notebook this renders the final `recipes` table as the cell output.
recipes