# Importing libraries and loading the data sets of the products

In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
import random
import datetime as dt
from sklearn.metrics.pairwise import cosine_similarity
import itertools
from sklearn.metrics import precision_score, recall_score
import ast
from google.colab import drive

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import glob
import re

In [2]:
path = '/content/gdrive/MyDrive/'
drive.mount('/content/gdrive')

all_articles = glob.glob(path + '*_filtered_grouped_data.csv')
print (len(all_articles))

Mounted at /content/gdrive
131


In [3]:
csv_files_test = []
for csv in all_articles:
  number = re.search(r'\d+', csv).group()
  csv_files_test.append(number)

print (len(csv_files_test))

131


In [4]:
number = csv_files_test[0]+"_"

filtered_description_emb = pd.read_csv(path + number + 'filtered_description_emb.csv', index_col=0)
filtered_description_emb2 = pd.read_csv(path + number + 'filtered_description_emb2.csv', index_col=0)
filtered_description_emb2_avg_vector = pd.read_csv(path + number + 'filtered_description_emb2_avg_vector.csv', index_col=0)

filtered_images_df = pd.read_csv(path + number + 'filtered_images_df.csv', index_col=0)
filtered_images_df2 = pd.read_csv(path + number + 'filtered_images_df2.csv', index_col=0)
filtered_images_df2_avg_vector = pd.read_csv(path + number + 'filtered_images_df2_avg_vector.csv', index_col=0)

filtered_processed_articles = pd.read_csv(path + number + 'filtered_processed_articles.csv', index_col=0)
filtered_processed_articles2 = pd.read_csv(path + number + 'filtered_processed_articles2.csv', index_col=0)
filtered_processed_articles2_avg_vector = pd.read_csv(path + number + 'filtered_processed_articles2_avg_vector.csv', index_col=0)

filtered_grouped_data = pd.read_csv(path + number + 'filtered_grouped_data.csv', index_col=0)

test_article_id = filtered_description_emb2.index[0]
test_article_id

717464001

In [5]:
"""
# To run the calculation for the average vector - it is required to switch between the above files + change the name of the exported file
filtered_description_emb2 = filtered_description_emb2_avg_vector.copy()
filtered_images_df2 = filtered_images_df2_avg_vector.copy()
filtered_processed_articles2 = filtered_processed_articles2_avg_vector.copy()
"""

'\n# To run the calculation for the average vector - it is required to switch between the above files + change the name of the exported file\nfiltered_description_emb2 = filtered_description_emb2_avg_vector.copy()\nfiltered_images_df2 = filtered_images_df2_avg_vector.copy()\nfiltered_processed_articles2 = filtered_processed_articles2_avg_vector.copy()\n'

# Calculating the similarity of each test product with all the products in the training data - for the 3 dimensions and calculating the average similarity

In [6]:
#Finding the similarity of the description between each product and the test product

# Extract the single vector from filtered_description_emb2 (test vector)
vec2 = filtered_description_emb2.iloc[0].values

# Define a function to compute cosine similarity
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

# Compute cosine similarity for each row in filtered_description_emb
similarities_description_emb = filtered_description_emb.apply(lambda row: cosine_similarity(row.values, vec2), axis=1)

#### scaler 0 to 1
min_value = similarities_description_emb.min()
max_value = similarities_description_emb.max()
similarities_description_emb = (similarities_description_emb - min_value) / (max_value - min_value)
####

# Display the similarities
similarities_description_emb.head()

Unnamed: 0_level_0,0
product_id,Unnamed: 1_level_1
108775015,0.625123
120129001,0.606231
123173001,0.451778
144993001,0.301675
146730001,0.375402


In [7]:
#Finding the similarity of the image between each product and the test product

# Extract the single vector from filtered_description_emb2
vec2 = filtered_images_df2.iloc[0].values

# Define a function to compute cosine similarity
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

# Compute cosine similarity for each row in filtered_description_emb
similarities_images_df = filtered_images_df.apply(lambda row: cosine_similarity(row.values, vec2), axis=1)

#### scaler
min_value = similarities_images_df.min()
max_value = similarities_images_df.max()
similarities_images_df = (similarities_images_df - min_value) / (max_value - min_value)
####

# Display the similarities
similarities_images_df.head()


Unnamed: 0_level_0,0
product_id,Unnamed: 1_level_1
377277001,0.799937
507909001,0.275926
578476001,0.479739
553611001,0.430853
680186001,0.443783


In [8]:
#Finding the similarity of the tabular between each product and the test product

# Extract the single vector from filtered_description_emb2
vec2 = filtered_processed_articles2.iloc[0].values

# Define a function to compute cosine similarity
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

# Compute cosine similarity for each row in filtered_description_emb
similarities_processed_articles = filtered_processed_articles.apply(lambda row: cosine_similarity(row.values, vec2), axis=1)

#### scaler
min_value = similarities_processed_articles.min()
max_value = similarities_processed_articles.max()
similarities_processed_articles = (similarities_processed_articles - min_value) / (max_value - min_value)
####


# Display the similarities
similarities_processed_articles.head()


Unnamed: 0_level_0,0
article_id,Unnamed: 1_level_1
108775015,0.75
120129001,0.25
123173001,0.25
144993001,0.25
146730001,0.25


In [9]:
# Creating a data frame with the image of each product for the test product in 3 dimensions and their average

# Create a DataFrame
df_desc_emb = pd.DataFrame(similarities_description_emb)
df_images = pd.DataFrame(similarities_images_df)
df_articles = pd.DataFrame(similarities_processed_articles)

# Rename columns for consistency
df_articles.index.name = 'product_id'

# Merge the dataframes on 'product_id'
df_merged = pd.merge(df_desc_emb, df_images, on='product_id', how='outer')
df_merged = pd.merge(df_merged, df_articles, on='product_id', how='outer')

df_merged = df_merged.rename(columns={'0_x': 'description_emb_similarity', '0_y': 'images_df_similarity', 0: 'processed_articles_similarity'})

# Calculate the average similarity
df_merged['average_similarity'] = df_merged[['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity']].mean(axis=1)

# Display the final dataframe
df_merged

Unnamed: 0_level_0,description_emb_similarity,images_df_similarity,processed_articles_similarity,average_similarity
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
108775015,0.625123,0.985692,0.75,0.786938
120129001,0.606231,0.346800,0.25,0.401010
123173001,0.451778,0.406332,0.25,0.369370
144993001,0.301675,0.375234,0.25,0.308970
146730001,0.375402,0.415122,0.25,0.346841
...,...,...,...,...
920700002,0.540820,0.569802,0.75,0.620207
921096003,0.462711,0.409901,0.25,0.374204
926825001,0.351678,0.486823,0.50,0.446167
927865001,0.725313,0.532240,0.50,0.585851


# Consolidation of the similarity of the products for each customer in 4 methods: average, median, 75th percentile and maximum of the basket of products for each customer

In [10]:
# Connecting the data of the customers and their unique products to whether the customer purchased the test product
df = pd.DataFrame(filtered_grouped_data).copy()
df['unique_article_ids'] = df['unique_article_ids'].apply(ast.literal_eval)

# Function to remove test_article_id from unique_article_ids and check existence
def remove_and_check(row):
      if test_article_id in row['unique_article_ids']:
          row['unique_article_ids'].remove(test_article_id)
          row['exist_test'] = 1
      else:
          row['exist_test'] = 0
      return row

# Apply the function to each row
df = df.apply(remove_and_check, axis=1)

# Check the updated DataFrame
df

Unnamed: 0,customer_id,unique_article_ids,exist_test
4883,0161b094e87ecd811ace003a01222068afe8393e93a46c...,"[586244001, 586273001, 585158001, 585130001, 5...",0
6477,01cc53c3fa779e7eea61e6fa87745a6707e4f87c9b3b8c...,"[715624001, 689005001, 817198001, 682238001, 6...",0
10376,02e3ff7c929f456e23ed040dcf8ecb96d839a8ab8e5c54...,"[579541001, 736581001, 747984001, 621048001, 7...",0
19107,05561c7f76c3b32876132742f9ba1241e46f85794ed3f6...,"[722436001, 764228001, 747939002, 702623001, 8...",0
36701,0a426bf0099f7a569705c1daa7caa9a127dce4e7d697c1...,"[889392001, 831429001, 836262001, 708489001, 8...",0
...,...,...,...
879686,f61ab1706f224d74314fc0f28f32acdcd312beaca7e58f...,"[805275001, 854301001, 797892001, 817353002, 8...",0
889076,f8bc7d9127afea242b1ada45cd11f72db30104809e5c28...,"[700835001, 565379001, 693479001, 572797001, 7...",1
905556,fd5681ce4ca1bedd35be242604fee3af97f4f69bf3d32f...,"[665509002, 504155001, 687921001, 691546002, 6...",1
911338,fef28dff7502ee38833fddc4ce2c74a31f8fd744b1ff2f...,"[589599001, 794575001, 762205001, 720125001, 8...",1


In [11]:
# Inspection of the amount of customers who purchased
sum_exist_test = df['exist_test'].sum()
sum_exist_test

50

In [12]:
#Calculation of the image of all the products purchased by a customer by dimension in 4 methods,
# according to the average, the median, the maximum and the 75th percentile of each customer's products

def calculate_similarity(customer_data, similarity_data, op):
    # Initialize lists to store the results
    customer_id_list = []
    description_emb_similarity_list = []
    images_df_similarity_list = []
    processed_articles_similarity_list = []
    average_similarity_list = []

    # Iterate over each customer in the customer_data DataFrame
    for index, row in customer_data.iterrows():
        customer_id = row['customer_id']  # Get the customer ID
        article_ids = row['unique_article_ids']  # Get the list of article IDs associated with the customer

        # Filter and compute the similarity metric based on the operation (mean, median, max, 75th percentile)
        if op == "mean":
            relevant_similarities = similarity_data.loc[article_ids].mean()
        elif op == "median":
            relevant_similarities = similarity_data.loc[article_ids].median()
        elif op == "max":
            relevant_similarities = similarity_data.loc[article_ids].max()
        elif op == "75th_percentile":
            relevant_similarities = similarity_data.loc[article_ids].quantile(0.75)

        # Append the results to the corresponding lists
        customer_id_list.append(customer_id)
        description_emb_similarity_list.append(relevant_similarities['description_emb_similarity'])
        images_df_similarity_list.append(relevant_similarities['images_df_similarity'])
        processed_articles_similarity_list.append(relevant_similarities['processed_articles_similarity'])
        average_similarity_list.append(relevant_similarities['average_similarity'])

    # Create a new DataFrame from the results and return it
    return pd.DataFrame({
        'customer_id': customer_id_list,
        'description_emb_similarity': description_emb_similarity_list,
        'images_df_similarity': images_df_similarity_list,
        'processed_articles_similarity': processed_articles_similarity_list,
        'average_similarity': average_similarity_list
    })


In [13]:
result_df_mean = calculate_similarity(df, df_merged,"mean")
result_df_with_exist_test_mean = result_df_mean.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')

result_df_max = calculate_similarity(df, df_merged,"max")
result_df_with_exist_test_max = result_df_max.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')

result_df_median = calculate_similarity(df, df_merged,"median")
result_df_with_exist_test_median = result_df_median.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')

result_df_75th_percentile = calculate_similarity(df, df_merged,"75th_percentile")
result_df_with_exist_test_75th_percentile = result_df_75th_percentile.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')

# Running 5 models for all similarities of all dimensions

# SVC Model:

In [14]:
from sklearn.svm import SVC

def evaluate_svc_model(df, n_runs=10):
    accuracy_list = []
    precision_list = []
    recall_list = []

    for _ in range(n_runs):
        X = df[['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity']]
        y = df['exist_test']

        # Split data into training and testing sets with stratification
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

        # Initialize SVM classifier
        svc_model = SVC(kernel='poly', random_state=42)

        # Train the model
        svc_model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = svc_model.predict(X_test)

        # Evaluate model performance
        accuracy_list.append(accuracy_score(y_test, y_pred))
        precision_list.append(precision_score(y_test, y_pred))
        recall_list.append(recall_score(y_test, y_pred))

    # Calculate the average of accuracy, precision, and recall over all runs
    accuracy_avg = np.mean(accuracy_list)
    precision_avg = np.mean(precision_list)
    recall_avg = np.mean(recall_list)

    return accuracy_avg, precision_avg, recall_avg

# List of datasets
datasets = [
    ("result_df_with_exist_test_mean", result_df_with_exist_test_mean),
    ("result_df_with_exist_test_max", result_df_with_exist_test_max),
    ("result_df_with_exist_test_median", result_df_with_exist_test_median),
    ("result_df_with_exist_test_75th_percentile", result_df_with_exist_test_75th_percentile)
]

# Initialize list to store results
results = []

# Loop through each dataset and evaluate the model
for name, df in datasets:
    accuracy_avg, precision_avg, recall_avg = evaluate_svc_model(df, n_runs=10)
    results.append({
        "Dataset": name,
        "Accuracy": accuracy_avg,
        "Precision": precision_avg,
        "Recall": recall_avg
    })

# Convert results to DataFrame
results_svc = pd.DataFrame(results)
results_svc

Unnamed: 0,Dataset,Accuracy,Precision,Recall
0,result_df_with_exist_test_mean,0.65,0.714286,0.5
1,result_df_with_exist_test_max,0.65,0.714286,0.5
2,result_df_with_exist_test_median,0.6,0.625,0.5
3,result_df_with_exist_test_75th_percentile,0.55,1.0,0.1


# Logistic regression Model:

In [15]:
from sklearn.linear_model import LogisticRegression

def evaluate_logistic_regression_model(df, n_runs=10):
    accuracy_list = []
    precision_list = []
    recall_list = []

    for _ in range(n_runs):
        X = df[['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity']]
        y = df['exist_test']

        # Split data into training and testing sets with stratification
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

        # Initialize Logistic Regression model
        log_reg_model = LogisticRegression(random_state=42, max_iter=1000)

        # Train the model
        log_reg_model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = log_reg_model.predict(X_test)

        # Evaluate model performance
        accuracy_list.append(accuracy_score(y_test, y_pred))
        precision_list.append(precision_score(y_test, y_pred))
        recall_list.append(recall_score(y_test, y_pred))

    # Calculate the average of accuracy, precision, and recall over all runs
    accuracy_avg = np.mean(accuracy_list)
    precision_avg = np.mean(precision_list)
    recall_avg = np.mean(recall_list)

    return accuracy_avg, precision_avg, recall_avg

# List of datasets
datasets = [
    ("result_df_with_exist_test_mean", result_df_with_exist_test_mean),
    ("result_df_with_exist_test_max", result_df_with_exist_test_max),
    ("result_df_with_exist_test_median", result_df_with_exist_test_median),
    ("result_df_with_exist_test_75th_percentile", result_df_with_exist_test_75th_percentile)
]

# Initialize list to store results
results = []

# Loop through each dataset and evaluate the model
for name, df in datasets:
    accuracy_avg, precision_avg, recall_avg = evaluate_logistic_regression_model(df, n_runs=10)
    results.append({
        "Dataset": name,
        "Accuracy": accuracy_avg,
        "Precision": precision_avg,
        "Recall": recall_avg
    })

# Convert results to DataFrame
results_log_reg = pd.DataFrame(results)
results_log_reg


Unnamed: 0,Dataset,Accuracy,Precision,Recall
0,result_df_with_exist_test_mean,0.55,0.555556,0.5
1,result_df_with_exist_test_max,0.65,0.714286,0.5
2,result_df_with_exist_test_median,0.55,0.571429,0.4
3,result_df_with_exist_test_75th_percentile,0.7,0.75,0.6


# Random Forest Classifier Model

In [16]:
from sklearn.ensemble import RandomForestClassifier

def evaluate_random_forest_model(df, n_runs=10):
    accuracy_list = []
    precision_list = []
    recall_list = []

    for _ in range(n_runs):
        X = df[['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity']]
        y = df['exist_test']

        # Split data into training and testing sets with stratification
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

        # Initialize Random Forest Classifier
        rf_model = RandomForestClassifier(random_state=42, n_estimators=100)

        # Train the model
        rf_model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = rf_model.predict(X_test)

        # Evaluate model performance
        accuracy_list.append(accuracy_score(y_test, y_pred))
        precision_list.append(precision_score(y_test, y_pred))
        recall_list.append(recall_score(y_test, y_pred))

    # Calculate the average of accuracy, precision, and recall over all runs
    accuracy_avg = np.mean(accuracy_list)
    precision_avg = np.mean(precision_list)
    recall_avg = np.mean(recall_list)

    return accuracy_avg, precision_avg, recall_avg

# List of datasets
datasets = [
    ("result_df_with_exist_test_mean", result_df_with_exist_test_mean),
    ("result_df_with_exist_test_max", result_df_with_exist_test_max),
    ("result_df_with_exist_test_median", result_df_with_exist_test_median),
    ("result_df_with_exist_test_75th_percentile", result_df_with_exist_test_75th_percentile)
]

# Initialize list to store results
results = []

# Loop through each dataset and evaluate the model
for name, df in datasets:
    accuracy_avg, precision_avg, recall_avg = evaluate_random_forest_model(df, n_runs=10)
    results.append({
        "Dataset": name,
        "Accuracy": accuracy_avg,
        "Precision": precision_avg,
        "Recall": recall_avg
    })

# Convert results to DataFrame
results_rf = pd.DataFrame(results)
results_rf


Unnamed: 0,Dataset,Accuracy,Precision,Recall
0,result_df_with_exist_test_mean,0.6,0.6,0.6
1,result_df_with_exist_test_max,0.6,0.6,0.6
2,result_df_with_exist_test_median,0.55,0.555556,0.5
3,result_df_with_exist_test_75th_percentile,0.65,0.636364,0.7


# XGB Classifier Model

In [17]:
from xgboost import XGBClassifier
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

def evaluate_xgb_model(df, n_runs=10):
    accuracy_list = []
    precision_list = []
    recall_list = []

    for _ in range(n_runs):
        X = df[['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity']]
        y = df['exist_test']

        # Split data into training and testing sets with stratification
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

        # Initialize XGB Classifier
        xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')

        # Train the model
        xgb_model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = xgb_model.predict(X_test)

        # Evaluate model performance
        accuracy_list.append(accuracy_score(y_test, y_pred))
        precision_list.append(precision_score(y_test, y_pred))
        recall_list.append(recall_score(y_test, y_pred))

    # Calculate the average of accuracy, precision, and recall over all runs
    accuracy_avg = np.mean(accuracy_list)
    precision_avg = np.mean(precision_list)
    recall_avg = np.mean(recall_list)

    return accuracy_avg, precision_avg, recall_avg

# List of datasets
datasets = [
    ("result_df_with_exist_test_mean", result_df_with_exist_test_mean),
    ("result_df_with_exist_test_max", result_df_with_exist_test_max),
    ("result_df_with_exist_test_median", result_df_with_exist_test_median),
    ("result_df_with_exist_test_75th_percentile", result_df_with_exist_test_75th_percentile)
]

# Initialize list to store results
results = []

# Loop through each dataset and evaluate the model
for name, df in datasets:
    accuracy_avg, precision_avg, recall_avg = evaluate_xgb_model(df, n_runs=10)
    results.append({
        "Dataset": name,
        "Accuracy": accuracy_avg,
        "Precision": precision_avg,
        "Recall": recall_avg
    })

# Convert results to DataFrame
results_xgb = pd.DataFrame(results)
results_xgb


Unnamed: 0,Dataset,Accuracy,Precision,Recall
0,result_df_with_exist_test_mean,0.5,0.5,0.6
1,result_df_with_exist_test_max,0.5,0.5,0.7
2,result_df_with_exist_test_median,0.45,0.444444,0.4
3,result_df_with_exist_test_75th_percentile,0.85,0.888889,0.8


# K Neighbors Classifier Model

In [18]:
from sklearn.neighbors import KNeighborsClassifier

def evaluate_knn_model(df, n_runs=10, n_neighbors=5):
    accuracy_list = []
    precision_list = []
    recall_list = []

    for _ in range(n_runs):
        X = df[['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity']]
        y = df['exist_test']

        # Split data into training and testing sets with stratification
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

        # Initialize K-Neighbors Classifier
        knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)

        # Train the model
        knn_model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = knn_model.predict(X_test)

        # Evaluate model performance
        accuracy_list.append(accuracy_score(y_test, y_pred))
        precision_list.append(precision_score(y_test, y_pred))
        recall_list.append(recall_score(y_test, y_pred))

    # Calculate the average of accuracy, precision, and recall over all runs
    accuracy_avg = np.mean(accuracy_list)
    precision_avg = np.mean(precision_list)
    recall_avg = np.mean(recall_list)

    return accuracy_avg, precision_avg, recall_avg

# List of datasets
datasets = [
    ("result_df_with_exist_test_mean", result_df_with_exist_test_mean),
    ("result_df_with_exist_test_max", result_df_with_exist_test_max),
    ("result_df_with_exist_test_median", result_df_with_exist_test_median),
    ("result_df_with_exist_test_75th_percentile", result_df_with_exist_test_75th_percentile)
]

# Initialize list to store results
results = []

# Loop through each dataset and evaluate the model
for name, df in datasets:
    accuracy_avg, precision_avg, recall_avg = evaluate_knn_model(df, n_runs=10)
    results.append({
        "Dataset": name,
        "Accuracy": accuracy_avg,
        "Precision": precision_avg,
        "Recall": recall_avg
    })

# Convert results to DataFrame
results_knn = pd.DataFrame(results)
results_knn

Unnamed: 0,Dataset,Accuracy,Precision,Recall
0,result_df_with_exist_test_mean,0.55,0.555556,0.5
1,result_df_with_exist_test_max,0.65,0.714286,0.5
2,result_df_with_exist_test_median,0.5,0.5,0.5
3,result_df_with_exist_test_75th_percentile,0.65,0.666667,0.6


# Preparation and export of the results of the binary calculation and the models

In [19]:
# Function to prepare results
def prepare_results(results, model_name):
    results_model = results.copy()
    results_model['Model'] = model_name

    # Ensure 'Dataset' column is correctly named and mapped
    if 'Dataset' in results_model.columns:
        results_model['data'] = results_model['Dataset'].replace({
            'result_df_with_exist_test_mean': 'Mean',
            'result_df_with_exist_test_max': 'Max',
            'result_df_with_exist_test_median': 'Median',
            'result_df_with_exist_test_75th_percentile': '75th Percentile'
        })
    else:
        raise KeyError("Column 'Dataset' not found in the DataFrame.")

    # Drop the old 'Dataset' column and reorder columns
    results_model = results_model.drop(columns=['Dataset'], errors='ignore')
    return results_model[['Model', 'data', 'Accuracy', 'Precision', 'Recall']]

# Prepare results for each model
results_svc_formatted = prepare_results(results_svc, 'SVC')
results_log_reg_formatted = prepare_results(results_log_reg, 'LR')
results_rf_formatted = prepare_results(results_rf, 'RF')
results_xgb_formatted = prepare_results(results_xgb, 'XGB')
results_knn_formatted = prepare_results(results_knn, 'KNN')

# Combine all formatted results
results_combined = pd.concat([
    results_svc_formatted,
    results_log_reg_formatted,
    results_rf_formatted,
    results_xgb_formatted,
    results_knn_formatted
])

# Define the path to save the CSV file
path = "/content/gdrive/MyDrive/" + str(number) + "_Multimodal.csv"

# Export the combined DataFrame to a CSV file
results_combined.to_csv(path, index=False)

# To run the code on hundreds of products - we put all the commands into a function and ran in a loop on the list we selected, In the rest of the notebook, the code appears individually for one product - not within a function

In [20]:
"""
def func(number):
  import pandas as pd
  import numpy as np
  from numpy.linalg import norm
  import random
  import datetime as dt
  from sklearn.metrics.pairwise import cosine_similarity
  import itertools
  from sklearn.metrics import precision_score, recall_score
  import ast
  from google.colab import drive

  from sklearn.model_selection import train_test_split
  from sklearn.svm import SVC
  from sklearn.metrics import accuracy_score, classification_report

  from sklearn.linear_model import LogisticRegression

  path = '/content/gdrive/MyDrive/'

  drive.mount('/content/gdrive')

  test_article_id = number
  number = str(number)+"_"

  filtered_description_emb = pd.read_csv(path + number + 'filtered_description_emb.csv', index_col=0)
  filtered_description_emb2 = pd.read_csv(path + number + 'filtered_description_emb2.csv', index_col=0)
  #filtered_description_emb2_avg_vector = pd.read_csv(path + number + 'filtered_description_emb2_avg_vector.csv', index_col=0)

  filtered_images_df = pd.read_csv(path + number + 'filtered_images_df.csv', index_col=0)
  filtered_images_df2 = pd.read_csv(path + number + 'filtered_images_df2.csv', index_col=0)
  #filtered_images_df2_avg_vector = pd.read_csv(path + number + 'filtered_images_df2_avg_vector.csv', index_col=0)

  filtered_processed_articles = pd.read_csv(path + number + 'filtered_processed_articles.csv', index_col=0)
  filtered_processed_articles2 = pd.read_csv(path + number + 'filtered_processed_articles2.csv', index_col=0)
  #filtered_processed_articles2_avg_vector = pd.read_csv(path + number + 'filtered_processed_articles2_avg_vector.csv', index_col=0)

  filtered_grouped_data = pd.read_csv(path + number + 'filtered_grouped_data.csv', index_col=0)

  test_article_id = filtered_description_emb2.index[0]


  filtered_description_emb2 = filtered_description_emb2_avg_vector
  filtered_images_df2 = filtered_images_df2_avg_vector
  filtered_processed_articles2 = filtered_processed_articles2_avg_vector

  #מציאת הדימיון של התיאור בין כל מוצר למוצר הטסט

  # Extract the single vector from filtered_description_emb2
  vec2 = filtered_description_emb2.iloc[0].values

  # Define a function to compute cosine similarity
  def cosine_similarity(vec1, vec2):
      dot_product = np.dot(vec1, vec2)
      norm_vec1 = np.linalg.norm(vec1)
      norm_vec2 = np.linalg.norm(vec2)
      return dot_product / (norm_vec1 * norm_vec2)

  # Compute cosine similarity for each row in filtered_description_emb
  similarities_description_emb = filtered_description_emb.apply(lambda row: cosine_similarity(row.values, vec2), axis=1)

  #### scaler
  min_value = similarities_description_emb.min()
  max_value = similarities_description_emb.max()
  similarities_description_emb = (similarities_description_emb - min_value) / (max_value - min_value)
  ####

  #מציאת הדימיון של התמונה בין כל מוצר למוצר הטסט

  # Extract the single vector from filtered_description_emb2
  vec2 = filtered_images_df2.iloc[0].values

  # Define a function to compute cosine similarity
  def cosine_similarity(vec1, vec2):
      dot_product = np.dot(vec1, vec2)
      norm_vec1 = np.linalg.norm(vec1)
      norm_vec2 = np.linalg.norm(vec2)
      return dot_product / (norm_vec1 * norm_vec2)

  # Compute cosine similarity for each row in filtered_description_emb
  similarities_images_df = filtered_images_df.apply(lambda row: cosine_similarity(row.values, vec2), axis=1)

  #### scaler
  min_value = similarities_images_df.min()
  max_value = similarities_images_df.max()
  similarities_images_df = (similarities_images_df - min_value) / (max_value - min_value)
  ####


  #מציאת הדימיון של הטאבולר בין כל מוצר למוצר הטסט

  # Extract the single vector from filtered_description_emb2
  vec2 = filtered_processed_articles2.iloc[0].values

  # Define a function to compute cosine similarity
  def cosine_similarity(vec1, vec2):
      dot_product = np.dot(vec1, vec2)
      norm_vec1 = np.linalg.norm(vec1)
      norm_vec2 = np.linalg.norm(vec2)
      return dot_product / (norm_vec1 * norm_vec2)

  # Compute cosine similarity for each row in filtered_description_emb
  similarities_processed_articles = filtered_processed_articles.apply(lambda row: cosine_similarity(row.values, vec2), axis=1)

  #### scaler
  min_value = similarities_processed_articles.min()
  max_value = similarities_processed_articles.max()
  similarities_processed_articles = (similarities_processed_articles - min_value) / (max_value - min_value)
  ####


  # יצירת דאטה פריים עם הדימיון של 3 המימדים והממוצע שלהם
  df_desc_emb = pd.DataFrame(similarities_description_emb)
  df_images = pd.DataFrame(similarities_images_df)
  df_articles = pd.DataFrame(similarities_processed_articles)

  # Rename columns for consistency
  df_articles.index.name = 'product_id'

  # Merge the dataframes on 'product_id'
  df_merged = pd.merge(df_desc_emb, df_images, on='product_id', how='outer')
  df_merged = pd.merge(df_merged, df_articles, on='product_id', how='outer')

  df_merged = df_merged.rename(columns={'0_x': 'description_emb_similarity', '0_y': 'images_df_similarity', 0: 'processed_articles_similarity'})

  # Calculate the average similarity
  df_merged['average_similarity'] = df_merged[['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity']].mean(axis=1)

  # Display the final dataframe
  # מחברים בין הדאטה של הלקוחות והמוצרים הייחודים שלהם להאם הלקוח רכש את מוצר הטסט
  df = pd.DataFrame(filtered_grouped_data).copy()
  df['unique_article_ids'] = df['unique_article_ids'].apply(ast.literal_eval)

  # Function to remove test_article_id from unique_article_ids and check existence
  def remove_and_check(row):
        if test_article_id in row['unique_article_ids']:
            row['unique_article_ids'].remove(test_article_id)
            row['exist_test'] = 1
        else:
            row['exist_test'] = 0
        return row

  # Apply the function to each row
  df = df.apply(remove_and_check, axis=1)

  # Check the updated DataFrame
  # חישוב הדימיון של כל המוצרים שרכש לקוח לפי מימד

  def calculate_similarity(customer_data, similarity_data, op):
      customer_id_list = []
      description_emb_similarity_list = []
      images_df_similarity_list = []
      processed_articles_similarity_list = []
      average_similarity_list = []

      for index, row in customer_data.iterrows():
          customer_id = row['customer_id']
          article_ids = row['unique_article_ids']

          # Filter out the similarity data for the given article ids
          if op == "mean":
              relevant_similarities = similarity_data.loc[article_ids].mean()
          elif op == "median":
              relevant_similarities = similarity_data.loc[article_ids].median()
          elif op == "max":
              relevant_similarities = similarity_data.loc[article_ids].max()
          elif op == "75th_percentile":
              relevant_similarities = similarity_data.loc[article_ids].quantile(0.75)

          customer_id_list.append(customer_id)
          description_emb_similarity_list.append(relevant_similarities['description_emb_similarity'])
          images_df_similarity_list.append(relevant_similarities['images_df_similarity'])
          processed_articles_similarity_list.append(relevant_similarities['processed_articles_similarity'])
          average_similarity_list.append(relevant_similarities['average_similarity'])

      return pd.DataFrame({
          'customer_id': customer_id_list,
          'description_emb_similarity': description_emb_similarity_list,
          'images_df_similarity': images_df_similarity_list,
          'processed_articles_similarity': processed_articles_similarity_list,
          'average_similarity': average_similarity_list
      })

  result_df_mean = calculate_similarity(df, df_merged,"mean")
  result_df_with_exist_test_mean = result_df_mean.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')

  result_df_max = calculate_similarity(df, df_merged,"max")
  result_df_with_exist_test_max = result_df_max.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')

  result_df_median = calculate_similarity(df, df_merged,"median")
  result_df_with_exist_test_median = result_df_median.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')

  result_df_75th_percentile = calculate_similarity(df, df_merged,"75th_percentile")
  result_df_with_exist_test_75th_percentile = result_df_75th_percentile.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')

  from sklearn.model_selection import train_test_split
  from sklearn.svm import SVC
  from sklearn.metrics import accuracy_score, precision_score, recall_score
  import pandas as pd
  import numpy as np

  def evaluate_svc_model(df, n_runs=10):
      accuracy_list = []
      precision_list = []
      recall_list = []

      for _ in range(n_runs):
          X = df[['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity']]
          y = df['exist_test']

          # Split data into training and testing sets with stratification
          X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

          # Initialize SVM classifier
          svc_model = SVC(kernel='poly', random_state=42)

          # Train the model
          svc_model.fit(X_train, y_train)

          # Predict on the test set
          y_pred = svc_model.predict(X_test)

          # Evaluate model performance
          accuracy_list.append(accuracy_score(y_test, y_pred))
          precision_list.append(precision_score(y_test, y_pred))
          recall_list.append(recall_score(y_test, y_pred))

      # Calculate the average of accuracy, precision, and recall over all runs
      accuracy_avg = np.mean(accuracy_list)
      precision_avg = np.mean(precision_list)
      recall_avg = np.mean(recall_list)

      return accuracy_avg, precision_avg, recall_avg

  # List of datasets
  datasets = [
      ("result_df_with_exist_test_mean", result_df_with_exist_test_mean),
      ("result_df_with_exist_test_max", result_df_with_exist_test_max),
      ("result_df_with_exist_test_median", result_df_with_exist_test_median),
      ("result_df_with_exist_test_75th_percentile", result_df_with_exist_test_75th_percentile)
  ]

  # Initialize list to store results
  results = []

  # Loop through each dataset and evaluate the model
  for name, df in datasets:
      accuracy_avg, precision_avg, recall_avg = evaluate_svc_model(df, n_runs=10)
      results.append({
          "Dataset": name,
          "Accuracy": accuracy_avg,
          "Precision": precision_avg,
          "Recall": recall_avg
      })

  # Convert results to DataFrame
  results_svc = pd.DataFrame(results)




  def evaluate_logistic_regression_model(df, n_runs=10):
      accuracy_list = []
      precision_list = []
      recall_list = []

      for _ in range(n_runs):
          X = df[['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity']]
          y = df['exist_test']

          # Split data into training and testing sets with stratification
          X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

          # Initialize Logistic Regression model
          log_reg_model = LogisticRegression(random_state=42, max_iter=1000)

          # Train the model
          log_reg_model.fit(X_train, y_train)

          # Predict on the test set
          y_pred = log_reg_model.predict(X_test)

          # Evaluate model performance
          accuracy_list.append(accuracy_score(y_test, y_pred))
          precision_list.append(precision_score(y_test, y_pred))
          recall_list.append(recall_score(y_test, y_pred))

      # Calculate the average of accuracy, precision, and recall over all runs
      accuracy_avg = np.mean(accuracy_list)
      precision_avg = np.mean(precision_list)
      recall_avg = np.mean(recall_list)

      return accuracy_avg, precision_avg, recall_avg

  # List of datasets
  datasets = [
      ("result_df_with_exist_test_mean", result_df_with_exist_test_mean),
      ("result_df_with_exist_test_max", result_df_with_exist_test_max),
      ("result_df_with_exist_test_median", result_df_with_exist_test_median),
      ("result_df_with_exist_test_75th_percentile", result_df_with_exist_test_75th_percentile)
  ]

  # Initialize list to store results
  results = []

  # Loop through each dataset and evaluate the model
  for name, df in datasets:
      accuracy_avg, precision_avg, recall_avg = evaluate_logistic_regression_model(df, n_runs=10)
      results.append({
          "Dataset": name,
          "Accuracy": accuracy_avg,
          "Precision": precision_avg,
          "Recall": recall_avg
      })

  # Convert results to DataFrame
  results_log_reg = pd.DataFrame(results)



  def evaluate_random_forest_model(df, n_runs=10):
      accuracy_list = []
      precision_list = []
      recall_list = []

      for _ in range(n_runs):
          X = df[['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity']]
          y = df['exist_test']

          # Split data into training and testing sets with stratification
          X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

          # Initialize Random Forest Classifier
          rf_model = RandomForestClassifier(random_state=42, n_estimators=100)

          # Train the model
          rf_model.fit(X_train, y_train)

          # Predict on the test set
          y_pred = rf_model.predict(X_test)

          # Evaluate model performance
          accuracy_list.append(accuracy_score(y_test, y_pred))
          precision_list.append(precision_score(y_test, y_pred))
          recall_list.append(recall_score(y_test, y_pred))

      # Calculate the average of accuracy, precision, and recall over all runs
      accuracy_avg = np.mean(accuracy_list)
      precision_avg = np.mean(precision_list)
      recall_avg = np.mean(recall_list)

      return accuracy_avg, precision_avg, recall_avg

  # List of datasets
  datasets = [
      ("result_df_with_exist_test_mean", result_df_with_exist_test_mean),
      ("result_df_with_exist_test_max", result_df_with_exist_test_max),
      ("result_df_with_exist_test_median", result_df_with_exist_test_median),
      ("result_df_with_exist_test_75th_percentile", result_df_with_exist_test_75th_percentile)
  ]

  # Initialize list to store results
  results = []

  # Loop through each dataset and evaluate the model
  for name, df in datasets:
      accuracy_avg, precision_avg, recall_avg = evaluate_random_forest_model(df, n_runs=10)
      results.append({
          "Dataset": name,
          "Accuracy": accuracy_avg,
          "Precision": precision_avg,
          "Recall": recall_avg
      })

  # Convert results to DataFrame
  results_rf = pd.DataFrame(results)



  def evaluate_xgb_model(df, n_runs=10):
    accuracy_list = []
    precision_list = []
    recall_list = []

    for _ in range(n_runs):
        X = df[['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity']]
        y = df['exist_test']

        # Split data into training and testing sets with stratification
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

        # Initialize XGB Classifier
        xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')

        # Train the model
        xgb_model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = xgb_model.predict(X_test)

        # Evaluate model performance
        accuracy_list.append(accuracy_score(y_test, y_pred))
        precision_list.append(precision_score(y_test, y_pred))
        recall_list.append(recall_score(y_test, y_pred))

    # Calculate the average of accuracy, precision, and recall over all runs
    accuracy_avg = np.mean(accuracy_list)
    precision_avg = np.mean(precision_list)
    recall_avg = np.mean(recall_list)

    return accuracy_avg, precision_avg, recall_avg

  # List of datasets
  datasets = [
      ("result_df_with_exist_test_mean", result_df_with_exist_test_mean),
      ("result_df_with_exist_test_max", result_df_with_exist_test_max),
      ("result_df_with_exist_test_median", result_df_with_exist_test_median),
      ("result_df_with_exist_test_75th_percentile", result_df_with_exist_test_75th_percentile)
  ]

  # Initialize list to store results
  results = []

  # Loop through each dataset and evaluate the model
  for name, df in datasets:
      accuracy_avg, precision_avg, recall_avg = evaluate_xgb_model(df, n_runs=10)
      results.append({
          "Dataset": name,
          "Accuracy": accuracy_avg,
          "Precision": precision_avg,
          "Recall": recall_avg
      })

  # Convert results to DataFrame
  results_xgb = pd.DataFrame(results)



  def evaluate_knn_model(df, n_runs=10, n_neighbors=5):
      accuracy_list = []
      precision_list = []
      recall_list = []

      for _ in range(n_runs):
          X = df[['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity']]
          y = df['exist_test']

          # Split data into training and testing sets with stratification
          X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

          # Initialize K-Neighbors Classifier
          knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)

          # Train the model
          knn_model.fit(X_train, y_train)

          # Predict on the test set
          y_pred = knn_model.predict(X_test)

          # Evaluate model performance
          accuracy_list.append(accuracy_score(y_test, y_pred))
          precision_list.append(precision_score(y_test, y_pred))
          recall_list.append(recall_score(y_test, y_pred))

      # Calculate the average of accuracy, precision, and recall over all runs
      accuracy_avg = np.mean(accuracy_list)
      precision_avg = np.mean(precision_list)
      recall_avg = np.mean(recall_list)

      return accuracy_avg, precision_avg, recall_avg

  # List of datasets
  datasets = [
      ("result_df_with_exist_test_mean", result_df_with_exist_test_mean),
      ("result_df_with_exist_test_max", result_df_with_exist_test_max),
      ("result_df_with_exist_test_median", result_df_with_exist_test_median),
      ("result_df_with_exist_test_75th_percentile", result_df_with_exist_test_75th_percentile)
  ]

  # Initialize list to store results
  results = []

  # Loop through each dataset and evaluate the model
  for name, df in datasets:
      accuracy_avg, precision_avg, recall_avg = evaluate_knn_model(df, n_runs=10)
      results.append({
          "Dataset": name,
          "Accuracy": accuracy_avg,
          "Precision": precision_avg,
          "Recall": recall_avg
      })

  # Convert results to DataFrame
  results_knn = pd.DataFrame(results)


  import pandas as pd

  # Function to prepare results
  def prepare_results(results, model_name):
      results_model = results.copy()
      results_model['Model'] = model_name

      # Ensure 'Dataset' column is correctly named and mapped
      if 'Dataset' in results_model.columns:
          results_model['data'] = results_model['Dataset'].replace({
              'result_df_with_exist_test_mean': 'Mean',
              'result_df_with_exist_test_max': 'Max',
              'result_df_with_exist_test_median': 'Median',
              'result_df_with_exist_test_75th_percentile': '75th Percentile'
          })
      else:
          raise KeyError("Column 'Dataset' not found in the DataFrame.")

      # Drop the old 'Dataset' column and reorder columns
      results_model = results_model.drop(columns=['Dataset'], errors='ignore')
      return results_model[['Model', 'data', 'Accuracy', 'Precision', 'Recall']]

  # Prepare results for each model
  results_svc_formatted = prepare_results(results_svc, 'SVC')
  results_log_reg_formatted = prepare_results(results_log_reg, 'LR')
  results_rf_formatted = prepare_results(results_rf, 'RF')
  results_xgb_formatted = prepare_results(results_xgb, 'XGB')
  results_knn_formatted = prepare_results(results_knn, 'KNN')

  # Combine all formatted results
  results_combined = pd.concat([
      results_svc_formatted,
      results_log_reg_formatted,
      results_rf_formatted,
      results_xgb_formatted,
      results_knn_formatted
  ])

  # Define the path to save the CSV file
  path = "/content/gdrive/MyDrive/" + str(number) + "_Multimodal_avg_vector.csv"

  # Export the combined DataFrame to a CSV file
  results_combined.to_csv(path, index=False)
"""

"""
for i in range (0,len(lst_csv)):
  if i in []:
    continue
  func(lst_csv[i])
  print(i,lst_csv[i])
"""

'\nfor i in range (0,len(lst_csv)):\n  if i in []:\n    continue\n  func(lst_csv[i])\n  print(i,lst_csv[i])\n'