# Importing libraries and loading the data sets of the products

In [35]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
import random
import datetime as dt
from sklearn.metrics.pairwise import cosine_similarity
import itertools
from sklearn.metrics import precision_score, recall_score
import ast
from google.colab import drive

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import glob
import re

In [36]:
path = '/content/gdrive/MyDrive/'
drive.mount('/content/gdrive')

all_articles = glob.glob(path + '*_filtered_grouped_data.csv')
print (len(all_articles))

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
131


In [37]:
csv_files_test = []
for csv in all_articles:
  number = re.search(r'\d+', csv).group()
  csv_files_test.append(number)

print (len(csv_files_test))

131


In [38]:
number = csv_files_test[0]+"_"

filtered_description_emb = pd.read_csv(path + number + 'filtered_description_emb.csv', index_col=0)
filtered_description_emb2 = pd.read_csv(path + number + 'filtered_description_emb2.csv', index_col=0)
filtered_description_emb2_avg_vector = pd.read_csv(path + number + 'filtered_description_emb2_avg_vector.csv', index_col=0)

filtered_images_df = pd.read_csv(path + number + 'filtered_images_df.csv', index_col=0)
filtered_images_df2 = pd.read_csv(path + number + 'filtered_images_df2.csv', index_col=0)
filtered_images_df2_avg_vector = pd.read_csv(path + number + 'filtered_images_df2_avg_vector.csv', index_col=0)

filtered_processed_articles = pd.read_csv(path + number + 'filtered_processed_articles.csv', index_col=0)
filtered_processed_articles2 = pd.read_csv(path + number + 'filtered_processed_articles2.csv', index_col=0)
filtered_processed_articles2_avg_vector = pd.read_csv(path + number + 'filtered_processed_articles2_avg_vector.csv', index_col=0)

filtered_grouped_data = pd.read_csv(path + number + 'filtered_grouped_data.csv', index_col=0)

test_article_id = filtered_description_emb2.index[0]
test_article_id

717464001

In [None]:
"""
# To run the calculation for the average vector - it is required to switch between the above files + change the name of the exported file
filtered_description_emb2 = filtered_description_emb2_avg_vector.copy()
filtered_images_df2 = filtered_images_df2_avg_vector.copy()
filtered_processed_articles2 = filtered_processed_articles2_avg_vector.copy()
"""

# Calculating the similarity of each test product with all the products in the training data - for the 3 dimensions and calculating the average similarity

In [39]:
#Finding the similarity of the description between each product and the test product

# Extract the single vector from filtered_description_emb2 (test vector)
vec2 = filtered_description_emb2.iloc[0].values

# Define a function to compute cosine similarity
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

# Compute cosine similarity for each row in filtered_description_emb
similarities_description_emb = filtered_description_emb.apply(lambda row: cosine_similarity(row.values, vec2), axis=1)

#### scaler 0 to 1
min_value = similarities_description_emb.min()
max_value = similarities_description_emb.max()
similarities_description_emb = (similarities_description_emb - min_value) / (max_value - min_value)
####

# Display the similarities
similarities_description_emb.head()

Unnamed: 0_level_0,0
product_id,Unnamed: 1_level_1
108775015,0.625123
120129001,0.606231
123173001,0.451778
144993001,0.301675
146730001,0.375402


In [40]:
#Finding the similarity of the image between each product and the test product

# Extract the single vector from filtered_description_emb2
vec2 = filtered_images_df2.iloc[0].values

# Define a function to compute cosine similarity
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

# Compute cosine similarity for each row in filtered_description_emb
similarities_images_df = filtered_images_df.apply(lambda row: cosine_similarity(row.values, vec2), axis=1)

#### scaler
min_value = similarities_images_df.min()
max_value = similarities_images_df.max()
similarities_images_df = (similarities_images_df - min_value) / (max_value - min_value)
####

# Display the similarities
similarities_images_df.head()


Unnamed: 0_level_0,0
product_id,Unnamed: 1_level_1
377277001,0.799937
507909001,0.275926
578476001,0.479739
553611001,0.430853
680186001,0.443783


In [41]:
#Finding the similarity of the tabular between each product and the test product

# Extract the single vector from filtered_description_emb2
vec2 = filtered_processed_articles2.iloc[0].values

# Define a function to compute cosine similarity
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

# Compute cosine similarity for each row in filtered_description_emb
similarities_processed_articles = filtered_processed_articles.apply(lambda row: cosine_similarity(row.values, vec2), axis=1)

#### scaler
min_value = similarities_processed_articles.min()
max_value = similarities_processed_articles.max()
similarities_processed_articles = (similarities_processed_articles - min_value) / (max_value - min_value)
####


# Display the similarities
similarities_processed_articles.head()


Unnamed: 0_level_0,0
article_id,Unnamed: 1_level_1
108775015,0.75
120129001,0.25
123173001,0.25
144993001,0.25
146730001,0.25


In [42]:
# Creating a data frame with the image of each product for the test product in 3 dimensions and their average

# Create a DataFrame
df_desc_emb = pd.DataFrame(similarities_description_emb)
df_images = pd.DataFrame(similarities_images_df)
df_articles = pd.DataFrame(similarities_processed_articles)

# Rename columns for consistency
df_articles.index.name = 'product_id'

# Merge the dataframes on 'product_id'
df_merged = pd.merge(df_desc_emb, df_images, on='product_id', how='outer')
df_merged = pd.merge(df_merged, df_articles, on='product_id', how='outer')

df_merged = df_merged.rename(columns={'0_x': 'description_emb_similarity', '0_y': 'images_df_similarity', 0: 'processed_articles_similarity'})

# Calculate the average similarity
df_merged['average_similarity'] = df_merged[['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity']].mean(axis=1)

# Display the final dataframe
df_merged

Unnamed: 0_level_0,description_emb_similarity,images_df_similarity,processed_articles_similarity,average_similarity
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
108775015,0.625123,0.985692,0.75,0.786938
120129001,0.606231,0.346800,0.25,0.401010
123173001,0.451778,0.406332,0.25,0.369370
144993001,0.301675,0.375234,0.25,0.308970
146730001,0.375402,0.415122,0.25,0.346841
...,...,...,...,...
920700002,0.540820,0.569802,0.75,0.620207
921096003,0.462711,0.409901,0.25,0.374204
926825001,0.351678,0.486823,0.50,0.446167
927865001,0.725313,0.532240,0.50,0.585851


# Consolidation of the similarity of the products for each customer in 4 methods: average, median, 75th percentile and maximum of the basket of products for each customer

In [43]:
# Connecting the data of the customers and their unique products to whether the customer purchased the test product
df = pd.DataFrame(filtered_grouped_data).copy()
df['unique_article_ids'] = df['unique_article_ids'].apply(ast.literal_eval)

# Function to remove test_article_id from unique_article_ids and check existence
def remove_and_check(row):
      if test_article_id in row['unique_article_ids']:
          row['unique_article_ids'].remove(test_article_id)
          row['exist_test'] = 1
      else:
          row['exist_test'] = 0
      return row

# Apply the function to each row
df = df.apply(remove_and_check, axis=1)

# Check the updated DataFrame
df

Unnamed: 0,customer_id,unique_article_ids,exist_test
4883,0161b094e87ecd811ace003a01222068afe8393e93a46c...,"[586244001, 586273001, 585158001, 585130001, 5...",0
6477,01cc53c3fa779e7eea61e6fa87745a6707e4f87c9b3b8c...,"[715624001, 689005001, 817198001, 682238001, 6...",0
10376,02e3ff7c929f456e23ed040dcf8ecb96d839a8ab8e5c54...,"[579541001, 736581001, 747984001, 621048001, 7...",0
19107,05561c7f76c3b32876132742f9ba1241e46f85794ed3f6...,"[722436001, 764228001, 747939002, 702623001, 8...",0
36701,0a426bf0099f7a569705c1daa7caa9a127dce4e7d697c1...,"[889392001, 831429001, 836262001, 708489001, 8...",0
...,...,...,...
879686,f61ab1706f224d74314fc0f28f32acdcd312beaca7e58f...,"[805275001, 854301001, 797892001, 817353002, 8...",0
889076,f8bc7d9127afea242b1ada45cd11f72db30104809e5c28...,"[700835001, 565379001, 693479001, 572797001, 7...",1
905556,fd5681ce4ca1bedd35be242604fee3af97f4f69bf3d32f...,"[665509002, 504155001, 687921001, 691546002, 6...",1
911338,fef28dff7502ee38833fddc4ce2c74a31f8fd744b1ff2f...,"[589599001, 794575001, 762205001, 720125001, 8...",1


In [44]:
# Inspection of the amount of customers who purchased
sum_exist_test = df['exist_test'].sum()
sum_exist_test

50

In [45]:
#Calculation of the image of all the products purchased by a customer by dimension in 4 methods,
# according to the average, the median, the maximum and the 75th percentile of each customer's products

def calculate_similarity(customer_data, similarity_data, op):
    # Initialize lists to store the results
    customer_id_list = []
    description_emb_similarity_list = []
    images_df_similarity_list = []
    processed_articles_similarity_list = []
    average_similarity_list = []

    # Iterate over each customer in the customer_data DataFrame
    for index, row in customer_data.iterrows():
        customer_id = row['customer_id']  # Get the customer ID
        article_ids = row['unique_article_ids']  # Get the list of article IDs associated with the customer

        # Filter and compute the similarity metric based on the operation (mean, median, max, 75th percentile)
        if op == "mean":
            relevant_similarities = similarity_data.loc[article_ids].mean()
        elif op == "median":
            relevant_similarities = similarity_data.loc[article_ids].median()
        elif op == "max":
            relevant_similarities = similarity_data.loc[article_ids].max()
        elif op == "75th_percentile":
            relevant_similarities = similarity_data.loc[article_ids].quantile(0.75)

        # Append the results to the corresponding lists
        customer_id_list.append(customer_id)
        description_emb_similarity_list.append(relevant_similarities['description_emb_similarity'])
        images_df_similarity_list.append(relevant_similarities['images_df_similarity'])
        processed_articles_similarity_list.append(relevant_similarities['processed_articles_similarity'])
        average_similarity_list.append(relevant_similarities['average_similarity'])

    # Create a new DataFrame from the results and return it
    return pd.DataFrame({
        'customer_id': customer_id_list,
        'description_emb_similarity': description_emb_similarity_list,
        'images_df_similarity': images_df_similarity_list,
        'processed_articles_similarity': processed_articles_similarity_list,
        'average_similarity': average_similarity_list
    })


In [46]:
result_df_mean = calculate_similarity(df, df_merged,"mean")
result_df_with_exist_test_mean = result_df_mean.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')

result_df_max = calculate_similarity(df, df_merged,"max")
result_df_with_exist_test_max = result_df_max.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')

result_df_median = calculate_similarity(df, df_merged,"median")
result_df_with_exist_test_median = result_df_median.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')

result_df_75th_percentile = calculate_similarity(df, df_merged,"75th_percentile")
result_df_with_exist_test_75th_percentile = result_df_75th_percentile.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')

# Creating binary success indicators (if the similarity exceeds the median - 1, otherwise 0) and calculating success indicators - lift, recall, precision

In [47]:
# Converting the dimensions to binary according to the median of each
def convert_to_binary(df2):
  df = df2.copy()
  avg_description_emb = df['description_emb_similarity'].median()
  avg_images_df = df['images_df_similarity'].median()
  avg_processed_articles = df['processed_articles_similarity'].median()
  avg_average_similarity = df['average_similarity'].median()

  # Apply the logic to assign 0 or 1 based on median
  df['description_emb_similarity'] = np.where(df['description_emb_similarity'] > avg_description_emb, 1, 0)
  df['images_df_similarity'] = np.where(df['images_df_similarity'] > avg_images_df, 1, 0)
  df['processed_articles_similarity'] = np.where(df['processed_articles_similarity'] > avg_processed_articles, 1, 0)
  df['average_similarity'] = np.where(df['average_similarity'] > avg_average_similarity, 1, 0)

  return df

In [48]:
def calculate_lift_binary(df):
  results = {}
  columns = ['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity', 'average_similarity']

  # Calculate precision, recall, and lift for each similarity column
  for column in columns:
      precision = precision_score(df['exist_test'], df[column])
      recall = recall_score(df['exist_test'], df[column])

      # Calculate prevalence
      prevalence = df['exist_test'].mean()

      # Calculate lift
      lift = precision / prevalence

      results[column] = {'precision': precision, 'recall': recall, 'lift': lift}

  # Display the results as a DataFrame
  pre_rec_lift = pd.DataFrame(results).T
  return pre_rec_lift

In [49]:
# Calculate the average similarity for each customer
result_df_mean = calculate_similarity(df, df_merged,"mean")
result_df_with_exist_test_mean = result_df_mean.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')
result_df_with_exist_test_mean

Unnamed: 0,customer_id,description_emb_similarity,images_df_similarity,processed_articles_similarity,average_similarity,exist_test
0,0161b094e87ecd811ace003a01222068afe8393e93a46c...,0.310504,0.479027,0.125000,0.304843,0
1,01cc53c3fa779e7eea61e6fa87745a6707e4f87c9b3b8c...,0.417000,0.436638,0.375000,0.409546,0
2,02e3ff7c929f456e23ed040dcf8ecb96d839a8ab8e5c54...,0.488297,0.400599,0.350000,0.412965,0
3,05561c7f76c3b32876132742f9ba1241e46f85794ed3f6...,0.418434,0.420742,0.300000,0.379725,0
4,0a426bf0099f7a569705c1daa7caa9a127dce4e7d697c1...,0.422781,0.487378,0.200000,0.370053,0
...,...,...,...,...,...,...
95,f61ab1706f224d74314fc0f28f32acdcd312beaca7e58f...,0.399070,0.294189,0.325000,0.339420,0
96,f8bc7d9127afea242b1ada45cd11f72db30104809e5c28...,0.533460,0.485698,0.500000,0.506386,1
97,fd5681ce4ca1bedd35be242604fee3af97f4f69bf3d32f...,0.435976,0.418300,0.277778,0.377351,1
98,fef28dff7502ee38833fddc4ce2c74a31f8fd744b1ff2f...,0.378313,0.471020,0.305556,0.384963,1


In [50]:
# Calculate the max similarity for each customer
result_df_max = calculate_similarity(df, df_merged,"max")
result_df_with_exist_test_max = result_df_max.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')
result_df_with_exist_test_max

Unnamed: 0,customer_id,description_emb_similarity,images_df_similarity,processed_articles_similarity,average_similarity,exist_test
0,0161b094e87ecd811ace003a01222068afe8393e93a46c...,0.634167,0.767821,0.25,0.497857,0
1,01cc53c3fa779e7eea61e6fa87745a6707e4f87c9b3b8c...,0.591643,0.728956,0.75,0.676291,0
2,02e3ff7c929f456e23ed040dcf8ecb96d839a8ab8e5c54...,0.556358,0.685889,1.00,0.539422,0
3,05561c7f76c3b32876132742f9ba1241e46f85794ed3f6...,0.607517,0.583637,0.75,0.533380,0
4,0a426bf0099f7a569705c1daa7caa9a127dce4e7d697c1...,0.497328,0.628517,0.50,0.493134,0
...,...,...,...,...,...,...
95,f61ab1706f224d74314fc0f28f32acdcd312beaca7e58f...,0.570252,0.510574,0.50,0.494992,0
96,f8bc7d9127afea242b1ada45cd11f72db30104809e5c28...,0.904380,0.863178,0.75,0.839186,1
97,fd5681ce4ca1bedd35be242604fee3af97f4f69bf3d32f...,0.726728,0.602837,0.50,0.561241,1
98,fef28dff7502ee38833fddc4ce2c74a31f8fd744b1ff2f...,0.515231,0.577349,0.75,0.567189,1


In [51]:
# Calculate the median similarity for each customer
result_df_median = calculate_similarity(df, df_merged,"median")
result_df_with_exist_test_median = result_df_median.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')
result_df_with_exist_test_median

Unnamed: 0,customer_id,description_emb_similarity,images_df_similarity,processed_articles_similarity,average_similarity,exist_test
0,0161b094e87ecd811ace003a01222068afe8393e93a46c...,0.257787,0.459383,0.125,0.314190,0
1,01cc53c3fa779e7eea61e6fa87745a6707e4f87c9b3b8c...,0.437090,0.442547,0.500,0.421293,0
2,02e3ff7c929f456e23ed040dcf8ecb96d839a8ab8e5c54...,0.507011,0.381109,0.250,0.429065,0
3,05561c7f76c3b32876132742f9ba1241e46f85794ed3f6...,0.425485,0.412017,0.250,0.375486,0
4,0a426bf0099f7a569705c1daa7caa9a127dce4e7d697c1...,0.435076,0.511622,0.250,0.389313,0
...,...,...,...,...,...,...
95,f61ab1706f224d74314fc0f28f32acdcd312beaca7e58f...,0.395718,0.281539,0.250,0.362458,0
96,f8bc7d9127afea242b1ada45cd11f72db30104809e5c28...,0.478304,0.453998,0.500,0.446021,1
97,fd5681ce4ca1bedd35be242604fee3af97f4f69bf3d32f...,0.414144,0.449292,0.250,0.376093,1
98,fef28dff7502ee38833fddc4ce2c74a31f8fd744b1ff2f...,0.374219,0.439929,0.250,0.372006,1


In [52]:
# Calculate the median similarity for each customer
result_df_75th_percentile = calculate_similarity(df, df_merged,"75th_percentile")
result_df_with_exist_test_75th_percentile = result_df_75th_percentile.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')
result_df_with_exist_test_75th_percentile

Unnamed: 0,customer_id,description_emb_similarity,images_df_similarity,processed_articles_similarity,average_similarity,exist_test
0,0161b094e87ecd811ace003a01222068afe8393e93a46c...,0.437801,0.526829,0.2500,0.362817,0
1,01cc53c3fa779e7eea61e6fa87745a6707e4f87c9b3b8c...,0.496665,0.567530,0.5000,0.530038,0
2,02e3ff7c929f456e23ed040dcf8ecb96d839a8ab8e5c54...,0.547174,0.477587,0.5000,0.460786,0
3,05561c7f76c3b32876132742f9ba1241e46f85794ed3f6...,0.485110,0.455882,0.4375,0.460921,0
4,0a426bf0099f7a569705c1daa7caa9a127dce4e7d697c1...,0.490619,0.572165,0.2500,0.411128,0
...,...,...,...,...,...,...
95,f61ab1706f224d74314fc0f28f32acdcd312beaca7e58f...,0.511791,0.458964,0.4375,0.401749,0
96,f8bc7d9127afea242b1ada45cd11f72db30104809e5c28...,0.672001,0.576839,0.7500,0.536915,1
97,fd5681ce4ca1bedd35be242604fee3af97f4f69bf3d32f...,0.580887,0.506809,0.2500,0.419830,1
98,fef28dff7502ee38833fddc4ce2c74a31f8fd744b1ff2f...,0.473472,0.535600,0.5000,0.427076,1


In [53]:
binary_result_df_with_exist_test_mean = convert_to_binary(result_df_with_exist_test_mean)
binary_result_df_with_exist_test_max = convert_to_binary(result_df_with_exist_test_max)
binary_result_df_with_exist_test_median = convert_to_binary(result_df_with_exist_test_median)
binary_result_df_with_exist_test_75th_percentile = convert_to_binary(result_df_with_exist_test_75th_percentile)

# Displaying the results for each method - average, median, 75th percentile and maximum

In [54]:
lift_mean = calculate_lift_binary(binary_result_df_with_exist_test_mean)
lift_mean

Unnamed: 0,precision,recall,lift
description_emb_similarity,0.54,0.54,1.08
images_df_similarity,0.6,0.6,1.2
processed_articles_similarity,0.62,0.62,1.24
average_similarity,0.6,0.6,1.2


In [55]:
lift_max = calculate_lift_binary(binary_result_df_with_exist_test_max)
lift_max

Unnamed: 0,precision,recall,lift
description_emb_similarity,0.56,0.56,1.12
images_df_similarity,0.62,0.62,1.24
processed_articles_similarity,0.6,0.12,1.2
average_similarity,0.62,0.62,1.24


In [56]:
lift_median = calculate_lift_binary(binary_result_df_with_exist_test_median)
lift_median

Unnamed: 0,precision,recall,lift
description_emb_similarity,0.48,0.48,0.96
images_df_similarity,0.56,0.56,1.12
processed_articles_similarity,0.5,0.3,1.0
average_similarity,0.52,0.52,1.04


In [57]:
lift_75th_percentile = calculate_lift_binary(binary_result_df_with_exist_test_75th_percentile)
lift_75th_percentile

Unnamed: 0,precision,recall,lift
description_emb_similarity,0.6,0.6,1.2
images_df_similarity,0.6,0.6,1.2
processed_articles_similarity,1.0,0.16,2.0
average_similarity,0.54,0.54,1.08


# Running 5 models for all similarities of each dimension separately + the average of the similarities

# SVC Model:

In [58]:
from sklearn.svm import SVC

def svc_model(df, feature):
    X = df[[feature]]
    y = df['exist_test']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

    svc_model = SVC(kernel='poly')
    svc_model.fit(X_train, y_train)

    y_pred = svc_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

def evaluate_model_over_runs(df, feature, num_runs=10):
    accuracies = []
    for _ in range(num_runs):
        accuracy = svc_model(df, feature)
        accuracies.append(accuracy)
    average_accuracy = np.mean(accuracies)
    return average_accuracy

# Assume result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile are defined
features = ['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity', 'average_similarity']
datasets = [result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile]
dataset_names = ['Mean', 'Max', 'Median', '75th Percentile']

results = pd.DataFrame(index=dataset_names, columns=features)

for feature in features:
    for dataset, name in zip(datasets, dataset_names):
        avg_acc = evaluate_model_over_runs(dataset, feature)
        results.at[name, feature] = avg_acc

results

Unnamed: 0,description_emb_similarity,images_df_similarity,processed_articles_similarity,average_similarity
Mean,0.575,0.67,0.6,0.655
Max,0.475,0.58,0.515,0.63
Median,0.49,0.615,0.505,0.535
75th Percentile,0.635,0.595,0.615,0.63


# Logistic regression Model:

In [59]:
from sklearn.linear_model import LogisticRegression

def logistic_regression_model(df, feature):
    X = df[[feature]]
    y = df['exist_test']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

    logreg_model = LogisticRegression()
    logreg_model.fit(X_train, y_train)

    y_pred_logreg = logreg_model.predict(X_test)

    accuracy_logreg = accuracy_score(y_test, y_pred_logreg)

    return accuracy_logreg

def evaluate_logreg_model_over_runs(df, feature, num_runs=10):
    accuracies = []
    for _ in range(num_runs):
        accuracy = logistic_regression_model(df, feature)
        accuracies.append(accuracy)
    average_accuracy = np.mean(accuracies)
    return average_accuracy

# Assume result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile are defined
features = ['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity', 'average_similarity']
datasets = [result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile]
dataset_names = ['Mean', 'Max', 'Median', '75th Percentile']

logreg_results = pd.DataFrame(index=dataset_names, columns=features)

for feature in features:
    for dataset, name in zip(datasets, dataset_names):
        avg_acc = evaluate_logreg_model_over_runs(dataset, feature)
        logreg_results.at[name, feature] = avg_acc

logreg_results

Unnamed: 0,description_emb_similarity,images_df_similarity,processed_articles_similarity,average_similarity
Mean,0.56,0.62,0.635,0.635
Max,0.555,0.635,0.66,0.625
Median,0.5,0.53,0.5,0.56
75th Percentile,0.55,0.495,0.485,0.525


# Random Forest Classifier Model

In [60]:
from sklearn.ensemble import RandomForestClassifier

def random_forest_model(df, feature):
    X = df[[feature]]
    y = df['exist_test']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

    rf_model = RandomForestClassifier()
    rf_model.fit(X_train, y_train)

    y_pred_rf = rf_model.predict(X_test)

    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    return accuracy_rf

def evaluate_rf_model_over_runs(df, feature, num_runs=10):
    accuracies = []
    for _ in range(num_runs):
        accuracy = random_forest_model(df, feature)
        accuracies.append(accuracy)
    average_accuracy = np.mean(accuracies)
    return average_accuracy

# Assume result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile are defined
features = ['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity', 'average_similarity']
datasets = [result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile]
dataset_names = ['Mean', 'Max', 'Median', '75th Percentile']

rf_results = pd.DataFrame(index=dataset_names, columns=features)

for feature in features:
    for dataset, name in zip(datasets, dataset_names):
        avg_acc = evaluate_rf_model_over_runs(dataset, feature)
        rf_results.at[name, feature] = avg_acc

rf_results

Unnamed: 0,description_emb_similarity,images_df_similarity,processed_articles_similarity,average_similarity
Mean,0.465,0.46,0.895,0.56
Max,0.52,0.49,0.61,0.555
Median,0.52,0.49,0.51,0.415
75th Percentile,0.55,0.65,0.49,0.555


# XGB Classifier Model

In [61]:
from xgboost import XGBClassifier

def xgboost_model(df, feature):
    X = df[[feature]]
    y = df['exist_test']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

    xgb_model = XGBClassifier()
    xgb_model.fit(X_train, y_train)

    y_pred_xgb = xgb_model.predict(X_test)

    accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
    return accuracy_xgb

def evaluate_xgb_model_over_runs(df, feature, num_runs=10):
    accuracies = []
    for _ in range(num_runs):
        accuracy = xgboost_model(df, feature)
        accuracies.append(accuracy)
    average_accuracy = np.mean(accuracies)
    return average_accuracy

# Assume result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile are defined
features = ['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity', 'average_similarity']
datasets = [result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile]
dataset_names = ['Mean', 'Max', 'Median', '75th Percentile']

xgb_results = pd.DataFrame(index=dataset_names, columns=features)

for feature in features:
    for dataset, name in zip(datasets, dataset_names):
        avg_acc = evaluate_xgb_model_over_runs(dataset, feature)
        xgb_results.at[name, feature] = avg_acc

xgb_results

Unnamed: 0,description_emb_similarity,images_df_similarity,processed_articles_similarity,average_similarity
Mean,0.5,0.545,0.855,0.57
Max,0.445,0.52,0.57,0.575
Median,0.485,0.52,0.535,0.455
75th Percentile,0.595,0.565,0.515,0.585


# K Neighbors Classifier Model

In [62]:
from sklearn.neighbors import KNeighborsClassifier

def knn_model(df, feature):
    X = df[[feature]]
    y = df['exist_test']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

    knn_model = KNeighborsClassifier()
    knn_model.fit(X_train, y_train)

    y_pred_knn = knn_model.predict(X_test)

    accuracy_knn = accuracy_score(y_test, y_pred_knn)
    return accuracy_knn

def evaluate_knn_model_over_runs(df, feature, num_runs=10):
    accuracies = []
    for _ in range(num_runs):
        accuracy = knn_model(df, feature)
        accuracies.append(accuracy)
    average_accuracy = np.mean(accuracies)
    return average_accuracy

# Assume result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile are defined
features = ['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity', 'average_similarity']
datasets = [result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile]
dataset_names = ['Mean', 'Max', 'Median', '75th Percentile']

knn_results = pd.DataFrame(index=dataset_names, columns=features)

for feature in features:
    for dataset, name in zip(datasets, dataset_names):
        avg_acc = evaluate_knn_model_over_runs(dataset, feature)
        knn_results.at[name, feature] = avg_acc

knn_results

Unnamed: 0,description_emb_similarity,images_df_similarity,processed_articles_similarity,average_similarity
Mean,0.53,0.57,0.58,0.525
Max,0.445,0.58,0.545,0.6
Median,0.54,0.46,0.475,0.425
75th Percentile,0.6,0.61,0.58,0.58


In [63]:
data = results.copy()
df = pd.DataFrame(data, index=['Mean', 'Max', 'Median', '75th Percentile'])
df.reset_index(inplace=True)
df.rename(columns={'index': 'data'}, inplace=True)
df["Model"] = "SVC"
results = df.copy()


data = logreg_results.copy()
df = pd.DataFrame(data, index=['Mean', 'Max', 'Median', '75th Percentile'])
df.reset_index(inplace=True)
df.rename(columns={'index': 'data'}, inplace=True)
df["Model"] = "LR"
logreg_results = df.copy()


data = rf_results.copy()
df = pd.DataFrame(data, index=['Mean', 'Max', 'Median', '75th Percentile'])
df.reset_index(inplace=True)
df.rename(columns={'index': 'data'}, inplace=True)
df["Model"] = "RF"
rf_results = df.copy()


data = xgb_results.copy()
df = pd.DataFrame(data, index=['Mean', 'Max', 'Median', '75th Percentile'])
df.reset_index(inplace=True)
df.rename(columns={'index': 'data'}, inplace=True)
df["Model"] = "XGB"
xgb_results = df.copy()



data = knn_results.copy()
df = pd.DataFrame(data, index=['Mean', 'Max', 'Median', '75th Percentile'])
df.reset_index(inplace=True)
df.rename(columns={'index': 'data'}, inplace=True)
df["Model"] = "KNN"
knn_results = df.copy()


# Preparation and export of the results of the binary calculation and the models

In [64]:
# Concatenate all the DataFrames
combined_df = pd.concat([results, logreg_results, rf_results, xgb_results, knn_results], axis=0)

# Reset index to make it more structured
combined_df.reset_index(inplace=True, drop=True)

# Reorder columns to have "Title" first
combined_df = combined_df[["Model"] + list(combined_df.columns[:-1])]

lift_mean["Lift_Type"] = "Mean"
lift_max["Lift_Type"] = "Max"
lift_median["Lift_Type"] = "Median"
lift_75th_percentile["Lift_Type"] = "75th Percentile"

# Combine all the lift DataFrames
df = pd.concat([lift_mean, lift_max, lift_median, lift_75th_percentile], axis=0)
df['dim'] = df.index
combined_lift_df = df[["Lift_Type", "dim","precision", "recall", "lift"]]

In [65]:
# Mount Google Drive
drive.mount('/content/gdrive')

# Define the path to save the CSV file
path = "/content/gdrive/MyDrive/" + str(test_article_id) + "combined_results.csv"

# Export the combined results
combined_df.to_csv(path, index=False)

# Append the lift results to the same CSV file
combined_lift_df.to_csv(path, mode='a', index=False)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# To run the code on hundreds of products - we put all the commands into a function and ran in a loop on the list we selected, In the rest of the notebook, the code appears individually for one product - not within a function

In [66]:
"""
def func(number):
  import pandas as pd
  import numpy as np
  from numpy.linalg import norm
  import random
  import datetime as dt
  from sklearn.metrics.pairwise import cosine_similarity
  import itertools
  from sklearn.metrics import precision_score, recall_score
  import ast
  from google.colab import drive

  from sklearn.model_selection import train_test_split
  from sklearn.svm import SVC
  from sklearn.metrics import accuracy_score, classification_report


  path = '/content/gdrive/MyDrive/'

  drive.mount('/content/gdrive')
  test_article_id = number
  number = str(number)+"_"
  filtered_description_emb = pd.read_csv(path + number + 'filtered_description_emb.csv', index_col=0)
  filtered_description_emb2 = pd.read_csv(path + number + 'filtered_description_emb2.csv', index_col=0)
  #filtered_description_emb2_avg_vector = pd.read_csv(path + number + 'filtered_description_emb2_avg_vector.csv', index_col=0)

  filtered_images_df = pd.read_csv(path + number + 'filtered_images_df.csv', index_col=0)
  filtered_images_df2 = pd.read_csv(path + number + 'filtered_images_df2.csv', index_col=0)
  #filtered_images_df2_avg_vector = pd.read_csv(path + number + 'filtered_images_df2_avg_vector.csv', index_col=0)

  filtered_processed_articles = pd.read_csv(path + number + 'filtered_processed_articles.csv', index_col=0)
  filtered_processed_articles2 = pd.read_csv(path + number + 'filtered_processed_articles2.csv', index_col=0)
  #filtered_processed_articles2_avg_vector = pd.read_csv(path + number + 'filtered_processed_articles2_avg_vector.csv', index_col=0)

  filtered_grouped_data = pd.read_csv(path + number + 'filtered_grouped_data.csv', index_col=0)

  vec2 = filtered_description_emb2.iloc[0].values

  # Define a function to compute cosine similarity
  def cosine_similarity(vec1, vec2):
      dot_product = np.dot(vec1, vec2)
      norm_vec1 = np.linalg.norm(vec1)
      norm_vec2 = np.linalg.norm(vec2)
      return dot_product / (norm_vec1 * norm_vec2)

  # Compute cosine similarity for each row in filtered_description_emb
  similarities_description_emb = filtered_description_emb.apply(lambda row: cosine_similarity(row.values, vec2), axis=1)

  #### scaler
  min_value = similarities_description_emb.min()
  max_value = similarities_description_emb.max()
  similarities_description_emb = (similarities_description_emb - min_value) / (max_value - min_value)



  vec2 = filtered_images_df2.iloc[0].values

  # Define a function to compute cosine similarity
  def cosine_similarity(vec1, vec2):
      dot_product = np.dot(vec1, vec2)
      norm_vec1 = np.linalg.norm(vec1)
      norm_vec2 = np.linalg.norm(vec2)
      return dot_product / (norm_vec1 * norm_vec2)

  # Compute cosine similarity for each row in filtered_description_emb
  similarities_images_df = filtered_images_df.apply(lambda row: cosine_similarity(row.values, vec2), axis=1)

  #### scaler
  min_value = similarities_images_df.min()
  max_value = similarities_images_df.max()
  similarities_images_df = (similarities_images_df - min_value) / (max_value - min_value)
  ####



  # Extract the single vector from filtered_description_emb2
  vec2 = filtered_processed_articles2.iloc[0].values

  # Define a function to compute cosine similarity
  def cosine_similarity(vec1, vec2):
      dot_product = np.dot(vec1, vec2)
      norm_vec1 = np.linalg.norm(vec1)
      norm_vec2 = np.linalg.norm(vec2)
      return dot_product / (norm_vec1 * norm_vec2)

  # Compute cosine similarity for each row in filtered_description_emb
  similarities_processed_articles = filtered_processed_articles.apply(lambda row: cosine_similarity(row.values, vec2), axis=1)

  #### scaler
  min_value = similarities_processed_articles.min()
  max_value = similarities_processed_articles.max()
  similarities_processed_articles = (similarities_processed_articles - min_value) / (max_value - min_value)
  ####


  df_desc_emb = pd.DataFrame(similarities_description_emb)
  df_images = pd.DataFrame(similarities_images_df)
  df_articles = pd.DataFrame(similarities_processed_articles)

  # Rename columns for consistency
  df_articles.index.name = 'product_id'

  # Merge the dataframes on 'product_id'
  df_merged = pd.merge(df_desc_emb, df_images, on='product_id', how='outer')
  df_merged = pd.merge(df_merged, df_articles, on='product_id', how='outer')

  df_merged = df_merged.rename(columns={'0_x': 'description_emb_similarity', '0_y': 'images_df_similarity', 0: 'processed_articles_similarity'})

  # Calculate the average similarity
  df_merged['average_similarity'] = df_merged[['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity']].mean(axis=1)

  # Display the final dataframe
  df_merged


  df = pd.DataFrame(filtered_grouped_data).copy()
  df['unique_article_ids'] = df['unique_article_ids'].apply(ast.literal_eval)

  # Function to remove test_article_id from unique_article_ids and check existence
  def remove_and_check(row):
        if test_article_id in row['unique_article_ids']:
            row['unique_article_ids'].remove(test_article_id)
            row['exist_test'] = 1
        else:
            row['exist_test'] = 0
        return row

  # Apply the function to each row
  df = df.apply(remove_and_check, axis=1)


  def calculate_similarity(customer_data, similarity_data, op):
      customer_id_list = []
      description_emb_similarity_list = []
      images_df_similarity_list = []
      processed_articles_similarity_list = []
      average_similarity_list = []

      for index, row in customer_data.iterrows():
          customer_id = row['customer_id']
          article_ids = row['unique_article_ids']

          # Filter out the similarity data for the given article ids
          if op == "mean":
              relevant_similarities = similarity_data.loc[article_ids].mean()
          elif op == "median":
              relevant_similarities = similarity_data.loc[article_ids].median()
          elif op == "max":
              relevant_similarities = similarity_data.loc[article_ids].max()
          elif op == "75th_percentile":
              relevant_similarities = similarity_data.loc[article_ids].quantile(0.75)

          customer_id_list.append(customer_id)
          description_emb_similarity_list.append(relevant_similarities['description_emb_similarity'])
          images_df_similarity_list.append(relevant_similarities['images_df_similarity'])
          processed_articles_similarity_list.append(relevant_similarities['processed_articles_similarity'])
          average_similarity_list.append(relevant_similarities['average_similarity'])

      return pd.DataFrame({
          'customer_id': customer_id_list,
          'description_emb_similarity': description_emb_similarity_list,
          'images_df_similarity': images_df_similarity_list,
          'processed_articles_similarity': processed_articles_similarity_list,
          'average_similarity': average_similarity_list
      })

  result_df_mean = calculate_similarity(df, df_merged,"mean")
  result_df_with_exist_test_mean = result_df_mean.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')

  result_df_max = calculate_similarity(df, df_merged,"max")
  result_df_with_exist_test_max = result_df_max.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')

  result_df_median = calculate_similarity(df, df_merged,"median")
  result_df_with_exist_test_median = result_df_median.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')

  result_df_75th_percentile = calculate_similarity(df, df_merged,"75th_percentile")
  result_df_with_exist_test_75th_percentile = result_df_75th_percentile.merge(df[['customer_id', 'exist_test']], on='customer_id', how='left')

  def convert_to_binary(df2):
    df = df2.copy()
    avg_description_emb = df['description_emb_similarity'].median()
    avg_images_df = df['images_df_similarity'].median()
    avg_processed_articles = df['processed_articles_similarity'].median()
    avg_average_similarity = df['average_similarity'].median()

    # Apply the logic to assign 0 or 1 based on average
    df['description_emb_similarity'] = np.where(df['description_emb_similarity'] > avg_description_emb, 1, 0)
    df['images_df_similarity'] = np.where(df['images_df_similarity'] > avg_images_df, 1, 0)
    df['processed_articles_similarity'] = np.where(df['processed_articles_similarity'] > avg_processed_articles, 1, 0)
    df['average_similarity'] = np.where(df['average_similarity'] > avg_average_similarity, 1, 0)

    return df

  def calculate_lift_binary(df):
    results = {}
    columns = ['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity', 'average_similarity']

    # Calculate precision, recall, and lift for each similarity column
    for column in columns:
        precision = precision_score(df['exist_test'], df[column])
        recall = recall_score(df['exist_test'], df[column])

        # Calculate prevalence
        prevalence = df['exist_test'].mean()

        # Calculate lift
        lift = precision / prevalence

        results[column] = {'precision': precision, 'recall': recall, 'lift': lift}

    # Display the results as a DataFrame
    pre_rec_lift = pd.DataFrame(results).T
    return pre_rec_lift

  binary_result_df_with_exist_test_mean = convert_to_binary(result_df_with_exist_test_mean)
  binary_result_df_with_exist_test_max = convert_to_binary(result_df_with_exist_test_max)
  binary_result_df_with_exist_test_median = convert_to_binary(result_df_with_exist_test_median)
  binary_result_df_with_exist_test_75th_percentile = convert_to_binary(result_df_with_exist_test_75th_percentile)


  lift_mean = calculate_lift_binary(binary_result_df_with_exist_test_mean)
  lift_max = calculate_lift_binary(binary_result_df_with_exist_test_max)
  lift_median = calculate_lift_binary(binary_result_df_with_exist_test_median)
  lift_75th_percentile = calculate_lift_binary(binary_result_df_with_exist_test_75th_percentile)


  import pandas as pd
  from sklearn.model_selection import train_test_split
  from sklearn.svm import SVC
  from sklearn.metrics import accuracy_score
  import numpy as np

  def svc_model(df, feature):
      X = df[[feature]]
      y = df['exist_test']

      X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

      svc_model = SVC(kernel='poly')
      svc_model.fit(X_train, y_train)

      y_pred = svc_model.predict(X_test)

      accuracy = accuracy_score(y_test, y_pred)
      return accuracy

  def evaluate_model_over_runs(df, feature, num_runs=10):
      accuracies = []
      for _ in range(num_runs):
          accuracy = svc_model(df, feature)
          accuracies.append(accuracy)
      average_accuracy = np.mean(accuracies)
      return average_accuracy

  # Assume result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile are defined
  features = ['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity', 'average_similarity']
  datasets = [result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile]
  dataset_names = ['Mean', 'Max', 'Median', '75th Percentile']

  results = pd.DataFrame(index=dataset_names, columns=features)

  for feature in features:
      for dataset, name in zip(datasets, dataset_names):
          avg_acc = evaluate_model_over_runs(dataset, feature)
          results.at[name, feature] = avg_acc

  import pandas as pd
  from sklearn.model_selection import train_test_split
  from sklearn.linear_model import LogisticRegression
  from sklearn.metrics import accuracy_score
  import numpy as np

  def logistic_regression_model(df, feature):
      X = df[[feature]]
      y = df['exist_test']

      X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

      logreg_model = LogisticRegression()
      logreg_model.fit(X_train, y_train)

      y_pred_logreg = logreg_model.predict(X_test)

      accuracy_logreg = accuracy_score(y_test, y_pred_logreg)

      return accuracy_logreg

  def evaluate_logreg_model_over_runs(df, feature, num_runs=10):
      accuracies = []
      for _ in range(num_runs):
          accuracy = logistic_regression_model(df, feature)
          accuracies.append(accuracy)
      average_accuracy = np.mean(accuracies)
      return average_accuracy

  # Assume result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile are defined
  features = ['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity', 'average_similarity']
  datasets = [result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile]
  dataset_names = ['Mean', 'Max', 'Median', '75th Percentile']

  logreg_results = pd.DataFrame(index=dataset_names, columns=features)

  for feature in features:
      for dataset, name in zip(datasets, dataset_names):
          avg_acc = evaluate_logreg_model_over_runs(dataset, feature)
          logreg_results.at[name, feature] = avg_acc

  from sklearn.ensemble import RandomForestClassifier
  from sklearn.model_selection import train_test_split
  from sklearn.metrics import accuracy_score
  import pandas as pd
  import numpy as np

  def random_forest_model(df, feature):
      X = df[[feature]]
      y = df['exist_test']

      X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

      rf_model = RandomForestClassifier()
      rf_model.fit(X_train, y_train)

      y_pred_rf = rf_model.predict(X_test)

      accuracy_rf = accuracy_score(y_test, y_pred_rf)
      return accuracy_rf

  def evaluate_rf_model_over_runs(df, feature, num_runs=10):
      accuracies = []
      for _ in range(num_runs):
          accuracy = random_forest_model(df, feature)
          accuracies.append(accuracy)
      average_accuracy = np.mean(accuracies)
      return average_accuracy

  # Assume result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile are defined
  features = ['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity', 'average_similarity']
  datasets = [result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile]
  dataset_names = ['Mean', 'Max', 'Median', '75th Percentile']

  rf_results = pd.DataFrame(index=dataset_names, columns=features)

  for feature in features:
      for dataset, name in zip(datasets, dataset_names):
          avg_acc = evaluate_rf_model_over_runs(dataset, feature)
          rf_results.at[name, feature] = avg_acc

  from xgboost import XGBClassifier
  from sklearn.model_selection import train_test_split
  from sklearn.metrics import accuracy_score
  import pandas as pd
  import numpy as np

  def xgboost_model(df, feature):
      X = df[[feature]]
      y = df['exist_test']

      X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

      xgb_model = XGBClassifier()
      xgb_model.fit(X_train, y_train)

      y_pred_xgb = xgb_model.predict(X_test)

      accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
      return accuracy_xgb

  def evaluate_xgb_model_over_runs(df, feature, num_runs=10):
      accuracies = []
      for _ in range(num_runs):
          accuracy = xgboost_model(df, feature)
          accuracies.append(accuracy)
      average_accuracy = np.mean(accuracies)
      return average_accuracy

  # Assume result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile are defined
  features = ['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity', 'average_similarity']
  datasets = [result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile]
  dataset_names = ['Mean', 'Max', 'Median', '75th Percentile']

  xgb_results = pd.DataFrame(index=dataset_names, columns=features)

  for feature in features:
      for dataset, name in zip(datasets, dataset_names):
          avg_acc = evaluate_xgb_model_over_runs(dataset, feature)
          xgb_results.at[name, feature] = avg_acc

  from sklearn.neighbors import KNeighborsClassifier
  from sklearn.model_selection import train_test_split
  from sklearn.metrics import accuracy_score
  import pandas as pd
  import numpy as np

  def knn_model(df, feature):
      X = df[[feature]]
      y = df['exist_test']

      X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

      knn_model = KNeighborsClassifier()
      knn_model.fit(X_train, y_train)

      y_pred_knn = knn_model.predict(X_test)

      accuracy_knn = accuracy_score(y_test, y_pred_knn)
      return accuracy_knn

  def evaluate_knn_model_over_runs(df, feature, num_runs=10):
      accuracies = []
      for _ in range(num_runs):
          accuracy = knn_model(df, feature)
          accuracies.append(accuracy)
      average_accuracy = np.mean(accuracies)
      return average_accuracy

  # Assume result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile are defined
  features = ['description_emb_similarity', 'images_df_similarity', 'processed_articles_similarity', 'average_similarity']
  datasets = [result_df_with_exist_test_mean, result_df_with_exist_test_max, result_df_with_exist_test_median, result_df_with_exist_test_75th_percentile]
  dataset_names = ['Mean', 'Max', 'Median', '75th Percentile']

  knn_results = pd.DataFrame(index=dataset_names, columns=features)

  for feature in features:
      for dataset, name in zip(datasets, dataset_names):
          avg_acc = evaluate_knn_model_over_runs(dataset, feature)
          knn_results.at[name, feature] = avg_acc

  data = results.copy()
  df = pd.DataFrame(data, index=['Mean', 'Max', 'Median', '75th Percentile'])
  df.reset_index(inplace=True)
  df.rename(columns={'index': 'data'}, inplace=True)
  df["Model"] = "SVC"
  results = df.copy()


  data = logreg_results.copy()
  df = pd.DataFrame(data, index=['Mean', 'Max', 'Median', '75th Percentile'])
  df.reset_index(inplace=True)
  df.rename(columns={'index': 'data'}, inplace=True)
  df["Model"] = "LR"
  logreg_results = df.copy()


  data = rf_results.copy()
  df = pd.DataFrame(data, index=['Mean', 'Max', 'Median', '75th Percentile'])
  df.reset_index(inplace=True)
  df.rename(columns={'index': 'data'}, inplace=True)
  df["Model"] = "RF"
  rf_results = df.copy()


  data = xgb_results.copy()
  df = pd.DataFrame(data, index=['Mean', 'Max', 'Median', '75th Percentile'])
  df.reset_index(inplace=True)
  df.rename(columns={'index': 'data'}, inplace=True)
  df["Model"] = "XGB"
  xgb_results = df.copy()


  data = knn_results.copy()
  df = pd.DataFrame(data, index=['Mean', 'Max', 'Median', '75th Percentile'])
  df.reset_index(inplace=True)
  df.rename(columns={'index': 'data'}, inplace=True)
  df["Model"] = "KNN"
  knn_results = df.copy()


  # Concatenate all the DataFrames
  combined_df = pd.concat([results, logreg_results, rf_results, xgb_results, knn_results], axis=0)

  # Reset index to make it more structured
  combined_df.reset_index(inplace=True, drop=True)

  # Reorder columns to have "Title" first
  combined_df = combined_df[["Model"] + list(combined_df.columns[:-1])]

  lift_mean["Lift_Type"] = "Mean"
  lift_max["Lift_Type"] = "Max"
  lift_median["Lift_Type"] = "Median"
  lift_75th_percentile["Lift_Type"] = "75th Percentile"

  # Combine all the lift DataFrames
  df = pd.concat([lift_mean, lift_max, lift_median, lift_75th_percentile], axis=0)
  df['dim'] = df.index
  combined_lift_df = df[["Lift_Type", "dim","precision", "recall", "lift"]]

  # Define the path to save the CSV file
  path = "/content/gdrive/MyDrive/" + str(test_article_id) + "combined_results.csv"

  # Export the combined results
  combined_df.to_csv(path, index=False)

  # Append the lift results to the same CSV file
  combined_lift_df.to_csv(path, mode='a', index=False)
"""

'\ndef func(number):\n  import pandas as pd\n  import numpy as np\n  from numpy.linalg import norm\n  import random\n  import datetime as dt\n  from sklearn.metrics.pairwise import cosine_similarity\n  import itertools\n  from sklearn.metrics import precision_score, recall_score\n  import ast\n  from google.colab import drive\n\n  from sklearn.model_selection import train_test_split\n  from sklearn.svm import SVC\n  from sklearn.metrics import accuracy_score, classification_report\n\n\n  path = \'/content/gdrive/MyDrive/\'\n\n  drive.mount(\'/content/gdrive\')\n  test_article_id = number\n  number = str(number)+"_"\n  filtered_description_emb = pd.read_csv(path + number + \'filtered_description_emb.csv\', index_col=0)\n  filtered_description_emb2 = pd.read_csv(path + number + \'filtered_description_emb2.csv\', index_col=0)\n  #filtered_description_emb2_avg_vector = pd.read_csv(path + number + \'filtered_description_emb2_avg_vector.csv\', index_col=0)\n\n  filtered_images_df = pd.read_c

In [67]:
"""
for i in range (len(lst)):
  func(lst[i])
  print(i,lst[i])
"""

'\nfor i in range (len(lst)):\n  func(lst[i])\n  print(i,lst[i])\n'