<a href="https://colab.research.google.com/github/dean-daryl/multi-modal-auth-prediction/blob/main/models/Product_Recommendation_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This script demonstrates how to build a product recommendation model using a dataset named 'merged_data.csv'. It uses product features to calculate similarity between products.

Mount dataset from drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Import Required Libraries

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

Retrieve and Load Dataset

In [4]:
merged_data = '/content/drive/MyDrive/data/merged_data.csv'
data = pd.read_csv(merged_data)

# Quick check on the dataset
print("Dataset shape:", data.shape)
print("Columns available:", data.columns.tolist())
print(data.head())

Dataset shape: (295, 24)
Columns available: ['customer_id_new', 'engagement_score', 'purchase_interest_score', 'customer_id_legacy', 'transaction_id', 'purchase_amount', 'purchase_date', 'customer_rating', 'product_category_Books', 'product_category_Clothing', 'product_category_Electronics', 'product_category_Groceries', 'product_category_Sports', 'product_category_Unknown', 'review_sentiment_Negative', 'review_sentiment_Neutral', 'review_sentiment_Positive', 'review_sentiment_Unknown', 'social_media_platform_Facebook', 'social_media_platform_Instagram', 'social_media_platform_LinkedIn', 'social_media_platform_TikTok', 'social_media_platform_Twitter', 'social_media_platform_Unknown']
  customer_id_new  engagement_score  purchase_interest_score  \
0             100              73.0                      4.4   
1             100              73.0                      4.4   
2             100              81.0                      4.4   
3             100              81.0                

Data Preprocessing

In [20]:
# 1. Handle missing values
# We will fill missing categorical values with 'Unknown'
# and numerical values with the median.
product_name = data['product_category_Books']
category = data['product_category_Clothing']
for col in data.columns:
    if data[col].dtype == 'object':  # categorical
        data[col] = data[col].fillna('Unknown')
    else:  # numerical
        data[col] = data[col].fillna(data[col].median())

# 2. Normalize numerical columns (e.g., price, rating, etc.)
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
scaler = MinMaxScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# 3. Combine text-based features for TF-IDF
# If the dataset has product descriptions, categories, etc., combine them.
# Example: combine 'product_category_Books' and 'product_category_Clothing' into a single string.
text_features = data[['product_category_Books', 'product_category_Clothing']].astype(str)
data['combined_text'] = text_features.apply(lambda x: ' '.join(x), axis=1)

Build Product Similarity Matrix

In [21]:
# Convert text features into a numerical representation (TF-IDF)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['combined_text'])

# Compute cosine similarity between products
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

Recommendation Function

In [22]:
def recommend_products(product_name, top_n=5):
    """
    Recommend top N products similar to the given product_name.

    Args:
        product_name (str): Name of the product to find similar items for.
        top_n (int): Number of recommended products to return.

    Returns:
        recommendations (DataFrame): Top N similar products with similarity scores.
    """
    # Check if the product exists
    if product_name not in data['product_name'].values:
        print(f"Product '{product_name}' not found in the dataset.")
        return pd.DataFrame()

    # Get index of the product
    idx = data[data['product_name'] == product_name].index[0]

    # Get similarity scores for this product
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort products by similarity (highest first), excluding the product itself
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1: top_n + 1]

    # Get product indices
    product_indices = [i[0] for i in sim_scores]
    scores = [i[1] for i in sim_scores]

    # Return top N similar products
    recommendations = data.iloc[product_indices][['product_name', 'category']].copy()
    recommendations['similarity_score'] = scores
    return recommendations

Test the Model

In [31]:
# For instance; Recommend similar products to a given product
def recommend_products (product_name, top_n=5):
 sample = data['product_category_Books'].iloc[0]
print(f"\nRecommendations for product: {sample}")
print('recommend_products'(sample, top_n=5))



NameError: name 'sample' is not defined