# Case Study: Predictive Analytics for E-commerce

In [1]:
import random
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd

In [2]:
num_items = 50
categories = ['Electronics', 'Electronics', 'Clothing', 'Home & Kitchen', 'Beauty'] * 10

# Ensure the categories list is long enough
categories = (categories * ((num_items // len(categories)) + 1))[:num_items]

# Generate a range of dates and repeat it enough times to cover num_items
purchase_dates = [f"2023-01-{d:02d}" for d in range(1, 32)]
# Repeat dates to ensure enough entries
purchase_dates = (purchase_dates * ((num_items // len(purchase_dates)) + 1))[:num_items]
# Shuffle dates to simulate randomness
random.shuffle(purchase_dates)

data = {
    'customer_id': range(1, num_items + 1),
    'product_id': random.sample([prod for prod in range(100, 150)] + list(range(100, 110)), num_items),
    'purchase_date': purchase_dates,
    'category': random.choices(categories, k=num_items),  # Use random.choices with replacement
    'price': [random.randint(20, 1000) for _ in range(num_items)],
    'ratings': [round(random.uniform(3.5, 5.0), 1) for _ in range(num_items)],
    'page_views': [random.randint(10, 50) for _ in range(num_items)],
    'time_spent': [random.randint(60, 240) for _ in range(num_items)]
}

df = pd.DataFrame(data)

In [3]:
# df['purchase_date'] = pd.to_datetime(df['purchase_date'])

# def calculate_rfm(df):
#     # Current date
#     current_date = df['purchase_date'].max() + pd.Timedelta(days=1)
    
#     # RFM Calculation
#     rfm = df.groupby('customer_id').agg({
#         'purchase_date': lambda x: (current_date - x.max()).days,
#         'product_id': 'count',
#         'price': 'sum'
#     }).rename(columns={
#         'purchase_date': 'Recency',
#         'product_id': 'Frequency',
#         'price': 'Monetary'
#     }).reset_index()
    
#     return rfm

# def compute_cosine_similarity(rfm):
#     rfm_matrix = rfm[['Recency', 'Frequency', 'Monetary']]
#     cosine_sim = cosine_similarity(rfm_matrix)
#     cosine_sim_df = pd.DataFrame(cosine_sim, index=rfm['customer_id'], columns=rfm['customer_id'])
#     return cosine_sim_df

# def recommend_products(customer_id, df, cosine_sim_df, top_n=3):
#     similar_customers = cosine_sim_df[customer_id].sort_values(ascending=False).index[1:]
    
#     recommended_products = []
#     for similar_customer in similar_customers:
#         products = df[df['customer_id'] == similar_customer]['product_id'].values
#         recommended_products.extend(products)
    
#     recommended_products = list(set(recommended_products) - set(df[df['customer_id'] == customer_id]['product_id'].values))
#     return recommended_products[:top_n]

# # Evaluate Recommendations
# def evaluate_recommendations(df, customer_id, recommended_products):
#     actual_products = df[df['customer_id'] == customer_id]['product_id'].unique()
#     all_products = set(df['product_id'].unique())
#     actual_binary = [1 if product in actual_products else 0 for product in all_products]
#     recommended_binary = [1 if product in recommended_products else 0 for product in all_products]
#     precision = precision_score(actual_binary, recommended_binary)
#     recall = recall_score(actual_binary, recommended_binary)
#     f1 = f1_score(actual_binary, recommended_binary)
#     return precision, recall, f1

# # Evaluate System
# def evaluate_system(df, cosine_sim_df, top_n=3):
#     precision_list = []
#     recall_list = []
#     f1_list = []
#     customer_ids = df['customer_id'].unique()
#     for customer_id in customer_ids:
#         recommended_products = recommend_products(customer_id, df, cosine_sim_df, top_n)
#         precision, recall, f1 = evaluate_recommendations(df, customer_id, recommended_products)
#         precision_list.append(precision)
#         recall_list.append(recall)
#         f1_list.append(f1)
#     avg_precision = np.mean(precision_list)
#     avg_recall = np.mean(recall_list)
#     avg_f1 = np.mean(f1_list)
#     return avg_precision, avg_recall, avg_f1

# # Calculate RFM
# rfm = calculate_rfm(df)

# # Compute Cosine Similarity
# cosine_sim_df = compute_cosine_similarity(rfm)

# # Recommend products for a given customer
# customer_id = 40
# top_n = 5
# recommended_products = recommend_products(customer_id, df, cosine_sim_df, top_n)
# avg_precision, avg_recall, avg_f1 = evaluate_system(df, cosine_sim_df, top_n=3)

# print(f"Recommended products for customer {customer_id}: {recommended_products}")
# print(f"Average Precision: {avg_precision}")
# print(f"Average Recall: {avg_recall}")
# print(f"Average F1-Score: {avg_f1}")

Recommended products for customer 40: [129, 130, 131, 133, 134]
Average Precision: 0.0
Average Recall: 0.0
Average F1-Score: 0.0


In [10]:
df = pd.DataFrame(df)
df['purchase_date'] = pd.to_datetime(df['purchase_date'])

def calculate_rfm(df):
    # Current date
    current_date = df['purchase_date'].max() + pd.Timedelta(days=1)
    
    # RFM Calculation
    rfm = df.groupby('customer_id').agg({
        'purchase_date': lambda x: (current_date - x.max()).days,
        'product_id': 'count',
        'price': 'sum'
    }).rename(columns={
        'purchase_date': 'Recency',
        'product_id': 'Frequency',
        'price': 'Monetary'
    }).reset_index()
    
    return rfm

def compute_cosine_similarity(rfm):
    rfm_matrix = rfm[['Recency', 'Frequency', 'Monetary']]
    cosine_sim = cosine_similarity(rfm_matrix)
    cosine_sim_df = pd.DataFrame(cosine_sim, index=rfm['customer_id'], columns=rfm['customer_id'])
    return cosine_sim_df

def recommend_products(customer_id, df, cosine_sim_df, top_n=3):
    similar_customers = cosine_sim_df[customer_id].sort_values(ascending=False).index[1:]
    
    recommended_products = []
    for similar_customer in similar_customers:
        products = df[df['customer_id'] == similar_customer]['product_id'].values
        recommended_products.extend(products)
    
    recommended_products = list(set(recommended_products) - set(df[df['customer_id'] == customer_id]['product_id'].values))
    return recommended_products[:top_n]



# Calculate RFM
rfm = calculate_rfm(df)

# Compute Cosine Similarity
cosine_sim_df = compute_cosine_similarity(rfm)

# Recommend products for a given customer
customer_id = 1
top_n = 20
recommended_products = recommend_products(customer_id, df, cosine_sim_df, top_n)
print(f"Recommended products for customer {customer_id}: {recommended_products}")


Recommended products for customer 1: [128, 129, 130, 131, 133, 134, 135, 136, 137, 139, 140, 141, 142, 144, 145, 146, 147, 148, 149, 100]


In [8]:
import numpy as np
from numpy.linalg import norm 

A = np.array([1,8])
B = np.array([9,2])

cos_sim = np.dot(A,B)/(norm(A)*norm(B))
print(f"The cosine similarity is: {round(cos_sim,2)}")

The cosine similarity is: 0.34


In [9]:
from sklearn.metrics.pairwise import cosine_similarity

cos_sim = cosine_similarity(A.reshape(1, -1),B.reshape(1, -1))
print(f"The cosine similarity is: {cos_sim}")

The cosine similarity is: [[0.3363364]]
