In [47]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [3]:
# Read the three raw input tables (customers, products, transactions) from CSV.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/Customers.csv',
        '/content/Products.csv',
        '/content/Transactions.csv',
    )
)

In [4]:
# Preview the first rows of each raw table. The cell's value is a tuple of three
# DataFrames, so the notebook renders it as a single plain-text tuple repr.
customers.head(), products.head(), transactions.head()

(  CustomerID        CustomerName         Region  SignupDate
 0      C0001    Lawrence Carroll  South America  2022-07-10
 1      C0002      Elizabeth Lutz           Asia  2022-02-13
 2      C0003      Michael Rivera  South America  2024-03-07
 3      C0004  Kathleen Rodriguez  South America  2022-10-09
 4      C0005         Laura Weber           Asia  2022-08-15,
   ProductID              ProductName     Category   Price
 0      P001     ActiveWear Biography        Books  169.30
 1      P002    ActiveWear Smartwatch  Electronics  346.30
 2      P003  ComfortLiving Biography        Books   44.12
 3      P004            BookWorld Rug   Home Decor   95.69
 4      P005          TechPro T-Shirt     Clothing  429.31,
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
 0        T00001      C0199      P067  2024-08-25 12:38:23         1   
 1        T00112      C0146      P067  2024-05-27 22:23:54         1   
 2        T00166      C0127      P067  2024-04-25 07:38:55    

In [5]:
# Build one wide table: each transaction row joined to its customer (on CustomerID)
# and then to its product (on ProductID). Both Transactions and Products carry a
# `Price` column, so the result exposes them as Price_x / Price_y.
data = pd.merge(
    pd.merge(transactions, customers, on='CustomerID'),
    products,
    on='ProductID',
)
data.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [44]:
# Feature engineering: collapse the merged transaction table to one row per customer.
# The merged frame is called `data` (see the merge cell); the previous reference to
# `merged_data` only worked with leftover kernel state and fails on a fresh run.
customer_features = data.groupby('CustomerID').agg({
    'TotalValue': 'sum',                # total amount spent by the customer
    'TransactionID': 'count',           # number of transactions
    'Price_y': 'mean',                  # mean product price (Price_y is the Products-side column)
    'Category': lambda x: x.mode()[0]   # most frequent category; on ties, the first value mode() returns
}).reset_index()

customer_features.head()


Unnamed: 0,CustomerID,TotalValue,TransactionID,Price_y,Category
0,C0001,3354.52,5,278.334,Electronics
1,C0002,1862.74,4,208.92,Clothing
2,C0003,2725.38,4,195.7075,Home Decor
3,C0004,5354.88,8,240.63625,Books
4,C0005,2034.24,3,291.603333,Electronics


In [48]:
# Standardise the three numeric features so they contribute on a comparable scale
# to the distance computation that follows. `Category` is not part of this matrix.
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(
    customer_features.loc[:, ['TotalValue', 'TransactionID', 'Price_y']]
)

In [49]:
# Turn pairwise Euclidean distances into similarities in (0, 1]:
# distance 0 maps to 1.0 and larger distances decay towards 0.
similarity_matrix = 1 / (euclidean_distances(customer_features_scaled) + 1)

In [50]:
# Function to get top 3 lookalikes
def get_top_3_lookalikes(customer_id, similarity_matrix, customer_ids):
    """Return the three customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id
        ID of the customer to find lookalikes for. Must be an element of
        ``customer_ids``.
    similarity_matrix
        Row-indexable square collection (e.g. a 2-D array or list of lists)
        where entry ``[i][j]`` is the similarity between ``customer_ids[i]``
        and ``customer_ids[j]``; larger means more similar.
    customer_ids
        List of customer IDs in the same order as the rows and columns of
        ``similarity_matrix``.

    Returns
    -------
    list of (customer_id, score) tuples
        Ordered from most to least similar. Contains at most three entries
        (fewer when there are fewer than three other customers). Ties keep
        the order of ``customer_ids``.

    Raises
    ------
    ValueError
        If ``customer_id`` is not present in ``customer_ids``.
    """
    idx = customer_ids.index(customer_id)
    # Exclude the query customer by position instead of assuming it sorts first.
    # Another customer with identical features has the same maximal score and,
    # with a stable sort, could push the query customer into the returned slice.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return [(customer_ids[other_idx], score) for other_idx, score in candidates[:3]]

In [51]:
# Get top 3 lookalikes for the first 20 customers
customer_ids = customer_features['CustomerID'].tolist()
lookalikes = {customer_id: get_top_3_lookalikes(customer_id, similarity_matrix, customer_ids) for customer_id in customer_ids[:20]}


In [52]:
# Save results to CSV
lookalikes_df = pd.DataFrame.from_dict(lookalikes, orient='index', columns=['Lookalike1', 'Lookalike2', 'Lookalike3'])
lookalikes_df.to_csv('Lookalike.csv', index_label='CustomerID')


In [53]:
# Reload the exported file as a sanity check. Read from the same relative path the
# export cell wrote to ('Lookalike.csv'); the previous absolute '/content/...' path
# only matched when the working directory happened to be /content.
# The (id, score) cells come back as plain strings, not tuples.
model_data = pd.read_csv('Lookalike.csv')
model_data.head()

Unnamed: 0,CustomerID,Lookalike1,Lookalike2,Lookalike3
0,C0001,"('C0164', 0.8719466535767227)","('C0137', 0.8705452914002111)","('C0152', 0.859279969149605)"
1,C0002,"('C0029', 0.95699076813147)","('C0142', 0.7641310123148944)","('C0031', 0.725816695181979)"
2,C0003,"('C0142', 0.7880098760631638)","('C0038', 0.6840122342448944)","('C0176', 0.6645243660121166)"
3,C0004,"('C0162', 0.7277670304901154)","('C0017', 0.7183266248538716)","('C0068', 0.714262923584333)"
4,C0005,"('C0061', 0.9457882107477658)","('C0167', 0.8655523049116419)","('C0120', 0.7962002771179394)"


In [54]:
# Example true labels (placeholder values, one per customer in lookalikes_df).
# NOTE(review): these are not derived from the data, so the metrics computed from
# them are illustrative only — replace with real ground truth before relying on them.
true_labels = [
    1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
    0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
]
# Predict 1 when the best lookalike's similarity score is above 0.5, else 0.
predicted_labels = [int(score > 0.5) for _, score in lookalikes_df['Lookalike1']]

In [55]:
# Score the thresholded predictions against the labels with the four standard
# classification metrics, in the order: precision, recall, F1, accuracy.
precision, recall, f1, accuracy = (
    metric(true_labels, predicted_labels)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

In [56]:
# Report the four metrics, one per line.
print(
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1 Score: {f1}',
    f'Accuracy: {accuracy}',
    sep='\n',
)


Precision: 0.55
Recall: 1.0
F1 Score: 0.7096774193548387
Accuracy: 0.55


In [57]:
# Quality of recommendations and similarity scores
lookalikes_dict = {customer_id: get_top_3_lookalikes(customer_id, similarity_matrix, customer_ids) for customer_id in customer_ids[:20]} # Recreating the lookalikes dictionary

for customer_id, lookalikes_list in lookalikes_dict.items(): # Using lookalikes_dict in the loop
    print(f'Customer {customer_id} lookalikes:')
    for lookalike_id, score in lookalikes_list:
        print(f'  Lookalike: {lookalike_id}, Similarity Score: {score}')

Customer C0001 lookalikes:
  Lookalike: C0164, Similarity Score: 0.8719466535767227
  Lookalike: C0137, Similarity Score: 0.8705452914002111
  Lookalike: C0152, Similarity Score: 0.859279969149605
Customer C0002 lookalikes:
  Lookalike: C0029, Similarity Score: 0.95699076813147
  Lookalike: C0142, Similarity Score: 0.7641310123148944
  Lookalike: C0031, Similarity Score: 0.725816695181979
Customer C0003 lookalikes:
  Lookalike: C0142, Similarity Score: 0.7880098760631638
  Lookalike: C0038, Similarity Score: 0.6840122342448944
  Lookalike: C0176, Similarity Score: 0.6645243660121166
Customer C0004 lookalikes:
  Lookalike: C0162, Similarity Score: 0.7277670304901154
  Lookalike: C0017, Similarity Score: 0.7183266248538716
  Lookalike: C0068, Similarity Score: 0.714262923584333
Customer C0005 lookalikes:
  Lookalike: C0061, Similarity Score: 0.9457882107477658
  Lookalike: C0167, Similarity Score: 0.8655523049116419
  Lookalike: C0120, Similarity Score: 0.7962002771179394
Customer C0006 