In [20]:
# Import necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [21]:
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

In [22]:
# Merge datasets to create a unified dataset
transactions = transactions.merge(customers, on='CustomerID')
transactions = transactions.merge(products, on='ProductID')

In [23]:
# Preprocessing: Create customer-level features
customer_features = transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',  # Total spending
    'Quantity': 'sum',  # Total quantity purchased
    'ProductID': 'count',  # Number of unique transactions
    'Category': lambda x: x.mode()[0]  # Most frequent category
}).reset_index()

# Rename 'ProductID' count to 'TransactionCount' for clarity
customer_features.rename(columns={'ProductID': 'TransactionCount'}, inplace=True)

# Encode categorical features (Category) using one-hot encoding
customer_features = pd.get_dummies(customer_features, columns=['Category'], drop_first=True)

# Normalize numerical features for similarity calculation
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
normalized_features = scaler.fit_transform(customer_features.drop('CustomerID', axis=1))

# Compute cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(normalized_features)

# Get the top 3 similar customers for each of the first 20 customers
lookalike_data = {}
for idx, customer_id in enumerate(customer_features['CustomerID'][:20]):
    similar_indices = similarity_matrix[idx].argsort()[-4:-1][::-1]  # Top 3 excluding the customer itself
    similar_customers = [
        (customer_features.iloc[i]['CustomerID'], similarity_matrix[idx][i])
        for i in similar_indices]
    lookalike_data[customer_id] = similar_customers

# Convert the lookalike data into a DataFrame for export
lookalike_list = [
    {'cust_id': customer, 'lookalike': lookalike, 'score': score}
    for customer, lookalikes in lookalike_data.items()
    for lookalike, score in lookalikes]
lookalike_df = pd.DataFrame(lookalike_list)

# Save the Lookalike Data to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display the lookalike map for verification
print(lookalike_data)

{'C0001': [('C0072', 0.9920353262922637), ('C0055', 0.9909776607327331), ('C0048', 0.9908425603270468)], 'C0002': [('C0029', 0.9998566015358691), ('C0010', 0.9890824589674211), ('C0088', 0.977761607066873)], 'C0003': [('C0160', 0.9765756505857475), ('C0086', 0.9749660452024961), ('C0038', 0.974960216277944)], 'C0004': [('C0075', 0.9930751055565259), ('C0017', 0.9904447082049586), ('C0175', 0.9880458606182625)], 'C0005': [('C0186', 0.9984740104182287), ('C0112', 0.9978769608402586), ('C0007', 0.9918529693356967)], 'C0006': [('C0117', 0.9956446719907337), ('C0168', 0.9508722973745576), ('C0064', 0.8977845048442423)], 'C0007': [('C0120', 0.9974073348127978), ('C0050', 0.9937244109197855), ('C0005', 0.9918529693356967)], 'C0008': [('C0113', 0.9539839133855204), ('C0124', 0.9253194213959769), ('C0012', 0.9019799868058399)], 'C0009': [('C0083', 0.9975450047948013), ('C0198', 0.9906799609117416), ('C0077', 0.9791575685462162)], 'C0010': [('C0029', 0.9896891999847746), ('C0002', 0.989082458967

In [24]:
print(customer_features.columns)

Index(['CustomerID', 'TotalValue', 'Quantity', 'TransactionCount',
       'Category_Clothing', 'Category_Electronics', 'Category_Home Decor'],
      dtype='object')


In [25]:
# Encode categorical features (Category) using one-hot encoding
customer_features = pd.get_dummies(customer_features, columns=['Category_Electronics','Category_Home Decor'], drop_first=True)


In [26]:
# Normalize numerical features for similarity calculation
scaler = StandardScaler()
normalized_features = scaler.fit_transform(customer_features.drop('CustomerID', axis=1))


In [27]:
# Compute cosine similarity
similarity_matrix = cosine_similarity(normalized_features)

In [28]:
# Get the top 3 similar customers for each of the first 20 customers
lookalike_data = {}
for idx, customer_id in enumerate(customer_features['CustomerID'][:20]):
    similar_indices = similarity_matrix[idx].argsort()[-4:-1][::-1]  # Top 3 excluding the customer itself
    similar_customers = [
        (customer_features.iloc[i]['CustomerID'], similarity_matrix[idx][i])
        for i in similar_indices
    ]
    lookalike_data[customer_id] = similar_customers

In [29]:
# Convert the lookalike data into a DataFrame for export
lookalike_list = [
    {'cust_id': customer, 'lookalikes': lookalike, 'score': score}
    for customer, lookalikes in lookalike_data.items()
    for lookalike, score in lookalikes
]

lookalike_df = pd.DataFrame(lookalike_list)

In [30]:
# Save the Lookalike Data to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

In [31]:
# Display the lookalike map for verification
print(lookalike_data)

{'C0001': [('C0072', 0.9920353262922637), ('C0055', 0.9909776607327331), ('C0048', 0.9908425603270468)], 'C0002': [('C0029', 0.9998566015358691), ('C0010', 0.9890824589674211), ('C0088', 0.977761607066873)], 'C0003': [('C0160', 0.9765756505857475), ('C0086', 0.9749660452024961), ('C0038', 0.974960216277944)], 'C0004': [('C0075', 0.9930751055565259), ('C0017', 0.9904447082049586), ('C0175', 0.9880458606182625)], 'C0005': [('C0186', 0.9984740104182287), ('C0112', 0.9978769608402586), ('C0007', 0.9918529693356967)], 'C0006': [('C0117', 0.9956446719907337), ('C0168', 0.9508722973745576), ('C0064', 0.8977845048442423)], 'C0007': [('C0120', 0.9974073348127978), ('C0050', 0.9937244109197855), ('C0005', 0.9918529693356967)], 'C0008': [('C0113', 0.9539839133855204), ('C0124', 0.9253194213959769), ('C0012', 0.9019799868058399)], 'C0009': [('C0083', 0.9975450047948013), ('C0198', 0.9906799609117416), ('C0077', 0.9791575685462162)], 'C0010': [('C0029', 0.9896891999847746), ('C0002', 0.989082458967

In [32]:
from sklearn.metrics import accuracy_score

# Assuming y_true and y_pred are lists of customer IDs representing actual vs recommended customers
# Example:
y_true = ['C0005', 'C0032', 'C0010']  # Actual similar customers for C0001
y_pred = ['C0005', 'C0010', 'C0032']  # Recommended similar customers for C0001

# Compute accuracy (percentage of correct recommendations)
accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 33.33%


In [33]:
# Let's assume 'lookalike_dict' is a dictionary with actual vs predicted lookalikes for each customer
lookalike_dict = {
    'C0001': (['C0005', 'C0032', 'C0010'], ['C0005', 'C0010', 'C0032']),
    'C0002': (['C0009', 'C0045', 'C0012'], ['C0009', 'C0045', 'C0013']),
    # Add more customers as needed
}

# Initialize an accuracy list
accuracies = []

# Loop through the customers
for cust_id, (y_true, y_pred) in lookalike_dict.items():
    accuracy = accuracy_score(y_true, y_pred)
    accuracies.append(accuracy)

# Compute overall accuracy
overall_accuracy = sum(accuracies) / len(accuracies)
print(f'Overall Accuracy: {overall_accuracy * 100:.2f}%')


Overall Accuracy: 50.00%
