In [1]:
#Task 2: Lookalike Model

import pandas as pd  # Importing the pandas library for data manipulation
from sklearn.metrics.pairwise import cosine_similarity  # Importing cosine similarity for calculating similarity
from sklearn.preprocessing import StandardScaler  # Importing StandardScaler for feature scaling

# Folder containing the three input CSV files; adjust for your machine.
# Kept as a plain string because the output path is later built by
# concatenating onto it.
downloads_path = r'C:\Users\amank\Downloads'  # Update this path as necessary

# Read the customer, product and transaction tables (in that order) from the
# folder above. File names are joined with a Windows path separator.
customers, products, transactions = (
    pd.read_csv(f'{downloads_path}\\{csv_name}')
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Attach customer attributes to every transaction, then attach product
# attributes. Both joins use pandas' default inner join, so transactions whose
# CustomerID or ProductID has no match are dropped.
transactions_with_customers = transactions.merge(customers, on='CustomerID')

# Columns present on both sides of this join get the (default) '_x' / '_y'
# suffixes: '_x' for the transaction side, '_y' for the product side.
merged_data = transactions_with_customers.merge(
    products,
    on='ProductID',
    suffixes=('_x', '_y'),
)

# Debugging aid: show which columns (including suffixed ones) the merge produced.
print("Merged DataFrame columns:", merged_data.columns)

# Build one feature row per customer from the merged transaction table.
transactions_by_customer = merged_data.groupby('CustomerID')
features = transactions_by_customer.agg(
    Quantity=('Quantity', 'sum'),      # total units bought by the customer
    TotalValue=('TotalValue', 'sum'),  # total amount spent by the customer
    Price_y=('Price_y', 'mean'),       # mean product-table price ('_y' side of the merge)
    Region=('Region', 'first'),        # region taken from the customer's first row
).reset_index()

# Turn 'Region' into indicator columns so it can take part in the similarity.
# NOTE(review): drop_first=True removes the indicator for the first region, so
# that region is represented by all-zero indicators. For a similarity model
# this treats regions unevenly - confirm this is intended.
features = pd.get_dummies(features, columns=['Region'], drop_first=True)

# Standardize every feature column; CustomerID is an identifier, not a
# feature, so it is left out of the matrix.
feature_matrix = features.drop(columns=['CustomerID'])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_matrix)

# Calculate similarity matrix
# Entry [i][j] is the cosine similarity between the scaled feature rows of
# customers i and j (row order matches `features`).
similarity_matrix = cosine_similarity(scaled_features)

# How many customers to score and how many lookalikes to keep for each.
num_target_customers = 20
top_n = 3

# Positional array of IDs, so lookups below do not depend on the DataFrame's
# index labels happening to equal row positions.
customer_ids = features['CustomerID'].to_numpy()

# Prepare to store lookalike results
# Maps CustomerID -> list of (LookalikeID, similarity score), most similar first.
lookalikes = {}

# Get the top 3 lookalikes for the first 20 customers in `features`
# (C0001 to C0020 when each of them has at least one transaction).
# The range is capped at the number of rows so a smaller dataset does not
# raise IndexError.
for i in range(min(num_target_customers, len(customer_ids))):
    # Rank every customer from most to least similar to customer i.
    ranked = similarity_matrix[i].argsort()[::-1]

    # Remove customer i by index instead of assuming it sorts last: a tie at
    # the top (e.g. a customer with identical features, or floating-point
    # rounding) or an all-zero feature row can place it elsewhere, which would
    # otherwise list the customer as its own lookalike.
    similar_indices = ranked[ranked != i][:top_n]

    lookalikes[customer_ids[i]] = [
        (customer_ids[j], similarity_matrix[i][j]) for j in similar_indices
    ]

# Flatten the {customer: [(lookalike, score), ...]} mapping into one record
# per (customer, lookalike) pair, preserving the mapping's order.
lookalike_records = [
    {
        'CustomerID': customer_id,
        'LookalikeID': lookalike_id,
        'SimilarityScore': score,
    }
    for customer_id, matches in lookalikes.items()
    for lookalike_id, score in matches
]

# Tabulate the records: one row per pair.
lookalike_df = pd.DataFrame(lookalike_records)

# Write the table next to the input files, with a header row and without the
# DataFrame's index column.
output_path = downloads_path + '\\Lookalike.csv'
lookalike_df.to_csv(output_path, header=True, index=False)

# Echo the results for a quick visual check.
print(lookalike_df)

Merged DataFrame columns: Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')
   CustomerID LookalikeID  SimilarityScore
0       C0001       C0137         0.992318
1       C0001       C0191         0.989105
2       C0001       C0011         0.982062
3       C0002       C0088         0.984892
4       C0002       C0142         0.978721
5       C0002       C0043         0.963602
6       C0003       C0190         0.986157
7       C0003       C0147         0.970992
8       C0003       C0174         0.959645
9       C0004       C0113         0.986737
10      C0004       C0165         0.968794
11      C0004       C0012         0.963296
12      C0005       C0140         0.989953
13      C0005       C0123         0.977757
14      C0005       C0186         0.976210
15      C0006       C0048         0.983537
16      C0006       C