# Task 2: Lookalike Model

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the three raw CSV inputs from the working directory. The generator is
# unpacked in order, so each name is bound to the file with the matching name.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "Products.csv", "Transactions.csv")
)


## Merge Datasets

Merge Customers.csv, Products.csv, and Transactions.csv to create a consolidated dataset.
This allows us to use both profile and transaction data for similarity calculations.

In [None]:
# One row per transaction, enriched first with the purchased product's columns
# and then with the buying customer's columns. Both joins use pandas' default
# inner join, so transactions without a matching product or customer are dropped.
merged = pd.merge(
    pd.merge(transactions, products, on='ProductID'),
    customers,
    on='CustomerID',
)

## Feature Engineering

Extract meaningful features for each customer, such as:
* Total spending.
* Number of unique products and unique product categories purchased.
* Average price of products purchased.
* Total quantity purchased.
* Region (encoded numerically).

In [None]:
# Collapse the transaction-level table to one row per customer.
customer_features = (
    merged.groupby('CustomerID')
    .agg({
        'TotalValue': 'sum',      # Total spending
        'ProductID': 'nunique',   # Number of unique products purchased
        'Category': 'nunique',    # Number of unique categories purchased
        'Quantity': 'sum',        # Total quantity purchased
        'Region': 'first',        # Region (first value seen for the customer)
    })
    # Average price per unit purchased: the feature list for this section
    # promises an average-price feature, which was previously never computed.
    # It is derived from the aggregates above so it does not depend on how
    # price columns are named after the merge.
    # NOTE(review): assumes TotalValue is the monetary value of Quantity units
    # -- confirm against the Transactions schema.
    # A zero total quantity is mapped to NaN before dividing and the result is
    # filled with 0.0, so no inf/NaN reaches the scaler or the similarity step.
    .assign(
        AvgPrice=lambda agg: (
            agg['TotalValue'] / agg['Quantity'].where(agg['Quantity'] != 0)
        ).fillna(0.0)
    )
    .reset_index()
)

## Normalize Features

Scale the numerical features to a common range with MinMaxScaler (StandardScaler is an alternative) so that features with large raw magnitudes, such as total spending, do not dominate the similarity calculation.


In [None]:
# Build the model input on a separate frame instead of overwriting
# customer_features['Region'] in place: the cell stays idempotent on re-run
# and the original Region labels remain available for inspection.
feature_matrix = customer_features.drop('CustomerID', axis=1)

# Region is a nominal category. Integer category codes would impose an
# arbitrary order (and therefore an arbitrary distance) between regions, so
# expand it into one 0/1 indicator column per region instead.
feature_matrix = pd.get_dummies(feature_matrix, columns=['Region'], dtype=float)

# Rescale every column with MinMaxScaler so that large-magnitude features
# (e.g. total spending) do not dominate the similarity computation.
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(feature_matrix)

## Defining a Similarity Metric

Use Cosine Similarity, which measures how similar two vectors are regardless of magnitude.

In [None]:
# Pairwise cosine similarity between all customers; entry [i, j] compares the
# feature rows at positions i and j of customer_features.
similarity_matrix = cosine_similarity(normalized_features)

n_targets = 20   # how many customers (by row position) receive recommendations
top_n = 3        # how many lookalikes to keep per customer

# Resolve row positions to CustomerIDs once instead of calling .iloc per lookup.
customer_ids = customer_features['CustomerID'].tolist()

# Find the top `top_n` similar customers for the first `n_targets` customers.
# NOTE(review): "first 20" means the first 20 rows of customer_features; these
# are C0001 - C0020 only if each of those customers has at least one
# transaction -- confirm against the data.
lookalike_results = {}
for i in range(min(n_targets, len(customer_ids))):  # never index past the last row
    # Exclude the customer itself by position rather than by slicing off the
    # top-ranked entry: self is not guaranteed to sort first (ties with other
    # customers, or an all-zero scaled feature row).
    candidates = [
        (j, float(score))  # plain float keeps the saved string free of NumPy reprs
        for j, score in enumerate(similarity_matrix[i])
        if j != i
    ]
    best = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]
    lookalike_results[customer_ids[i]] = [
        (customer_ids[j], score) for j, score in best
    ]

## Save results to Lookalike.csv
Create a dictionary (or DataFrame) with CustomerID as the key and a list of top 3 similar customers with their scores as the value.
Save the results to a file named Lookalike.csv

In [None]:
# One output row per target customer; the list of (CustomerID, score) pairs is
# stored as its string representation in a single column.
output_rows = [
    (target_id, str(matches))
    for target_id, matches in lookalike_results.items()
]
lookalike_df = pd.DataFrame(output_rows, columns=['CustomerID', 'Recommendations'])

# Write without the positional index so the file has exactly the two columns.
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike.csv has been created!")

Lookalike.csv has been created!
