# Task 2: Lookalike Model

In [24]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the two raw inputs: one table of customers and one table of transactions.
df1 = pd.read_csv('Customers.csv')
df2 = pd.read_csv('Transactions.csv')

# Join on CustomerID (default inner join), giving one row per transaction
# annotated with the columns of the customer who made it.
merged_df = df1.merge(df2, on='CustomerID')


def _join_as_csv(values):
    """Return the values of a group, cast to str, joined by commas in row order."""
    return ','.join(values.astype(str))


# Collapse the transaction rows into a single profile row per customer:
#   Region / SignupDate -> value from the customer's first row
#   ProductID           -> comma-separated list of every purchased product
#   Quantity / TotalValue -> totals over all of the customer's transactions
customer_profile = (
    merged_df
    .groupby('CustomerID')
    .agg(
        Region=('Region', 'first'),
        SignupDate=('SignupDate', 'first'),
        ProductID=('ProductID', _join_as_csv),
        Quantity=('Quantity', 'sum'),
        TotalValue=('TotalValue', 'sum'),
    )
    .reset_index()
)

# Encode categorical variables (Region and ProductID)
# Region: each customer has exactly one region, so a plain one-hot encoding fits.
encoder = OneHotEncoder()
encoded_region = encoder.fit_transform(customer_profile[['Region']]).toarray()

# ProductID: each customer holds a comma-separated list of purchased products.
# One-hot encoding that whole string would create one category per distinct
# basket, so two customers would only match when their purchase sequences are
# identical. Instead build a multi-hot matrix with one 0/1 column per individual
# product, so that customers sharing products get overlapping vectors.
# `.to_numpy()` drops the product-name column labels on purpose: every frame
# concatenated below then has integer column labels, like the other two parts.
encoded_products = customer_profile['ProductID'].str.get_dummies(sep=',').to_numpy()

# Normalize the numerical features (Quantity and TotalValue) to the [0, 1] range
# so neither dominates the similarity purely because of its unit.
scaler = MinMaxScaler()
normalized_values = scaler.fit_transform(customer_profile[['Quantity', 'TotalValue']])

# Combine all the features into one final feature matrix:
# [region one-hot | product multi-hot | scaled Quantity, TotalValue]
features = pd.concat([
    pd.DataFrame(encoded_region, index=customer_profile.index),
    pd.DataFrame(encoded_products, index=customer_profile.index),
    pd.DataFrame(normalized_values, index=customer_profile.index)
], axis=1)

# Pairwise cosine similarity between all customers. All features are
# non-negative here, so scores lie in [0, 1]: 1 means the two feature vectors
# point in the same direction, 0 means they share no non-zero feature.
similarity_matrix = cosine_similarity(features)

# Find the top 3 most similar customers for each target customer.
# Targets are the customers with IDs 'C0001' .. 'C0020'. The set is built once,
# outside the loop, so the membership test is O(1) per customer.
target_ids = {f'C{i:04d}' for i in range(1, 21)}

customer_ids = customer_profile['CustomerID']
# Plain list of IDs in row order: row i of similarity_matrix belongs to
# id_by_position[i]. Looking IDs up by position (rather than by Series label)
# keeps this correct even if customer_profile does not have a default index.
id_by_position = customer_ids.tolist()
lookalike_results = {}

for idx, customer_id in enumerate(id_by_position):
    if customer_id not in target_ids:
        continue
    # Pair every *other* customer with its similarity to the current one and
    # rank by score, highest first (ties keep their original row order).
    ranked = sorted(
        (
            (id_by_position[other_idx], score)
            for other_idx, score in enumerate(similarity_matrix[idx])
            if other_idx != idx
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Keep the 3 best matches as (CustomerID, score) tuples.
    lookalike_results[customer_id] = ranked[:3]

# Save the results to a CSV file.
# Each row holds a target customer ID and the list of its top 3 lookalikes as
# (CustomerID, similarity score rounded to 4 decimals) tuples.
OUTPUT_CSV = 'Lookalike.csv'

lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_results.keys()),
    'Lookalikes': [
        [(cust_id, round(score, 4)) for cust_id, score in matches]
        for matches in lookalike_results.values()
    ],
})

lookalike_df.to_csv(OUTPUT_CSV, index=False)

# Report the file that was actually written: the path comes from the same
# constant used by to_csv, so the message cannot drift from the real filename.
print(f"Lookalike model results saved to '{OUTPUT_CSV}'")


Lookalike model results saved to 'Gaurav_Pradhan_Lookalike.csv'


In [20]:
# Create custom encoders for 'Region' and 'ProductID'.
# Each encoder maps a label to its column position in the encoded vector.
unique_regions = customer_profile['Region'].unique()
region_encoder = {region: idx for idx, region in enumerate(unique_regions)}

# customer_profile['ProductID'] is built by joining the IDs after astype(str),
# so the lookup keys are stringified the same way here. Without this, product
# IDs stored with a non-string dtype would never match the split strings.
unique_products = merged_df['ProductID'].astype(str).unique()
product_encoder = {product: idx for idx, product in enumerate(unique_products)}

# Function to encode features
def encode_features(row, region_encoder, product_encoder):
    """Encode one customer profile as a flat 0/1 list.

    Parameters
    ----------
    row : mapping
        Must provide ``row['Region']`` (a single label) and ``row['ProductID']``
        (a comma-separated string of product IDs).
    region_encoder : dict
        Maps a region label to its position in the region part of the vector.
    product_encoder : dict
        Maps a product ID to its position in the product part of the vector.

    Returns
    -------
    list of int
        ``len(region_encoder)`` region flags followed by ``len(product_encoder)``
        product flags. Labels missing from an encoder are silently ignored.
    """

    def _flags(labels, encoder):
        # Start from all zeros and switch on the slot of every known label.
        flags = [0] * len(encoder)
        for label in labels:
            if label in encoder:
                flags[encoder[label]] = 1
        return flags

    region_part = _flags([row['Region']], region_encoder)
    product_part = _flags(row['ProductID'].split(','), product_encoder)
    return region_part + product_part

# Categorical part: run encode_features on every profile row, which yields one
# 0/1 list per customer, then expand those lists into one column per flag.
profile_vectors = customer_profile.apply(
    encode_features, axis=1, args=(region_encoder, product_encoder)
)
categorical_part = pd.DataFrame(profile_vectors.tolist())

# Numerical part: min-max scale Quantity and TotalValue. The fitted `scaler`
# is kept at module level because recommend_similar_customers reuses it.
scaler = MinMaxScaler()
normalized_values = scaler.fit_transform(customer_profile[['Quantity', 'TotalValue']])
numeric_part = pd.DataFrame(normalized_values, index=customer_profile.index)

# Final matrix: [region flags | product flags | scaled Quantity, TotalValue]
encoded_profiles = pd.concat([categorical_part, numeric_part], axis=1)

# Function to recommend similar customers
def recommend_similar_customers(user_data, top_n=3):
    """Return the existing customers most similar to an ad-hoc user profile.

    Relies on the module-level ``region_encoder``, ``product_encoder``,
    ``scaler``, ``encoded_profiles`` and ``customer_profile`` built earlier in
    the notebook.

    Parameters
    ----------
    user_data : mapping
        ``'Region'``     - region label (unknown regions leave the region part all zero).
        ``'ProductID'``  - iterable of product IDs, or a single comma-separated string.
        ``'Quantity'``   - total quantity purchased.
        ``'TotalValue'`` - total transaction value.
    top_n : int, optional
        Number of matches to return (default 3).

    Returns
    -------
    list of (CustomerID, score)
        Best matches first, with the cosine similarity rounded to 4 decimals.
    """
    # Encode user region
    region_vector = [0] * len(region_encoder)
    if user_data['Region'] in region_encoder:
        region_vector[region_encoder[user_data['Region']]] = 1

    # Encode user products
    user_products = user_data['ProductID']
    if isinstance(user_products, str):
        # A bare string would otherwise be iterated character by character
        # and silently match nothing.
        user_products = user_products.split(',')
    product_vector = [0] * len(product_encoder)
    for product in user_products:
        # Try the ID exactly as given first; otherwise tolerate surrounding
        # whitespace such as the ' P012' produced by typing "P011, P012".
        if product not in product_encoder and isinstance(product, str):
            product = product.strip()
        if product in product_encoder:
            product_vector[product_encoder[product]] = 1

    # Scale the numeric inputs with the scaler fitted on the customer profiles.
    # It was fitted on a DataFrame with these column names, so pass the same
    # names to keep the feature order explicit and avoid scikit-learn's
    # missing-feature-names warning.
    numeric_input = pd.DataFrame(
        [[user_data['Quantity'], user_data['TotalValue']]],
        columns=['Quantity', 'TotalValue'],
    )
    user_normalized_values = scaler.transform(numeric_input)

    # Combine all user features in the same layout as encoded_profiles
    user_features = region_vector + product_vector + user_normalized_values.flatten().tolist()

    # Similarity of every stored profile to the user (one score per customer row)
    scores = cosine_similarity(encoded_profiles, [user_features]).flatten()

    # Rank row positions by score, highest first, and keep the best top_n
    best = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)[:top_n]

    # Map row positions back to customer IDs
    return [
        (customer_profile.iloc[idx]['CustomerID'], round(score, 4))
        for idx, score in best
    ]

# Main script: Take user input
if __name__ == "__main__":
    print("Welcome to the Lookalike Model!")
    region = input("Enter your region: ").strip()
    raw_product_ids = input("Enter purchased product IDs (comma-separated): ")
    # Strip each ID individually and drop empty entries, so input such as
    # "P011, P012," yields ['P011', 'P012'] instead of ['P011', ' P012', ''].
    product_ids = [
        product.strip() for product in raw_product_ids.split(',') if product.strip()
    ]
    quantity = float(input("Enter total quantity purchased: "))
    total_value = float(input("Enter total transaction value: "))

    # Create user data dictionary in the shape recommend_similar_customers expects
    user_data = {
        "Region": region,
        "ProductID": product_ids,
        "Quantity": quantity,
        "TotalValue": total_value
    }

    # Get recommendations
    recommendations = recommend_similar_customers(user_data)

    # Display recommendations
    print("\nTop 3 similar customers:")
    for cust_id, score in recommendations:
        print(f"CustomerID: {cust_id}, Similarity Score: {score}")


Welcome to the Lookalike Model!
Enter your region: South
Enter purchased product IDs (comma-separated): P011
Enter total quantity purchased: 4
Enter total transaction value: 300

Top 3 similar customers:
CustomerID: C0134, Similarity Score: 0.4172
CustomerID: C0148, Similarity Score: 0.4133
CustomerID: C0119, Similarity Score: 0.4122


