<a href="https://colab.research.google.com/github/gowthamich35/DataScienceZeotap/blob/main/Chunchu_Gowthami_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler  # Ensure this is imported

def find_lookalikes(similarity_matrix, customer_ids, n_customers=20, top_n=3):
    """Pick the most similar customers for the first ``n_customers`` rows.

    Args:
        similarity_matrix: Square NumPy array where entry ``[i, j]`` is the
            similarity between customer ``i`` and customer ``j``.
        customer_ids: Customer identifiers, in the same order as the rows of
            ``similarity_matrix``.
        n_customers: How many leading rows to produce lookalikes for. Capped
            at the number of available customers.
        top_n: Maximum number of lookalikes to return per customer.

    Returns:
        Dict mapping each customer ID to a list of
        ``(lookalike_customer_id, similarity_score)`` tuples, ordered from
        most to least similar. A customer is never their own lookalike.
    """
    ids = list(customer_ids)
    lookalikes = {}

    # Cap the range so a small dataset does not raise IndexError.
    for row in range(min(n_customers, len(ids))):
        # astype() returns a copy, so the caller's matrix is left untouched.
        scores = similarity_matrix[row].astype(float)
        # -inf (not -1) pushes the customer behind every real cosine score,
        # including a legitimate similarity of exactly -1.
        scores[row] = float('-inf')

        ranked = scores.argsort()[::-1]
        # Drop the customer explicitly so they cannot appear even when there
        # are fewer than ``top_n`` other customers.
        ranked = ranked[ranked != row][:top_n]

        lookalikes[ids[row]] = [(ids[idx], scores[idx]) for idx in ranked]

    return lookalikes


if __name__ == "__main__":
    # Load the data
    cs = pd.read_csv('Customers.csv')
    ps = pd.read_csv('Products.csv')
    ts = pd.read_csv('Transactions.csv')

    # Step 1: Merge transactions with customer data.
    # The inner join keeps only customers that have at least one transaction.
    merged_data = pd.merge(ts, cs, on='CustomerID', how='inner')

    # Step 2: Feature engineering - one row per customer.
    # NOTE: despite its name, 'most_frequent_category' holds the customer's
    # most frequently purchased ProductID. It is dropped before scaling.
    customer_profile = merged_data.groupby('CustomerID').agg(
        total_spent=('TotalValue', 'sum'),
        transaction_count=('TransactionID', 'count'),
        most_frequent_category=('ProductID', lambda x: x.mode()[0])
    ).reset_index()

    # Step 3: Attach the product category to every transaction.
    product_categories = ps[['ProductID', 'Category']]
    merged_data = pd.merge(merged_data, product_categories, on='ProductID', how='left')

    # Number of purchases per customer in each category (one column per category).
    category_engagement = merged_data.groupby(['CustomerID', 'Category']).size().unstack().fillna(0)

    # Combine all features into a single customer profile.
    customer_profile = pd.merge(customer_profile, category_engagement, on='CustomerID', how='left')

    # Step 4: Prepare data for scaling (only numerical columns).
    # Customers whose products all lack a category get NaN counts from the
    # left merge; treat those as zero so cosine_similarity does not fail.
    numerical_data = customer_profile.drop(['CustomerID', 'most_frequent_category'], axis=1).fillna(0)

    scaler = StandardScaler()
    normalized_features = scaler.fit_transform(numerical_data)

    # Step 5: Cosine similarity between every pair of customers.
    # Row/column i corresponds to row i of customer_profile (NOT of cs).
    cos_sim = cosine_similarity(normalized_features)

    # Step 6: Top 3 lookalikes for the first 20 customers in customer_profile.
    # IDs are taken from customer_profile because that is what the similarity
    # matrix is aligned with; cs may contain customers without transactions
    # or be ordered differently.
    lookalike_dict = find_lookalikes(
        cos_sim, customer_profile['CustomerID'], n_customers=20, top_n=3
    )

    # Step 7: Flatten to one row per (customer, lookalike) pair.
    lookalike_data = [
        [customer_id, lookalike_id, score]
        for customer_id, lookalikes in lookalike_dict.items()
        for lookalike_id, score in lookalikes
    ]

    lookalike_df = pd.DataFrame(
        lookalike_data,
        columns=['CustomerID', 'Lookalike CustomerID', 'Similarity Score'],
    )

    # Step 8: Output to CSV
    lookalike_df.to_csv('Lookalike.csv', index=False)

    print("Lookalike.csv has been created.")


Lookalike.csv has been created.
