<a href="https://colab.research.google.com/github/dchaithanya123/ZeoTap/blob/main/Dinnipati_Chaithanya_Kumar_Reddy_Lookalike_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Load datasets
customers = pd.read_csv('/content/Customers.csv')
products = pd.read_csv('/content/Products.csv')

# Clean up column names by stripping leading/trailing spaces
customers.columns = customers.columns.str.strip()
products.columns = products.columns.str.strip()


# Inspect the column names of datasets
print("Customers Dataset Columns:", customers.columns)
print("Products Dataset Columns:", products.columns)


# Ensure the necessary columns exist
if 'CustomerID' not in customers.columns:
    raise ValueError("'CustomerID' column is missing in the customers dataset.")
if 'ProductID' not in products.columns:
    raise ValueError("'ProductID' column is missing in the products dataset.")

# Feature Engineering: Select relevant features for recommendation
# We'll assume customers have data like Region, and products have data like Category and Price.
# In a real scenario, we'd link customers and products based on their interactions, but here we'll treat them as separate features.

# We'll merge customer information with some product categories for the recommendation
merged_data = pd.merge(customers, products, how='cross')  # Cartesian join to combine customer profiles with all products

# Now, we can create a set of features for the recommendation system:
categorical_features = ['Region', 'Category']  # 'Region' from customers and 'Category' from products
numerical_features = ['Price']  # 'Price' from products

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(), categorical_features)
])

# Apply preprocessing to the dataset
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
data_transformed = pipeline.fit_transform(merged_data)

# Compute similarity matrix
similarity_matrix = cosine_similarity(data_transformed)

# Check available customer IDs in the dataset
available_customer_ids = customers['CustomerID'].values
print("Available Customer IDs:", available_customer_ids[:20])  # Display first 20 for inspection

# Function to recommend top N similar customers for each customer
def recommend_similar(customers_data, customer_id, top_n=3):
    # Ensure customer_id exists in the dataset
    if customer_id not in customers_data['CustomerID'].values:
        raise ValueError(f"CustomerID {customer_id} not found in the dataset.")

    # Find the index of the customer_id in the customers dataset
    customer_index = customers_data[customers_data['CustomerID'] == customer_id].index[0]

    # Get the indices of the corresponding rows in the merged_data (cross join results)
    customer_rows = merged_data[merged_data['CustomerID'] == customer_id].index.tolist()

    # Calculate similarity scores for each row corresponding to the customer
    similarity_scores = similarity_matrix[customer_rows[0]]  # Start with the first row of the customer

    # Get the top N most similar customers (excluding the customer itself)
    similar_customers = np.argsort(similarity_scores)[::-1][1:top_n + 1]

    recommendations = [
        {
            'CustomerID': merged_data.iloc[i]['CustomerID'],
            'SimilarityScore': similarity_scores[i]
        }
        for i in similar_customers
    ]

    return recommendations

# Generate recommendations for the first 20 customers
lookalikes = {}
for i in range(1, 21):  # For CustomerID C0001 to C0020
    customer_id = f'C{i:04d}'
    try:
        recommendations = recommend_similar(customers, customer_id, top_n=3)
        lookalikes[customer_id] = [(rec['CustomerID'], rec['SimilarityScore']) for rec in recommendations]
    except ValueError as e:
        print(e)

# Convert the lookalike results into a DataFrame for export
lookalike_df = pd.DataFrame([
    {'CustomerID': customer_id, 'Lookalikes': lookalikes[customer_id]}
    for customer_id in lookalikes
])

# Save the lookalike results to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

# Output some results
print(lookalike_df.head())


Customers Dataset Columns: Index(['CustomerID', 'CustomerName', 'Region', 'SignupDate'], dtype='object')
Products Dataset Columns: Index(['ProductID', 'ProductName', 'Category', 'Price'], dtype='object')
Available Customer IDs: ['C0001' 'C0002' 'C0003' 'C0004' 'C0005' 'C0006' 'C0007' 'C0008' 'C0009'
 'C0010' 'C0011' 'C0012' 'C0013' 'C0014' 'C0015' 'C0016' 'C0017' 'C0018'
 'C0019' 'C0020']
  CustomerID                                         Lookalikes
0      C0001  [(C0099, 1.0000000000000002), (C0091, 1.000000...
1      C0002  [(C0005, 1.0000000000000002), (C0084, 1.000000...
2      C0003  [(C0099, 1.0000000000000002), (C0091, 1.000000...
3      C0004  [(C0099, 1.0000000000000002), (C0091, 1.000000...
4      C0005  [(C0005, 1.0000000000000002), (C0084, 1.000000...
