In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Directory that holds the three input CSV files. It is defined once so the
# notebook can be pointed at a different location by editing a single line.
DATA_DIR = '/content/drive/MyDrive'

# Raw input tables; later cells join them on CustomerID and ProductID.
customers = pd.read_csv(f'{DATA_DIR}/Customers.csv')
products = pd.read_csv(f'{DATA_DIR}/Products.csv')
transactions = pd.read_csv(f'{DATA_DIR}/Transactions.csv')

In [3]:
# Enrich every transaction row with the columns of its customer, then with the
# columns of its product. Both joins are inner joins, so a transaction whose
# CustomerID or ProductID has no match in the lookup table is dropped.
transactions_with_customers = transactions.merge(customers, on='CustomerID', how='inner')
merged_data = transactions_with_customers.merge(products, on='ProductID', how='inner')


In [4]:
def _join_tokens(values):
    """Concatenate a group's string values into one space-separated string."""
    return ' '.join(values)


# Per-customer aggregation: one output column per input column.
aggregation_spec = {
    'TotalValue': 'sum',          # total spending
    'ProductID': _join_tokens,    # purchased products, space-separated
    'TransactionID': 'count',     # number of transactions
    'Category': _join_tokens,     # purchased categories, space-separated
}

# One row per CustomerID; reset_index() turns the group key back into a column.
grouped_by_customer = merged_data.groupby('CustomerID')
customer_features = grouped_by_customer.agg(aggregation_spec).reset_index()

In [5]:
# Attach each customer's Region to the aggregated per-customer features.
customer_profile = customers[['CustomerID', 'Region']]
customer_features = (
    customer_features
    # Re-running this cell would otherwise merge a second Region column and
    # produce Region_x / Region_y, breaking later lookups of 'Region'.
    # Dropping any existing Region first makes the cell safe to re-run.
    .drop(columns='Region', errors='ignore')
    # validate= raises if a CustomerID appears more than once in the profile
    # table, instead of silently duplicating customer rows.
    .merge(customer_profile, on='CustomerID', validate='many_to_one')
)

In [6]:
# Build one text "document" per customer by concatenating, left to right and
# separated by single spaces: purchased product IDs, purchased categories, region.
text_columns = ['ProductID', 'Category', 'Region']

combined_text = customer_features[text_columns[0]]
for column_name in text_columns[1:]:
    combined_text = combined_text + ' ' + customer_features[column_name]

customer_features['combined_features'] = combined_text

In [7]:
# Convert each customer's combined text into a TF-IDF vector using the
# vectorizer's default settings; row order follows customer_features.
documents = customer_features['combined_features']
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(documents)

In [8]:
# Pairwise cosine similarity between all rows of feature_vectors.
# similarity_matrix[i] is read later as the scores of row i against every row.
similarity_matrix = cosine_similarity(feature_vectors)

In [9]:
# Map each target customer to its most similar other customers.
# Result shape: {CustomerID: [(other CustomerID, score rounded to 3 dp), ...]}
customer_ids = customer_features['CustomerID']
lookalike_results = {}

n_targets = 20     # only the first 20 customers get a lookalike list
n_lookalikes = 3   # number of most-similar customers kept per target

for i, customer_id in enumerate(customer_ids.iloc[:n_targets]):
    # Exclude the customer's own entry by index. Skipping "the first sorted
    # element" is only correct when self is the unique maximum; if another
    # customer has an identical feature vector (or rounding puts it ahead),
    # that would drop a real match and keep the customer as its own lookalike.
    candidates = [
        (j, score) for j, score in enumerate(similarity_matrix[i]) if j != i
    ]
    top_matches = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:n_lookalikes]

    lookalike_results[customer_id] = [
        # .iloc: j is a row position in the similarity matrix, not an index label.
        # float(): keep plain Python floats so the tuples are written as e.g.
        # 0.523 rather than as a NumPy scalar representation.
        (customer_ids.iloc[j], round(float(score), 3)) for j, score in top_matches
    ]


In [10]:
# Flatten the {CustomerID: lookalikes} mapping into a two-column table and save it.
# Declaring the columns explicitly keeps the CSV header stable even when
# lookalike_results is empty (a list of records would yield a frame with no columns).
lookalike_df = pd.DataFrame(
    list(lookalike_results.items()),
    columns=['CustomerID', 'Lookalikes'],
)

# Each Lookalikes cell is a Python list of tuples; to_csv writes its string form.
lookalike_df.to_csv('Lookalike.csv', index=False)


In [11]:
# Display the first rows of the lookalike table as this cell's output.
lookalike_df.head()

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0190, 0.523), (C0154, 0.464), (C0120, 0.455)]"
1,C0002,"[(C0109, 0.488), (C0134, 0.474), (C0008, 0.461)]"
2,C0003,"[(C0181, 0.576), (C0134, 0.5), (C0025, 0.48)]"
3,C0004,"[(C0065, 0.546), (C0182, 0.454), (C0075, 0.441)]"
4,C0005,"[(C0096, 0.533), (C0119, 0.402), (C0162, 0.394)]"
