# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Input CSV locations.
# NOTE(review): the products path is an absolute '/content/...' location,
# unlike the two relative paths — confirm it exists wherever this runs.
CUSTOMERS_CSV = 'Customers.csv'
PRODUCTS_CSV = '/content/Products (1).csv'
TRANSACTIONS_CSV = 'Transactions.csv'

# Read each source table into its own DataFrame (customers, then products,
# then transactions).
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (CUSTOMERS_CSV, PRODUCTS_CSV, TRANSACTIONS_CSV)
)

# Attach customer attributes, then product attributes, to every transaction.
# Both merges use the default join type of DataFrame.merge.
transactions_with_customers = transactions.merge(customers, on='CustomerID')
merged_data = transactions_with_customers.merge(products, on='ProductID')

# Fall back to a derived unit price when the merged frame has no column
# named exactly 'Price'.
# NOTE(review): pandas suffixes overlapping non-key columns on merge, so if
# both inputs carry 'Price' this fallback is what runs — confirm intended.
price_present = 'Price' in merged_data.columns
if not price_present:
    merged_data['Price'] = merged_data['TotalValue'].div(merged_data['Quantity'])

def _join_unique(values):
    """Return the distinct entries of *values*, space-separated, in first-seen order."""
    return ' '.join(values.unique())


# Per-customer profile: distinct regions and categories as text, mean price,
# and total quantity / total value across that customer's transactions.
profile_aggregations = {
    'Region': _join_unique,
    'Category': _join_unique,
    'Price': 'mean',
    'Quantity': 'sum',
    'TotalValue': 'sum',
}
transactions_by_customer = merged_data.groupby('CustomerID')
customer_profiles = transactions_by_customer.agg(profile_aggregations).reset_index()


# Build one text document per customer: region text, a space, category text.
region_text = customer_profiles['Region']
category_text = customer_profiles['Category']
customer_profiles['CombinedFeatures'] = region_text + ' ' + category_text

# Turn the documents into TF-IDF vectors.
# NOTE: with default settings TfidfVectorizer lower-cases the text and splits
# it into word tokens, so a multi-word value contributes one token per word.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(customer_profiles['CombinedFeatures'])

# Pairwise cosine similarity of every customer vector against every other;
# entry [i][j] compares profile row i with profile row j.
similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Map each customer ID to its three most similar *other* customers,
# as a list of (customer_id, similarity_score) pairs, best match first.
lookalike_results = {}
customer_ids = customer_profiles['CustomerID']

for idx, customer_id in enumerate(customer_ids):
    row_scores = similarity_matrix[idx]

    # Rank positions from most to least similar. sorted() is stable, so
    # positions with equal scores stay in ascending position order.
    ranked_positions = sorted(
        range(len(row_scores)),
        key=row_scores.__getitem__,
        reverse=True,
    )

    # Walk the ranking, skip the customer itself, and stop after three hits.
    nearest = []
    for pos in ranked_positions:
        if pos == idx:
            continue
        nearest.append((customer_ids[pos], row_scores[pos]))
        if len(nearest) == 3:
            break

    lookalike_results[customer_id] = nearest

# Keep only the first 20 customers (in customer_ids order) for the report.
lookalike_filtered = {cid: lookalike_results[cid] for cid in customer_ids[:20]}

# Write one CSV row per customer. The 'Lookalikes' cell holds the text of a
# Python list of (customer_id, score) tuples, scores rounded to 4 decimals.
#
# The rounded score is converted to a builtin float before str(): round() on
# a numpy float returns a numpy scalar, and under NumPy >= 2 its repr is
# 'np.float64(0.1234)', which would otherwise end up inside the CSV text.
lookalike_df = pd.DataFrame([
    {
        'CustomerID': cid,
        'Lookalikes': str([(lc[0], float(round(lc[1], 4))) for lc in lookalikes])
    }
    for cid, lookalikes in lookalike_filtered.items()
])
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike model completed. Recommendations saved to Lookalike.csv.")
