In [33]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

merged_data = pd.merge(customer_data, purchase_history, on='CustomerID')
merged_data = pd.merge(merged_data, product_data, on='ProductID')

feature_data = merged_data.groupby('CustomerID').agg({
    'Price': 'sum',
    'ProductID': 'count',
    'Category': lambda x: ','.join(x)
}).reset_index()

feature_data['Category'] = feature_data['Category'].apply(lambda x: x.split(','))
feature_data = feature_data.explode('Category')
feature_data = pd.get_dummies(feature_data, columns=['Category'])
scaler = StandardScaler()
feature_data[['Price', 'ProductID']] = scaler.fit_transform(feature_data[['Price', 'ProductID']])

similarity_matrix = cosine_similarity(feature_data.drop_duplicates('CustomerID').drop('CustomerID', axis=1))

similarity_df = pd.DataFrame(similarity_matrix, index=feature_data['CustomerID'].unique(), columns=feature_data['CustomerID'].unique())

lookalike_map = {}

for customer_id in similarity_df.index[:20]:
    user_index = similarity_df.index.get_loc(customer_id)
    similar_customers = similarity_df.iloc[user_index].sort_values(ascending=False).index[1:4]
    similarity_scores = similarity_df.iloc[user_index].sort_values(ascending=False).values[1:4]
    lookalike_map[customer_id] = [(similar_customers[i], similarity_scores[i]) for i in range(len(similar_customers))]

lookalike_df = pd.DataFrame.from_dict(lookalike_map, orient='index')
lookalike_df.columns = ['Lookalike1', 'Lookalike2', 'Lookalike3']

formatted_lookalike_df = lookalike_df.applymap(lambda x: f"{x[0]} ({x[1]:.2f})")

formatted_lookalike_df.to_csv('Lookalike.csv', header=True, index_label='CustomerID')

print("Formatted Lookalike Data:")
print(formatted_lookalike_df)


Formatted Lookalike Data:
         Lookalike1    Lookalike2    Lookalike3
C0001  C0009 (0.62)  C0019 (0.62)  C0014 (0.62)
C0002  C0011 (0.59)  C0016 (0.59)  C0006 (0.59)
C0003  C0007 (0.65)  C0017 (0.65)  C0012 (0.65)
C0004  C0008 (0.98)  C0018 (0.98)  C0013 (0.98)
C0005  C0020 (0.35)  C0015 (0.35)  C0010 (0.35)
C0006  C0016 (1.00)  C0006 (1.00)  C0009 (0.99)
C0007  C0017 (1.00)  C0012 (1.00)  C0003 (0.65)
C0008  C0013 (1.00)  C0008 (1.00)  C0004 (0.98)
C0009  C0019 (1.00)  C0014 (1.00)  C0011 (0.99)
C0010  C0010 (1.00)  C0015 (1.00)  C0005 (0.35)
C0011  C0016 (1.00)  C0006 (1.00)  C0009 (0.99)
C0012  C0017 (1.00)  C0012 (1.00)  C0003 (0.65)
C0013  C0013 (1.00)  C0008 (1.00)  C0004 (0.98)
C0014  C0019 (1.00)  C0014 (1.00)  C0011 (0.99)
C0015  C0010 (1.00)  C0015 (1.00)  C0005 (0.35)
C0016  C0016 (1.00)  C0006 (1.00)  C0009 (0.99)
C0017  C0017 (1.00)  C0012 (1.00)  C0003 (0.65)
C0018  C0013 (1.00)  C0008 (1.00)  C0004 (0.98)
C0019  C0019 (1.00)  C0014 (1.00)  C0011 (0.99)
C0020  C0010 (

  formatted_lookalike_df = lookalike_df.applymap(lambda x: f"{x[0]} ({x[1]:.2f})")
