In [42]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

# Read the three raw tables from their CSV exports.
customers = pd.read_csv(r'C:\Users\Lenovo\Dropbox\PC\Downloads\Customers.csv')
products = pd.read_csv(r'C:\Users\Lenovo\Dropbox\PC\Downloads\Products.csv')
transactions = pd.read_csv(r'C:\Users\Lenovo\Dropbox\PC\Downloads\Transactions.csv')

# Enrich each transaction row with customer attributes first, then product
# attributes. Both joins are left joins, so every transaction row is kept even
# when no matching customer or product is found.
transactions_with_customers = pd.merge(
    transactions, customers, on='CustomerID', how='left'
)
merged_data = pd.merge(
    transactions_with_customers, products, on='ProductID', how='left'
)
print("Merged Data Sample:", merged_data.head(), sep="\n")

def _distinct_count(values):
    """Return how many different values appear in *values*."""
    return len(set(values))


# Collapse the transaction-level table to one row per customer:
#   TotalValue -> sum of TotalValue over the customer's transactions
#   Quantity   -> sum of Quantity over the customer's transactions
#   ProductID  -> number of distinct products bought (the column keeps the
#                 name 'ProductID' even though it now holds a count)
#   Region     -> first Region value seen for the customer
per_customer = merged_data.groupby('CustomerID')
customer_profiles = per_customer.agg(
    TotalValue=('TotalValue', 'sum'),
    Quantity=('Quantity', 'sum'),
    ProductID=('ProductID', _distinct_count),
    Region=('Region', 'first'),
).reset_index()
print("\nCustomer Profiles Sample:", customer_profiles.head(), sep="\n")

# Replace the categorical Region column with one indicator column per region.
customer_profiles = pd.get_dummies(customer_profiles, columns=['Region'])
print("\nCustomer Profiles with One-Hot Encoding:", customer_profiles.head(), sep="\n")

# Standardise every feature column; CustomerID is only an identifier, so it is
# left out of the feature matrix.
feature_frame = customer_profiles.drop(columns='CustomerID')
scaler = StandardScaler()
scaled_data = scaler.fit_transform(feature_frame)
print("\nScaled Data Sample:", scaled_data[:5], sep="\n")

# Segment customers into 5 clusters on the scaled features (fixed seed for
# reproducibility). The label is stored after scaling, so it is not itself
# part of the feature matrix.
kmeans = KMeans(n_clusters=5, random_state=42)
cluster_labels = kmeans.fit_predict(scaled_data)
customer_profiles['Cluster'] = cluster_labels
print("\nCustomer Profiles with Clusters:", customer_profiles[['CustomerID', 'Cluster']].head(), sep="\n")

# Pairwise cosine similarity between customers; row i / column j compares the
# i-th and j-th rows of customer_profiles.
similarity_matrix = cosine_similarity(scaled_data)
print("\nCosine Similarity Matrix Sample:", similarity_matrix[:5, :5], sep="\n")

# Number of lookalike customers to report per customer.
TOP_N = 3

# Row i of similarity_matrix corresponds to the i-th row of customer_profiles,
# so a plain list of IDs is enough to translate matrix positions back to
# customers (and avoids building a row Series on every lookup).
customer_ids = customer_profiles['CustomerID'].tolist()

# Map each customer to its TOP_N most similar *other* customers, best first,
# as a list of (customer_id, similarity_score) tuples.
lookalike_results = {}
for i, customer_id in enumerate(customer_ids):
    # Rank all customers by descending similarity. A stable sort keeps ties in
    # ascending row order so the result is deterministic.
    ranked = (-similarity_matrix[i]).argsort(kind='stable')
    # Drop the customer's own position explicitly instead of assuming it is
    # the single largest entry: another customer can tie at similarity 1.0
    # (or rounding can push self slightly below it), in which case a
    # positional slice would keep self and discard a real neighbour.
    neighbours = [j for j in ranked if j != i][:TOP_N]
    lookalike_results[customer_id] = [
        (customer_ids[j], float(similarity_matrix[i][j])) for j in neighbours
    ]

# Only the first 20 customers (in customer_profiles order) are reported.
lookalike_subset = {k: lookalike_results[k] for k in customer_ids[:20]}
print("\nLookalike Recommendations for C0001 to C0020:")
for cust_id, recommendations in lookalike_subset.items():
    print(f"Customer {cust_id}: {recommendations}")

# Flatten to one row per customer. Lookalike<k> holds the k-th best match's
# customer ID and Lookalike<k>_Score its similarity, so the CSV contains plain
# scalar cells rather than stringified (id, score) tuples.
flat_rows = []
for recommendations in lookalike_subset.values():
    row = {}
    for rank, (other_id, score) in enumerate(recommendations, start=1):
        row[f'Lookalike{rank}'] = other_id
        row[f'Lookalike{rank}_Score'] = score
    flat_rows.append(row)

# Same column names and order as before: all ID columns, then all score columns.
# Ranks with no neighbour (fewer than TOP_N other customers) are left empty.
column_order = (
    [f'Lookalike{rank}' for rank in range(1, TOP_N + 1)]
    + [f'Lookalike{rank}_Score' for rank in range(1, TOP_N + 1)]
)
lookalike_df = pd.DataFrame(
    flat_rows, index=list(lookalike_subset), columns=column_order
)

lookalike_df.to_csv('Lookalike.csv', index=True)
print("\nLookalike Recommendations saved to Lookalike.csv")


Merged Data Sample:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLivi

