<p>Importing all the necessary libraries and dataset path:</p>

In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
customers_df=pd.read_csv("D:/sahyadri/8-sem/Customers.csv")
products_df=pd.read_csv("D:/sahyadri/8-sem/Products.csv")
transactions_df=pd.read_csv("D:/sahyadri/8-sem/Transactions.csv")

<p>Preprocess - Conversions:</p>

In [21]:
customers_df['SignupDate']=pd.to_datetime(customers_df['SignupDate'])
transactions_df['TransactionDate']=pd.to_datetime(transactions_df['TransactionDate'])

<p>Merging transactions with product details on its ID and aggregating customer transactions:</p>

In [22]:
transactions_with_products=transactions_df.merge(products_df, on="ProductID")
customer_transactions=transactions_with_products.groupby("CustomerID").agg({
    "Category": lambda x: x.value_counts().to_dict(),"TotalValue":"sum","Quantity":"sum",}).reset_index()
customer_profiles=customers_df.merge(customer_transactions,on="CustomerID",how="left")
customer_profiles.fillna({"Category":{},"TotalValue":0,"Quantity":0},inplace=True)
customer_profiles.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,Category,TotalValue,Quantity
0,C0001,Lawrence Carroll,South America,2022-07-10,"{'Electronics': 3, 'Books': 1, 'Home Decor': 1}",3354.52,12.0
1,C0002,Elizabeth Lutz,Asia,2022-02-13,"{'Home Decor': 2, 'Clothing': 2}",1862.74,10.0
2,C0003,Michael Rivera,South America,2024-03-07,"{'Home Decor': 2, 'Clothing': 1, 'Electronics'...",2725.38,14.0
3,C0004,Kathleen Rodriguez,South America,2022-10-09,"{'Books': 3, 'Home Decor': 3, 'Electronics': 2}",5354.88,23.0
4,C0005,Laura Weber,Asia,2022-08-15,"{'Electronics': 2, 'Home Decor': 1}",2034.24,7.0


In [23]:
customer_profiles["Category"]=customer_profiles["Category"].apply(lambda x: x if isinstance(x,dict) else {})

In [24]:
def category_to_string(category_dict):
    return " ".join([f"{k}_{v}" for k, v in category_dict.items()])
customer_profiles["CategoryStr"]=customer_profiles["Category"].apply(category_to_string)

<p>Vectorizing data and normalizing features:<p>

In [25]:
vectorizer=CountVectorizer()
category_vectors=vectorizer.fit_transform(customer_profiles["CategoryStr"])

In [26]:
numerical_features=customer_profiles[["TotalValue","Quantity"]].values
scaler=MinMaxScaler()
scaled_features=scaler.fit_transform(numerical_features)

<p>Combining vectors and scaled features, then finding similarity matrices:</p>

In [27]:
final_features=np.hstack([category_vectors.toarray(),scaled_features])

In [28]:
similarity_matrix=cosine_similarity(final_features)

<p>Now we finally generate lookalike recommendations:</p>

In [29]:
customer_ids=customer_profiles["CustomerID"].values
top_customers={}

In [30]:
for i in range(20):
    customer_id=customer_ids[i]
    similarities=similarity_matrix[i]
    similar_indices=np.argsort(similarities)[::-1][1:4]
    top_customers[customer_id]=[(customer_ids[idx],round(similarities[idx],4)) for idx in similar_indices]

<p>We will create a dataframe for lookalike recommendations and save it as .csv file:</p>

In [31]:
lookalike_df=pd.DataFrame({"CustomerID":list(top_customers.keys()),"Top3_Lookalikes":[str(v) for v in top_customers.values()]})
lookalike_csv_path="D:/sahyadri/8-sem/Lookalike.csv"
lookalike_df.to_csv(lookalike_csv_path, index=False)
print(f"Lookalike recommendations saved to: {lookalike_csv_path}")

Lookalike recommendations saved to: D:/sahyadri/8-sem/Lookalike.csv


<h3>Viewing the Recommendations</h3>

In [15]:
lookalike_df=pd.read_csv(lookalike_csv_path)

In [16]:
lookalike_df.head(20)

Unnamed: 0,CustomerID,Top3_Lookalikes
0,C0001,"[('C0184', 0.899), ('C0123', 0.8612), ('C0015'..."
1,C0002,"[('C0133', 0.9984), ('C0106', 0.8717), ('C0103..."
2,C0003,"[('C0166', 0.9989), ('C0031', 0.9977), ('C0158..."
3,C0004,"[('C0047', 0.7624), ('C0191', 0.7378), ('C0175..."
4,C0005,"[('C0007', 0.9994), ('C0197', 0.9994), ('C0035..."
5,C0006,"[('C0135', 0.9984), ('C0057', 0.9017), ('C0118..."
6,C0007,"[('C0005', 0.9994), ('C0197', 0.9992), ('C0026..."
7,C0008,"[('C0054', 0.8178), ('C0184', 0.8067), ('C0105..."
8,C0009,"[('C0040', 0.9706), ('C0029', 0.814), ('C0020'..."
9,C0010,"[('C0056', 0.8274), ('C0013', 0.8204), ('C0030..."


<h3>Testing the Recommendations</h3>

In [17]:
def analyze_customer(customer_id):
    customer_profile=customer_profiles[customer_profiles["CustomerID"]==customer_id]
    lookalikes=lookalike_df[lookalike_df["CustomerID"]==customer_id]["Top3_Lookalikes"].values[0]    
    return {"CustomerProfile":customer_profile[["CustomerID","TotalValue","Quantity","CategoryStr"]].iloc[0].to_dict(),
        "Lookalikes": lookalikes}
example_customer_id=customer_ids[0]
example_analysis=analyze_customer(example_customer_id)
example_analysis

{'CustomerProfile': {'CustomerID': 'C0001',
  'TotalValue': 3354.5200000000004,
  'Quantity': 12.0,
  'CategoryStr': 'Electronics_3 Books_1 Home Decor_1'},
 'Lookalikes': "[('C0184', 0.899), ('C0123', 0.8612), ('C0015', 0.86)]"}

In [18]:
lookalike_ids = ['C0184', 'C0123', 'C0015']
lookalike_profiles = customer_profiles[customer_profiles["CustomerID"].isin(lookalike_ids)]
lookalike_profiles[["CustomerID", "TotalValue", "Quantity", "CategoryStr"]]

Unnamed: 0,CustomerID,TotalValue,Quantity,CategoryStr
14,C0015,1157.48,4.0,Books_1 Home Decor_1
122,C0123,1400.06,4.0,Books_1 Home Decor_1
183,C0184,3393.18,11.0,Electronics_3 Clothing_2 Home Decor_1 Books_1
