In [22]:
# Import libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [23]:
# Load datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

In [24]:


# Step 1: Data Preprocessing
# Convert SignupDate and TransactionDate to datetime
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

In [25]:
 # Aggregate transaction data to create customer features
customer_transactions = transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',  # Total revenue per customer
    'TransactionID': 'count',  # Total transactions per customer
    'Quantity': 'sum'  # Total quantity purchased
}).reset_index()

In [26]:

customer_features = pd.merge(customers, customer_transactions, on='CustomerID', how='left')
customer_features.fillna(0, inplace=True)

In [27]:
# Step 2: Feature Scaling
# Select numerical features for similarity computation
feature_cols = ['TotalValue', 'TransactionID', 'Quantity'] + list(customer_features.columns[5:])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features[feature_cols])

In [28]:


# Step 3: Compute Similarity
# Compute cosine similarity between all customers
similarity_matrix = cosine_similarity(scaled_features)

In [29]:
# Step 4: Generate Lookalike Recommendations
# Create a dataframe to store lookalike results
lookalike_results = {}

for i in range(20):  # For customers C0001 to C0020
    customer_index = i  # Assuming the first 20 customers are at the top of the dataframe
    similarity_scores = list(enumerate(similarity_matrix[customer_index]))
    # Sort customers by similarity score (excluding the customer itself)
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    top_3 = similarity_scores[1:4]  # Exclude the customer itself (index 0)
    lookalike_results[customer_features.loc[customer_index, 'CustomerID']] = [
        (customer_features.loc[idx, 'CustomerID'], round(score, 4)) for idx, score in top_3
    ]

In [30]:
# Step 5: Save Lookalike Recommendations to CSV
lookalike_df = pd.DataFrame([
    {'CustomerID': cust_id, 'Lookalikes': lookalikes}
    for cust_id, lookalikes in lookalike_results.items()
])

In [31]:


# Format the Lookalikes column for CSV output
lookalike_df['Lookalikes'] = lookalike_df['Lookalikes'].apply(
    lambda x: ', '.join([f"({c}, {s})" for c, s in x])
)

In [39]:

# Step 4: Save results to Lookalike.csv
lookalike_df = pd.DataFrame(
    results,
    columns=["CustomerID", "Lookalike1", "Score1", "Lookalike2", "Score2", "Lookalike3", "Score3"],
)
lookalike_df.to_csv(r'D:\Users\G.Greeshma\Downloads\FirstName_LastName_Lookalike.csv')

In [35]:
# Step 6: Output Results
print("Top 3 Lookalikes for the First 20 Customers:")
lookalike_df


Top 3 Lookalikes for the First 20 Customers:


Unnamed: 0,CustomerID,Lookalike1,Score1,Lookalike2,Score2,Lookalike3,Score3
0,C0001,C0174,0.999996,C0088,0.999963,C0106,0.999931
1,C0002,C0025,0.999981,C0029,0.999969,C0121,0.999814
2,C0003,C0052,0.999871,C0031,0.999787,C0006,0.999725
3,C0004,C0104,0.999966,C0173,0.99994,C0165,0.999904
4,C0005,C0159,0.999996,C0073,0.999979,C0176,0.999979
5,C0006,C0177,0.999999,C0092,0.999914,C0026,0.999868
6,C0007,C0132,0.999886,C0032,0.999834,C0197,0.99982
7,C0008,C0141,0.999826,C0156,0.999821,C0075,0.99971
8,C0009,C0077,0.99959,C0043,0.999297,C0131,0.998235
9,C0010,C0166,0.999997,C0192,0.999681,C0094,0.999659
