In [1]:
from pathlib import Path

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [5]:
# Single place to repoint the notebook at the input data. The default is the
# folder the CSV exports were originally read from.
DATA_DIR = Path(r"C:\Users\sajal\Downloads")

# Raw input tables for the lookalike analysis.
customers = pd.read_csv(DATA_DIR / "Customers.csv")
products = pd.read_csv(DATA_DIR / "Products.csv")
transactions = pd.read_csv(DATA_DIR / "Transactions.csv")

In [11]:
# Attach product attributes (including Category) to every transaction row.
# A left merge keeps transactions whose ProductID has no match in `products`;
# those rows get NaN for the product columns.
transactions_products = transactions.merge(products, on="ProductID", how="left")

# Aggregate transaction history for each customer
customer_transactions = transactions_products.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    # Concatenate categories purchased. Missing categories (unmatched products)
    # are skipped, because str.join raises TypeError on NaN.
    'Category': lambda x: ','.join(x.dropna().astype(str)),
}).reset_index()

# Merge customer profile with transaction history
customer_profiles = customers.merge(customer_transactions, on='CustomerID', how='left')

# Fill missing values for customers with no transactions
customer_profiles['TotalValue'] = customer_profiles['TotalValue'].fillna(0)
customer_profiles['Quantity'] = customer_profiles['Quantity'].fillna(0)
customer_profiles['Category'] = customer_profiles['Category'].fillna('')



In [13]:
# One-hot encode Region as float columns so the feature matrix stays purely
# numeric. Boolean dummies mixed with float columns fall back to object dtype
# when a row or the whole frame is converted to an array.
# NOTE(review): drop_first=True leaves the first region without an indicator
# column, so it is encoded as all zeros - confirm that is intended for a
# similarity model.
customer_profiles = pd.get_dummies(
    customer_profiles, columns=['Region'], drop_first=True, dtype=float
)

# Convert category string into one-hot encoded features (one column per
# distinct category; a customer with an empty string gets all zeros).
category_dummies = customer_profiles['Category'].str.get_dummies(sep=',')
customer_profiles = pd.concat([customer_profiles, category_dummies], axis=1).drop(columns=['Category'])

# Normalize numeric columns for similarity calculation
scaler = StandardScaler()
numeric_cols = ['TotalValue', 'Quantity']
customer_profiles[numeric_cols] = scaler.fit_transform(customer_profiles[numeric_cols])

In [15]:
# Feature matrix used for similarity: every column except the identifying /
# descriptive ones. A 'SimilarityScore' column is also removed when present
# (e.g. left on customer_profiles by a previous similarity lookup), so that
# re-running this cell never feeds old scores back in as a feature.
profile_features = (
    customer_profiles
    .drop(columns=['CustomerID', 'CustomerName', 'SignupDate'])
    .drop(columns=['SimilarityScore'], errors='ignore')
)


In [39]:
def find_similar_customers(customer_id, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Similarity is the cosine similarity between the queried customer's row of
    the module-level ``profile_features`` frame and every row of that frame.
    ``customer_profiles`` is only read; it is not modified by the call.

    Parameters
    ----------
    customer_id
        Value to look up in ``customer_profiles['CustomerID']``.
    top_n : int, default 3
        Maximum number of similar customers to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``CustomerID``, ``CustomerName`` and ``SimilarityScore``,
        sorted by score in descending order, keeping the row labels of
        ``customer_profiles``. The queried customer is excluded.

    Raises
    ------
    ValueError
        If ``customer_id`` is not present in ``customer_profiles``.
    """
    # Boolean mask of the rows belonging to the queried customer.
    id_matches = (customer_profiles['CustomerID'] == customer_id).to_numpy()
    if not id_matches.any():
        raise ValueError(f"CustomerID {customer_id} does not exist in the dataset.")

    # Use the row *position* (not the index label) so the lookup stays correct
    # even when customer_profiles does not carry a default 0..n-1 index.
    customer_position = int(id_matches.argmax())
    customer_vector = profile_features.iloc[customer_position].values.reshape(1, -1)

    # Compute similarity of the queried customer against every customer.
    similarity_scores = cosine_similarity(customer_vector, profile_features)[0]

    # Build the result on a fresh frame instead of writing a column onto the
    # shared customer_profiles frame.
    scored = customer_profiles[['CustomerID', 'CustomerName']].assign(
        SimilarityScore=similarity_scores
    )

    # Exclude the queried customer, then keep the highest-scoring rows.
    # A stable sort makes the order of exact ties deterministic.
    others = scored.loc[~id_matches]
    return others.sort_values(
        by='SimilarityScore', ascending=False, kind='stable'
    ).head(top_n)


In [41]:
# Show every distinct CustomerID that can be passed to find_similar_customers.
print(customer_profiles.loc[:, 'CustomerID'].unique())


['C0001' 'C0002' 'C0003' 'C0004' 'C0005' 'C0006' 'C0007' 'C0008' 'C0009'
 'C0010' 'C0011' 'C0012' 'C0013' 'C0014' 'C0015' 'C0016' 'C0017' 'C0018'
 'C0019' 'C0020' 'C0021' 'C0022' 'C0023' 'C0024' 'C0025' 'C0026' 'C0027'
 'C0028' 'C0029' 'C0030' 'C0031' 'C0032' 'C0033' 'C0034' 'C0035' 'C0036'
 'C0037' 'C0038' 'C0039' 'C0040' 'C0041' 'C0042' 'C0043' 'C0044' 'C0045'
 'C0046' 'C0047' 'C0048' 'C0049' 'C0050' 'C0051' 'C0052' 'C0053' 'C0054'
 'C0055' 'C0056' 'C0057' 'C0058' 'C0059' 'C0060' 'C0061' 'C0062' 'C0063'
 'C0064' 'C0065' 'C0066' 'C0067' 'C0068' 'C0069' 'C0070' 'C0071' 'C0072'
 'C0073' 'C0074' 'C0075' 'C0076' 'C0077' 'C0078' 'C0079' 'C0080' 'C0081'
 'C0082' 'C0083' 'C0084' 'C0085' 'C0086' 'C0087' 'C0088' 'C0089' 'C0090'
 'C0091' 'C0092' 'C0093' 'C0094' 'C0095' 'C0096' 'C0097' 'C0098' 'C0099'
 'C0100' 'C0101' 'C0102' 'C0103' 'C0104' 'C0105' 'C0106' 'C0107' 'C0108'
 'C0109' 'C0110' 'C0111' 'C0112' 'C0113' 'C0114' 'C0115' 'C0116' 'C0117'
 'C0118' 'C0119' 'C0120' 'C0121' 'C0122' 'C0123' 'C

In [51]:
# Scenario 1: the three customers closest to C0001.
print(find_similar_customers('C0001', 3))


    CustomerID   CustomerName  SimilarityScore
173      C0174   Tracy Steele         0.992274
151      C0152   Justin Evans         0.987448
84       C0085  Richard Brown         0.915460


In [55]:
# Scenario 2: the three customers closest to C0199.
print("\nScenario 2: Similar customers for CustomerID =C0199")
print(find_similar_customers('C0199', 3))



Scenario 2: Similar customers for CustomerID =C0199
    CustomerID      CustomerName  SimilarityScore
196      C0197  Christina Harvey         0.999921
68       C0069      Stacy Foster         0.968181
120      C0121     Mark Atkinson         0.896483


In [57]:
# Scenario 3: Find similar customers for CustomerID = C0100
scenario_customer_id = 'C0100'
# The label is built from the same ID that is queried, so the printed header
# always names the customer that was actually looked up.
print(f"\nScenario 3: Similar customers for CustomerID = {scenario_customer_id}")
print(find_similar_customers(customer_id=scenario_customer_id, top_n=3))


Scenario 3: Similar customers for CustomerID = 100
    CustomerID      CustomerName  SimilarityScore
169      C0170      Logan Harris         0.997609
56       C0057  Elizabeth Nguyen         0.985227
50       C0051    Nicholas Ellis         0.983534


In [59]:
# Print the three closest matches (ID and score only) for each of the first
# 20 customers, with a dashed rule after every customer.
separator = "-" * 40
for customer_id in customer_profiles['CustomerID'].head(20):
    print(f"CustomerID: {customer_id}")
    similar_customers = find_similar_customers(customer_id, 3)
    id_and_score = similar_customers.loc[:, ['CustomerID', 'SimilarityScore']]
    print(id_and_score.to_string(index=False))
    print(separator)

CustomerID: C0001
CustomerID  SimilarityScore
     C0174         0.992274
     C0152         0.987448
     C0085         0.915460
----------------------------------------
CustomerID: C0002
CustomerID  SimilarityScore
     C0159         0.981171
     C0144         0.876713
     C0062         0.870678
----------------------------------------
CustomerID: C0003
CustomerID  SimilarityScore
     C0031         0.938680
     C0129         0.923387
     C0195         0.916365
----------------------------------------
CustomerID: C0004
CustomerID  SimilarityScore
     C0012         0.978457
     C0148         0.961780
     C0113         0.928956
----------------------------------------
CustomerID: C0005
CustomerID  SimilarityScore
     C0140         0.991344
     C0007         0.986908
     C0095         0.896656
----------------------------------------
CustomerID: C0006
CustomerID  SimilarityScore
     C0187         0.992461
     C0108         0.943868
     C0048         0.891310
---------------

In [69]:
# Map each of the first 20 customers (C0001 to C0020) to its three closest
# matches, stored as a list of [CustomerID, SimilarityScore] pairs.
lookalike_map = {}
for customer_id in customer_profiles['CustomerID'].head(20):
    similar_customers = find_similar_customers(customer_id, 3)
    id_score_pairs = similar_customers.loc[:, ['CustomerID', 'SimilarityScore']]
    lookalike_map[customer_id] = id_score_pairs.values.tolist()

# One row per source customer; the 'lookalikes' cell holds that customer's
# whole list of pairs.
lookalike_records = [
    dict(cust_id=cust_id, lookalikes=lookalikes)
    for cust_id, lookalikes in lookalike_map.items()
]
lookalike_df = pd.DataFrame(lookalike_records)

# Write the table to an Excel workbook, omitting the row index.
lookalike_df.to_excel(r"C:\Users\sajal\Downloads\lookalike.xlsx", index=False)

In [None]:
# Data is saved to lookalike.xlsx
