In [4]:
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.1 MB 310.6 kB/s eta 0:00:35
   --- ------------------------------------ 1.0/11.1 MB 729.5 kB/s eta 0:00:14
   ----- -----------


[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


### Importing Important Library ###

In [None]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

### Load the data ###

In [None]:
# Load datasets
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')

### Data preprocessing ###

In [None]:
# Data Preprocessing
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])
# Merge customers and transactions
data = pd.merge(transactions, customers, on='CustomerID', how='left')

In [None]:
# Feature Engineering
customer_features = data.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    transaction_count=('TransactionID', 'count'),
    unique_products=('ProductID', 'nunique')
).reset_index()

# Add customer demographics (e.g., region, signup date)
customer_features = pd.merge(customer_features, customers[['CustomerID', 'Region']], on='CustomerID', how='left')

In [None]:
# Normalize features for KNN
scaler = StandardScaler()
customer_features[['total_spend', 'transaction_count', 'unique_products']] = scaler.fit_transform(
    customer_features[['total_spend', 'transaction_count', 'unique_products']]
)

### KNN model building ###

In [None]:
# Create feature matrix for KNN model
X = customer_features[['total_spend', 'transaction_count', 'unique_products']].values

# Fit the KNN model (finding the nearest neighbors)
knn = NearestNeighbors(n_neighbors=4, metric='euclidean')
knn.fit(X)

### Similar customer around ###

In [9]:
# Get top 3 similar customers for each customer in the list C0001 - C0020
lookalike_map = {}
for i, customer_id in enumerate(customer_features['CustomerID']):
    if f'C{i+1:04d}' in customers['CustomerID'].values:  # Check if the customer is in the list of interest
        distances, indices = knn.kneighbors([X[i]], n_neighbors=4)  # Get 4 nearest neighbors (including self)
        similar_customers_idx = indices[0][1:]  # Exclude the customer itself
        similar_customers = customer_features.iloc[similar_customers_idx]['CustomerID'].values
        similarity_scores = 1 / (1 + distances[0][1:])  # Inverse of distance for similarity (scaled to [0,1])
        lookalike_map[customer_id] = list(zip(similar_customers, similarity_scores))

# Save the result to a CSV file
lookalike_df = pd.DataFrame([
    {'CustomerID': key, 'Lookalike_Customers': val} for key, val in lookalike_map.items()
])
lookalike_df.to_csv('Lookalike.csv', index=False)


In [10]:
lookalike_df

Unnamed: 0,CustomerID,Lookalike_Customers
0,C0001,"[(C0137, 0.9881245646606442), (C0152, 0.983145..."
1,C0002,"[(C0029, 0.959646968784458), (C0199, 0.9400701..."
2,C0003,"[(C0178, 0.9910279499495853), (C0035, 0.929917..."
3,C0004,"[(C0021, 0.9656437882981692), (C0173, 0.801352..."
4,C0005,"[(C0073, 0.9858172660791746), (C0159, 0.979547..."
...,...,...
194,C0196,"[(C0168, 0.6775820119641452), (C0079, 0.617113..."
195,C0197,"[(C0131, 0.9917699154396973), (C0112, 0.983399..."
196,C0198,"[(C0128, 0.9201492898419034), (C0015, 0.890126..."
197,C0199,"[(C0031, 0.9859236013803505), (C0192, 0.951371..."
