In [1]:
import pandas as pd 
import numpy as np
import random
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors 
from sklearn.preprocessing import normalize 
import joblib

RND = 42
random.seed(RND)
np.random.seed(RND)

In [2]:
## Loading the dataset 
Dataset = "online_retail.xlsx"

df = pd.read_excel(Dataset)
print("shape", df.shape)
display(df.head())
print(df.columns.tolist())

shape (541910, 8)


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'Price', 'Customer ID', 'Country']


In [3]:
## dropping row with customer id
df = df.dropna(subset=['Customer ID']) ##dropping that rows which has null value in customerid 

## converting the types 
df['Customer ID'] = df['Customer ID'].astype(int).astype(str)
df = df[df['Quantity']>0]

### check that the stockcode and description are string

df['StockCode'] = df['StockCode'].astype(str)
df['Description'] = df['Description'].astype(str)

## renaming 
df = df.rename(columns={
    'Customer ID': 'CustomerID',
    'Invoice': 'InvoiceNo',
    'price' : 'UnitPrice'
})

print("---After Cleaning---------", df.shape)

df.head()

---After Cleaning--------- (397925, 8)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,Price,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


In [4]:
## item frequency by different customers 
item_user_counts = df.groupby('StockCode')['CustomerID'].nunique() ### we group the df by the StockCode and then see the customerIDs those ids that purcahsed that unique item 
items_to_keep = item_user_counts[item_user_counts >= 5].index ### here it counts the customersid and and keeps only those ids that item has been puchcased by atleast 5 or more cutsomers
df = df[df['StockCode'].isin(items_to_keep)]### 
# Items bought by only one or two customers are considered "rare items." 
# They often don't have enough data to establish a meaningful purchase pattern, 
# making them difficult to recommend accurately Removing them reduces noise and improves the performance of collaborative filtering algorithms.

##user frequency by distinct items 
# It filters this Series, keeping only the customers who have purchased 2 or more distinct items.
user_item_counts = df.groupby('CustomerID')['StockCode'].nunique()
users_to_keep = user_item_counts[user_item_counts >= 2].index
df = df[df["CustomerID"].isin(users_to_keep)]
# Customers who have only purchased one item are considered "cold start users" or customers with rare interactions. They don't provide enough interaction data to accurately determine
# their preferences or relate them to other customers. Removing them also simplifies the model and makes the remaining
# customer profiles more robust.


print("After filtering rare items/users:", df.shape)


After filtering rare items/users: (396520, 8)


In [6]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,Price,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
