In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found there.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
# Load the online-retail transactions; the file is read as ISO-8859-1.
df = pd.read_csv(
    '/kaggle/input/online-retail-dataset/online_retail.csv',
    encoding='ISO-8859-1',
)

# Quick look at the first rows and the inferred column dtypes.
print(df.head())
print(df.dtypes)

In [None]:
# DATA CLEANING
# Discard rows that have any missing value, remove exact duplicate rows, and
# parse InvoiceDate into datetimes, all in one non-mutating chain.
df = (
    df.dropna()
      .drop_duplicates()
      .assign(InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate']))
)
print(df.dtypes)

In [None]:
# Monetary value of each invoice line.
df['TotalAmount'] = df['Quantity'] * df['UnitPrice']

# Reference point: one day after the most recent invoice in the data, so the
# most recent buyer ends up with a recency of 1 day rather than 0.
reference_date = df['InvoiceDate'].max() + pd.DateOffset(days=1)

# Recency: days between the reference date and each customer's *latest*
# invoice. The value is broadcast back onto every row of that customer, so all
# of a customer's rows share one recency (previously it was computed per
# invoice row, which is not a "last purchase" measure).
customer_last_invoice = df.groupby('CustomerID')['InvoiceDate'].transform('max')
df['last_purchase_date'] = (reference_date - customer_last_invoice).dt.days

# Frequency: number of distinct invoices per customer, broadcast to each row.
df['Frequency'] = df.groupby('CustomerID')['InvoiceNo'].transform('nunique')

# NOTE(review): TotalAmount is still a per-line value, so the frame remains at
# invoice-line granularity rather than one row per customer.

In [None]:
#Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (with KDE overlay) of the `last_purchase_date` column.
sns.histplot(df['last_purchase_date'], kde=True)
plt.title('Distribution of last purchase dates')
plt.show()

# The same column as a filled density curve on a wider figure.
plt.figure(figsize=(12, 6))
# `fill=True` replaces the deprecated `shade=True` keyword.
sns.kdeplot(df['last_purchase_date'], fill=True)
plt.title('Distribution of last purchase dates')
plt.show()

# Histogram (with KDE overlay) of the `Frequency` column.
sns.histplot(df['Frequency'], kde=True)
plt.title('Distribution of Frequency')
plt.xlabel('Frequency')
# histplot's default statistic is a count per bin, so the axis shows counts,
# not a density.
plt.ylabel('Count')
plt.show()

In [None]:
#CLUSTERING
from sklearn.preprocessing import StandardScaler

# Standardise the three clustering features so each has zero mean and unit
# variance before they are fed to the clustering step.
clustering_features = ['last_purchase_date', 'Frequency', 'TotalAmount']
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[clustering_features])

In [None]:
#K-Means Clustering
from sklearn.cluster import KMeans

# Fit a 3-cluster K-Means model on the standardised features.
# random_state pins the centroid initialisation so cluster labels are the same
# on every "Restart & Run All"; n_init is set explicitly because its default
# differs between scikit-learn versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(df_scaled)

# Attach each row's cluster label back onto the DataFrame.
df['Cluster'] = kmeans.labels_

In [None]:
#Visualizing Clusters
# Scatter of the two plotted features, coloured by assigned cluster.
plt.figure(figsize=(12, 6))
sns.scatterplot(x='last_purchase_date', y='Frequency', hue='Cluster', data=df)
plt.title('Customer Segments')
plt.xlabel('Last Purchase Date')
plt.ylabel('Frequency')  # fixed typo in the axis label
plt.show()


In [None]:
#Analysing
# Per-cluster profile: mean of each clustering feature plus the number of
# rows (non-null CustomerID values) that fell into the cluster.
cluster_summary = (
    df.groupby('Cluster')
      .agg(
          last_purchase_date=('last_purchase_date', 'mean'),
          Frequency=('Frequency', 'mean'),
          TotalAmount=('TotalAmount', 'mean'),
          Count=('CustomerID', 'count'),
      )
      .reset_index()
)

print("Cluster Summary:")
print(cluster_summary)

In [None]:
# Adding recommendations for a specific customer

def recommendation(customerID, df, cluster_column='Cluster', num_recommendations=5):
    """Recommend the best-selling items within a customer's cluster.

    The customer's rows are located in ``df``, their cluster is determined,
    and the items (``Description``) with the largest summed ``Quantity``
    across every row of that cluster are printed and returned.

    Parameters
    ----------
    customerID : int, float or str
        Identifier to look up in ``df['CustomerID']``. If no exact match is
        found, a numeric comparison is attempted so that ``'17850'`` or
        ``17850`` both match a column holding ``17850.0``.
    df : pandas.DataFrame
        Must contain ``CustomerID``, ``Description``, ``Quantity`` and the
        column named by ``cluster_column``.
    cluster_column : str, default 'Cluster'
        Column holding the cluster label of each row.
    num_recommendations : int, default 5
        Maximum number of items to return.

    Returns
    -------
    list
        Item descriptions ordered by descending total quantity. An empty list
        is returned (and a message printed) when the customer cannot be found.
    """
    customer_ids = df['CustomerID']
    is_customer = customer_ids == customerID

    if not is_customer.any():
        # Fall back to a numeric comparison: IDs are frequently stored as
        # floats while callers pass strings or ints (or the other way round).
        try:
            numeric_id = float(customerID)
        except (TypeError, ValueError):
            numeric_id = None
        if numeric_id is not None:
            is_customer = pd.to_numeric(customer_ids, errors='coerce') == numeric_id

    # Labels are stored per row, so one customer may appear under several
    # clusters; use the most frequent one instead of an arbitrary first row.
    customer_clusters = df.loc[is_customer, cluster_column].mode()
    if customer_clusters.empty:
        print("No purchases found for Customer", customerID)
        return []
    cust_cluster = customer_clusters.iloc[0]

    similar_customer = df[df[cluster_column] == cust_cluster]

    # Rank items by total quantity sold within the cluster.
    items = (
        similar_customer.groupby('Description')['Quantity']
        .sum()
        .sort_values(ascending=False)
        .head(num_recommendations)
        .index
    )

    recommendations = list(items)
    print("Top", num_recommendations, "recommendations for Customer", customerID, ":")
    for item in recommendations:
        print("-", item)

    return recommendations

# Example: Recommend items for a specific customer
# The ID is given as a number rather than a string.
# NOTE(review): this assumes CustomerID was parsed as a numeric column -
# confirm against the df.dtypes output printed earlier.
CustomerID = 17850  # Example customer ID
# Pass the variable defined above (the original call used an undefined name).
recommendations = recommendation(CustomerID, df)