# Customer Purchase Prediction using Machine Learning Models.

In [None]:
from datetime import datetime, timedelta, date
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score, train_test_split

## IMPORT DATA

In [None]:

# Load the raw transaction export into `df_retail`.
import pandas as pd

# Read the CSV file (the path is specific to the author's machine)
df_retail = pd.read_csv('C:/Users/Darius/Desktop/PROJECT//online_retail_II.csv')

# Use a space-free name for the customer id column
df_retail = df_retail.rename(columns={'Customer ID': 'CustomerID'})

# Parse 'InvoiceDate' and truncate it to midnight.
# normalize() drops the time of day but keeps the datetime64 dtype. The previous
# `.dt.date` produced object-dtype datetime.date values, which cannot be compared
# with the pd.Timestamp window bounds used later and do not support `.dt.days`
# after subtraction.
df_retail['InvoiceDate'] = pd.to_datetime(df_retail['InvoiceDate']).dt.normalize()

df_retail


## Filter Data from UK

In [None]:
# Keep only the rows whose Country is 'United Kingdom'.
# reset_index() is called without drop=True, so the old row labels are kept
# in an 'index' column of tx_uk.
is_uk = df_retail['Country'] == 'United Kingdom'
tx_uk = df_retail[is_uk].reset_index()

In [None]:
tx_uk

## CHOOSE DATA FOR TRAINING

In [None]:
import pandas as pd

# Bounds of the feature window: 12 months, 2010-09-01 up to (not including) 2011-09-01
start_date = pd.to_datetime('2010-09-01')
end_date = pd.to_datetime('2011-09-01')

# Half-open interval: start_date inclusive, end_date exclusive
in_feature_window = (tx_uk['InvoiceDate'] >= start_date) & (tx_uk['InvoiceDate'] < end_date)
tx_12m = tx_uk[in_feature_window].reset_index(drop=True)
tx_12m

### CHOOSE DATA FOR TEST SET

In [None]:
# Bounds of the follow-up window: 3 months, 2011-09-01 up to (not including) 2011-12-01.
# NOTE: this rebinds start_date / end_date.
start_date = pd.to_datetime('2011-09-01')
end_date = pd.to_datetime('2011-12-01')

# Half-open interval: start_date inclusive, end_date exclusive
in_next_window = (tx_uk['InvoiceDate'] >= start_date) & (tx_uk['InvoiceDate'] < end_date)
tx_next = tx_uk[in_next_window].reset_index(drop=True)
tx_next

In [None]:
# Describe the tx_next dataframe
tx_next['InvoiceDate'].describe()

In [None]:
# tx_user: one row per distinct CustomerID value found in the feature window (tx_12m)
tx_user = pd.DataFrame({'CustomerID': tx_12m['CustomerID'].unique()})

# tx_next_first_purchase: earliest InvoiceDate per customer in the follow-up
# window (tx_next), stored as 'MinPurchaseDate'
tx_next_first_purchase = (
    tx_next.groupby('CustomerID')['InvoiceDate']
    .min()
    .reset_index()
    .rename(columns={'InvoiceDate': 'MinPurchaseDate'})
)

# Preview the first rows
tx_next_first_purchase.head()


In [None]:
# tx_last_purchase: latest InvoiceDate per customer in the feature window
# (tx_12m), stored as 'MaxPurchaseDate'
tx_last_purchase = (
    tx_12m.groupby('CustomerID')['InvoiceDate']
    .max()
    .reset_index()
    .rename(columns={'InvoiceDate': 'MaxPurchaseDate'})
)


In [None]:
# Left join on CustomerID: every customer of tx_last_purchase is kept, and
# MinPurchaseDate is missing for customers that do not appear in tx_next_first_purchase
tx_purchase_dates = tx_last_purchase.merge(tx_next_first_purchase, on='CustomerID', how='left')


In [None]:
tx_purchase_dates

In [None]:
# NextPurchaseDay: whole days from the last purchase in the feature window
# (MaxPurchaseDate) to the first purchase in the follow-up window (MinPurchaseDate)
purchase_gap = tx_purchase_dates['MinPurchaseDate'] - tx_purchase_dates['MaxPurchaseDate']
tx_purchase_dates['NextPurchaseDay'] = purchase_gap.dt.days


In [None]:
tx_purchase_dates

In [None]:
# Attach NextPurchaseDay to tx_user with a left join on CustomerID
next_purchase = tx_purchase_dates[['CustomerID', 'NextPurchaseDay']]
tx_user = tx_user.merge(next_purchase, on='CustomerID', how='left')

# Preview the first rows
tx_user.head()


In [None]:
# Replace missing 'NextPurchaseDay' values with the sentinel 999.
# Only that column is filled: a frame-wide fillna(999) would also overwrite
# missing values in other columns (for example a missing CustomerID) with 999.
tx_user['NextPurchaseDay'] = tx_user['NextPurchaseDay'].fillna(999)


In [None]:
tx_user


In [None]:
# Recency: whole days between a customer's last purchase and the most recent
# last-purchase date among all customers in tx_last_purchase
latest_purchase = tx_last_purchase['MaxPurchaseDate'].max()
tx_last_purchase['Recency'] = (latest_purchase - tx_last_purchase['MaxPurchaseDate']).dt.days

# Attach Recency to tx_user (merge's default join on CustomerID)
recency = tx_last_purchase[['CustomerID', 'Recency']]
tx_user = tx_user.merge(recency, on='CustomerID')

# Preview the first rows
tx_user.head()


In [None]:
#describe data 
tx_user.Recency.describe()

### Number of Clusters

In [None]:
# Elbow plot: K-Means SSE (inertia) on Recency for k = 1..9
import plotly.graph_objs as go
import plotly.offline as pyoff
from sklearn.cluster import KMeans

# sse[k] holds the inertia for k clusters; index 0 is never filled
sse = [0] * 10

# Copy the Recency column of tx_user so that adding 'clusters' below does not
# write into a slice of tx_user
tx_recency = tx_user[['Recency']].copy()

for k in range(1, 10):
    # Fit on the Recency column only. tx_recency gains a 'clusters' column in
    # this loop; fitting on the whole frame would feed the previous iteration's
    # labels back in as a second feature and distort the SSE values.
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(tx_recency[['Recency']])
    tx_recency["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_

# Line of SSE against k (skipping the unused index 0)
plot_data = [
    go.Scatter(
        x = list(range(1, 10)),
        y = sse[1:10]
    )
]

# Set the plot layout
plot_layout = go.Layout(
    title = 'Number of Clusters of Recency in Q2 and Q3',
    xaxis = {'title': 'Number of clusters'},
    yaxis = {'title': 'SSE'}
)

# Create the plot figure
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.plot(fig)



In [None]:
# Assign every customer to one of 4 K-Means clusters computed on Recency.
# No random_state is passed, so the clustering may differ between runs.
recency_values = tx_user[['Recency']]
kmeans = KMeans(n_clusters=4)
kmeans.fit(recency_values)
tx_user['RecencyCluster'] = kmeans.predict(recency_values)

def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Relabel clusters so that their ids follow the order of a column's mean.

    The mean of ``target_field_name`` is computed per value of
    ``cluster_field_name``. The clusters are sorted by that mean and renumbered
    0, 1, 2, ... in sorted order.

    Args:
        cluster_field_name: Name of the column holding the current cluster ids.
        target_field_name: Name of the column whose per-cluster mean defines
            the ordering.
        df: DataFrame containing both columns. It is not modified.
        ascending: If true, the cluster with the smallest mean becomes 0;
            otherwise the cluster with the largest mean becomes 0.

    Returns:
        A new DataFrame with the same rows, in which ``cluster_field_name``
        holds the new ordered ids. The cluster column is moved to the last
        position and the row index is the fresh one produced by ``pd.merge``.
    """
    cluster_means = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    ranked = cluster_means.sort_values(by=target_field_name, ascending=ascending).reset_index(drop=True)

    # After the reset, the positional index of each row is the new cluster id
    ranked['index'] = ranked.index

    # Map old ids to new ids through a join, then swap the columns
    relabelled = pd.merge(df, ranked[[cluster_field_name, 'index']], on=cluster_field_name)
    relabelled = relabelled.drop(columns=[cluster_field_name])
    return relabelled.rename(columns={'index': cluster_field_name})

print(tx_user)
# Renumber the Recency clusters by descending mean Recency: the cluster with
# the largest mean Recency becomes 0 and the one with the smallest mean gets
# the highest id.
tx_user = order_cluster('RecencyCluster', 'Recency', tx_user, ascending=False)
tx_user.groupby('RecencyCluster')['Recency'].describe()

In [None]:
# Frequency: number of tx_12m rows with a non-null InvoiceDate per customer
tx_frequency = (
    tx_12m.groupby('CustomerID')['InvoiceDate']
    .count()
    .reset_index()
    .rename(columns={'InvoiceDate': 'Frequency'})
)
tx_frequency.head()

# Attach Frequency to tx_user (merge's default join on CustomerID)
tx_user = tx_user.merge(tx_frequency, on='CustomerID')
tx_user.head()
tx_user['Frequency'].describe()



In [None]:
# Elbow plot for Frequency (k = 1..9), then 4 ordered Frequency clusters
import plotly.graph_objs as go
import plotly.offline as pyoff
from sklearn.cluster import KMeans

# sse[k] holds the inertia for k clusters; index 0 is never filled
sse = [0] * 10

# Copy the Frequency column of tx_user so that adding 'clusters' below does
# not write into a slice of tx_user (this rebinds tx_frequency)
tx_frequency = tx_user[['Frequency']].copy()

for k in range(1, 10):
    # Fit on the Frequency column only. tx_frequency gains a 'clusters' column
    # in this loop; fitting on the whole frame would feed the previous
    # iteration's labels back in as a second feature and distort the SSE values.
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(tx_frequency[['Frequency']])
    tx_frequency["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_

# Line of SSE against k (skipping the unused index 0)
plot_data = [
    go.Scatter(
        x = list(range(1, 10)),
        y = sse[1:10]
    )
]

# Set the plot layout
plot_layout = go.Layout(
    title = 'Number of Clusters of Frequency in Q2 and Q3',
    xaxis = {'title': 'Number of clusters'},
    yaxis = {'title': 'SSE'}
)

# Create the plot figure
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.plot(fig)

# Assign every customer to one of 4 K-Means clusters computed on Frequency
kmeans = KMeans(n_clusters = 4)
kmeans.fit(tx_user[['Frequency']])
tx_user['FrequencyCluster'] = kmeans.predict(tx_user[['Frequency']])

# Renumber by ascending mean Frequency: the lowest-frequency cluster becomes 0
tx_user = order_cluster('FrequencyCluster', 'Frequency', tx_user, True)
tx_user.groupby('FrequencyCluster')['Frequency'].describe()

In [None]:
# Monetary value: revenue of each row is Price times Quantity, summed per customer
tx_12m['Revenue'] = tx_12m['Quantity'] * tx_12m['Price']
tx_revenue = tx_12m.groupby('CustomerID')['Revenue'].sum().reset_index()

# Attach Revenue to tx_user with a left join on CustomerID
tx_user = tx_user.merge(tx_revenue, on='CustomerID', how='left')
tx_user.head()
tx_user['Revenue'].describe()



In [None]:
# Elbow plot for Revenue (k = 1..9), then 4 ordered Revenue clusters
import plotly.graph_objs as go
import plotly.offline as pyoff
from sklearn.cluster import KMeans

# sse[k] holds the inertia for k clusters; index 0 is never filled
sse = [0] * 10

# Copy the Revenue column of tx_user so that adding 'clusters' below does not
# write into a slice of tx_user (this rebinds tx_revenue)
tx_revenue = tx_user[['Revenue']].copy()

for k in range(1, 10):
    # Fit on the Revenue column only. tx_revenue gains a 'clusters' column in
    # this loop; fitting on the whole frame would feed the previous iteration's
    # labels back in as a second feature and distort the SSE values.
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(tx_revenue[['Revenue']])
    tx_revenue["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_

# Line of SSE against k (skipping the unused index 0)
plot_data = [
    go.Scatter(
        x = list(range(1, 10)),
        y = sse[1:10]
    )
]

# Set the plot layout
plot_layout = go.Layout(
    title = 'Number of Clusters of Revenue in Q2 and Q3',
    xaxis = {'title': 'Number of clusters'},
    yaxis = {'title': 'SSE'}
)

# Create the plot figure
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.plot(fig)

# Assign every customer to one of 4 K-Means clusters computed on Revenue
kmeans = KMeans(n_clusters = 4)
kmeans.fit(tx_user[['Revenue']])
tx_user['RevenueCluster'] = kmeans.predict(tx_user[['Revenue']])

# Renumber by ascending mean Revenue: the lowest-revenue cluster becomes 0
tx_user = order_cluster('RevenueCluster', 'Revenue', tx_user, True)
tx_user.groupby('RevenueCluster')['Revenue'].describe()

In [None]:
tx_user

In [None]:
# Overall score: sum of the three ordered cluster ids
tx_user['OverallScore'] = tx_user['RecencyCluster'] + tx_user['FrequencyCluster'] + tx_user['RevenueCluster']

# Mean Recency / Frequency / Revenue per score. The columns are selected with a
# list: selecting several groupby columns with a bare tuple of names is
# rejected by pandas 2.x.
tx_user.groupby('OverallScore')[['Recency', 'Frequency', 'Revenue']].mean()

# Number of customers per score
tx_user.groupby('OverallScore')['Recency'].count()

# Segment from the score: 0-2 -> 'Low-Value', 3-4 -> 'Mid-Value', above 4 -> 'High-Value'
tx_user['Segment'] = 'Low-Value'
tx_user.loc[tx_user['OverallScore'] > 2, 'Segment'] = 'Mid-Value'
tx_user.loc[tx_user['OverallScore'] > 4, 'Segment'] = 'High-Value'

In [None]:
tx_user

## ADDING NEW FEATURES TO MODEL

In [None]:
# Purchase-day history per customer, used to build the day-difference features.

# Take an explicit copy of the two needed columns so that adding columns below
# does not write into a slice of tx_12m
tx_day_order = tx_12m[['CustomerID', 'InvoiceDate']].copy()

# Purchase day truncated to midnight. normalize() keeps the datetime64 dtype,
# so the shifted columns below hold NaT where there is no earlier purchase and
# date differences support `.dt.days`. (`.dt.date` would give object dtype.)
tx_day_order['InvoiceDay'] = pd.to_datetime(tx_day_order['InvoiceDate']).dt.normalize()

tx_day_order = tx_day_order.sort_values(['CustomerID', 'InvoiceDate'])

# Keep one row per customer and day (several purchases on a day count once)
tx_day_order = tx_day_order.drop_duplicates(subset = ['CustomerID', 'InvoiceDay'], keep = 'first')

# For each purchase day, the customer's 1st, 2nd and 3rd preceding purchase day
tx_day_order['PrevInvoiceDate'] = tx_day_order.groupby('CustomerID')['InvoiceDay'].shift(1)
tx_day_order['T2InvoiceDate'] = tx_day_order.groupby('CustomerID')['InvoiceDay'].shift(2)
tx_day_order['T3InvoiceDate'] = tx_day_order.groupby('CustomerID')['InvoiceDay'].shift(3)

tx_day_order.head()


In [None]:
# Whole days between each purchase day and the same customer's 1st, 2nd and
# 3rd preceding purchase day (so four purchase days are involved in total)
for diff_col, earlier_col in (
    ('DayDiff', 'PrevInvoiceDate'),
    ('DayDiff2', 'T2InvoiceDate'),
    ('DayDiff3', 'T3InvoiceDate'),
):
    tx_day_order[diff_col] = (tx_day_order['InvoiceDay'] - tx_day_order[earlier_col]).dt.days

tx_day_order.head()

In [None]:
# Mean and standard deviation of DayDiff per customer
day_diff_by_customer = tx_day_order.groupby('CustomerID')['DayDiff']
tx_day_diff = day_diff_by_customer.agg(['mean', 'std']).reset_index()
tx_day_diff.columns = ['CustomerID', 'DayDiffMean', 'DayDiffStd']
tx_day_diff.head()

In [None]:
# Keep a single row per customer: the last one in tx_day_order's current row order.
# tx_day_order was sorted by CustomerID and InvoiceDate when it was built, so,
# provided that order is unchanged, this is the customer's latest purchase day.
tx_day_order_last = tx_day_order.drop_duplicates(subset = ['CustomerID'], keep ='last')
tx_day_order_last.head()

In [None]:
# Drop rows with any missing value (this removes customers whose last row has
# no 3rd preceding purchase day), then add the per-customer DayDiff statistics
tx_day_order_last = tx_day_order_last.dropna().merge(tx_day_diff, on='CustomerID')

# Attach the day-difference features to tx_user; merge's default join keeps
# only customers present in both frames
feature_cols = ['CustomerID', 'DayDiff', 'DayDiff2', 'DayDiff3', 'DayDiffMean', 'DayDiffStd']
tx_user = tx_user.merge(tx_day_order_last[feature_cols], on='CustomerID')

tx_user.head()

In [None]:
tx_day_order_last


In [None]:
tx_user

In [None]:
# Build the modelling frame: a copy of tx_user passed through pd.get_dummies
# (presumably to one-hot encode the string 'Segment' column; confirm with tx_class.info())
import pandas as pd
tx_class = pd.get_dummies(tx_user.copy())

tx_class.info()

Adjust the threshold


In [None]:

# Binary label: 0 when NextPurchaseDay is greater than 90, otherwise 1
beyond_threshold = tx_class['NextPurchaseDay'] > 90
tx_class['NextPurchaseDayRange'] = (~beyond_threshold).astype('int64')

# Class balance of the label
tx_class['NextPurchaseDayRange'].value_counts()




In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlation of all tx_class columns, drawn as an annotated heatmap
all_columns = tx_class.columns
corr = tx_class[all_columns].corr()
plt.figure(figsize=(30, 20))
sns.heatmap(corr, annot=True, linewidths=0.2, fmt='.2f')



In [None]:
# Remove the raw day count (the label was derived from it), then separate the
# label y from the feature matrix X
tx_class = tx_class.drop(columns='NextPurchaseDay')
y = tx_class['NextPurchaseDayRange']
X = tx_class.drop(columns='NextPurchaseDayRange')
# NOTE(review): CustomerID is never dropped, so the identifier is still a
# column of X and is used as a model feature. Confirm this is intended.

Change the random state


In [None]:
# Import libraries
from datetime import datetime, timedelta, date
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=52)

# Candidate classifiers, all with their default settings
models = [
    ("LR", LogisticRegression()),
    ('NB', GaussianNB()),
    ('RF', RandomForestClassifier()),
    ('SVC', SVC()),
    ('Dtree', DecisionTreeClassifier()),
    ('XGB', xgb.XGBClassifier()),
    ('KNN', KNeighborsClassifier()),
]

# 2-fold cross-validated accuracy of every candidate on the training split
kfold = KFold(n_splits=2)
for name, model in models:
    cv_result = cross_val_score(model, X_train, y_train, cv=kfold, scoring="accuracy")
    print(name, cv_result)

# Fit the XGB model with hand-picked hyper-parameters and report its accuracy
ltv_xgb_model = xgb.XGBClassifier(learning_rate=0.01, max_depth=3, n_estimators=50).fit(X_train, y_train)

train_accuracy = ltv_xgb_model.score(X_train, y_train)
print(f'Accuracy of XGB Classifier on training set: {train_accuracy:.2f}')
test_accuracy = ltv_xgb_model.score(X_test[X_train.columns], y_test)
print(f'Accuracy of XGB Classifier on test set : {test_accuracy:.2f}')


              

Results

In [None]:
# Print the classification report of ltv_xgb_model's predictions on the
# current X_test against y_test. (confusion_matrix is imported but not used
# in this cell.)
from sklearn.metrics import classification_report, confusion_matrix
y_pred = ltv_xgb_model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Refit a default XGBClassifier on 31 different 80/20 splits (random_state 30..60)
# and print train/test accuracy for each. This rebinds X_train, X_test,
# y_train and y_test on every iteration.
for state in range(30, 61):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=state)
    xgb_model = xgb.XGBClassifier()
    xgb_model.fit(X_train, y_train)
    print('Random State:', state)
    train_accuracy = xgb_model.score(X_train, y_train)
    print(f'Accuracy of XGB Classifier on training set: {train_accuracy:.2f}')
    test_accuracy = xgb_model.score(X_test[X_train.columns], y_test)
    print(f'Accuracy of XGB Classifier on test set : {test_accuracy:.2f}')


In [None]:
# Average train/test accuracy of one XGB configuration over repeated random splits.
xgb_model = XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators=50)

# Seed NumPy's global generator before the loop; train_test_split is called
# below without a random_state of its own
np.random.seed(52)

# Number of random 80/20 splits to evaluate
n_iterations = 100

# Accuracy per split
train_scores = []
test_scores = []

for iteration in range(n_iterations):
    # Fresh random split, then refit the same estimator object on it
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    xgb_model.fit(X_train, y_train)

    # Accuracy on the data it was fitted on and on the held-out part
    train_score = xgb_model.score(X_train, y_train)
    test_score = xgb_model.score(X_test, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)

# Mean accuracy across all splits
mean_train_score = np.mean(train_scores)
mean_test_score = np.mean(test_scores)

print(f"Mean accuracy score for training set over {n_iterations} iterations: {mean_train_score:.4f}")
print(f"Mean accuracy score for testing set over {n_iterations} iterations: {mean_test_score:.4f}")

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: 3 x 3 x 3 = 27 combinations
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.3],
)

# Grid search with cv=5 scored on accuracy, fitted on the current X_train / y_train
xgb_model = XGBClassifier()
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print('Best parameters:', grid_search.best_params_)
print('Best accuracy score:', grid_search.best_score_)
