In [1]:
# All imports required 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split,cross_val_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense,Dropout,BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import pickle as pkl

## Exploratory Data Analysis

In [2]:
df=pd.read_csv(r"C:\Users\dkdes\OneDrive\Desktop\kaggle_datasets\customer_purchase_data.csv")
df.head()

Unnamed: 0,Age,Gender,AnnualIncome,NumberOfPurchases,ProductCategory,TimeSpentOnWebsite,LoyaltyProgram,DiscountsAvailed,PurchaseStatus
0,40,1,66120.267939,8,0,30.568601,0,5,1
1,20,1,23579.773583,4,2,38.240097,0,5,0
2,27,1,127821.306432,11,2,31.633212,1,0,1
3,24,1,137798.62312,19,3,46.167059,0,4,1
4,31,1,99300.96422,19,1,19.823592,0,0,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 1500 non-null   int64  
 1   Gender              1500 non-null   int64  
 2   AnnualIncome        1500 non-null   float64
 3   NumberOfPurchases   1500 non-null   int64  
 4   ProductCategory     1500 non-null   int64  
 5   TimeSpentOnWebsite  1500 non-null   float64
 6   LoyaltyProgram      1500 non-null   int64  
 7   DiscountsAvailed    1500 non-null   int64  
 8   PurchaseStatus      1500 non-null   int64  
dtypes: float64(2), int64(7)
memory usage: 105.6 KB


In [4]:
X=df.drop("PurchaseStatus",axis=1)
y=df["PurchaseStatus"]

## Training ,Testing and scaling

In [5]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [6]:
scaler=ColumnTransformer(transformers=[('tnf1',StandardScaler(),['Age', 'Gender', 'AnnualIncome', 'NumberOfPurchases', 'ProductCategory',
                                     'TimeSpentOnWebsite', 'LoyaltyProgram', 'DiscountsAvailed']),
                                            ],
                                            remainder='passthrough')

In [7]:
X_train_trf = scaler.fit_transform(X_train)
X_test_trf=scaler.transform(X_test)

In [8]:
df.shape

(1500, 9)

## Building ANN model

In [9]:
model=Sequential()
model.add(Dense(32,activation='relu',input_dim=8))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(32,activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(32,activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(1,activation='sigmoid'))

In [10]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 32)                288       
                                                                 
 batch_normalization (Batch  (None, 32)                128       
 Normalization)                                                  
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                1056      
                                                                 
 batch_normalization_1 (Bat  (None, 32)                128       
 chNormalization)                                                
                                                                 
 dropout_1 (Dropout)         (None, 32)                0

In [11]:
model.compile(optimizer="adam",metrics=["accuracy"],loss="binary_crossentropy")

In [12]:
callbacks=EarlyStopping(monitor="val_loss",patience=5)

In [13]:
history=model.fit(X_train_trf,y_train,validation_split=0.2,epochs=100,batch_size=52,callbacks=callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


 #### We are getting 90% accuracy but validation accuracy is only 84% showing that model is Overfitting

## Using Keras Tuner for best Hyperparameters

In [14]:
import keras_tuner as kt
import os
import shutil
tuner_dir = 'my_dir'
if os.path.exists(tuner_dir):
    shutil.rmtree(tuner_dir)

def build_model(hp):
    model = Sequential()
    num_layers = hp.Int("num_layers", min_value=1, max_value=10)
    
    for i in range(num_layers):
        if i == 0:
            model.add(Dense(
                units=hp.Int(f"units_{i}", min_value=16, max_value=128, step=8),
                activation=hp.Choice(f"activation_{i}", values=["relu", "tanh", "sigmoid"]),
                input_dim=8 
            ))
        else:
            model.add(Dense(
                units=hp.Int(f"units_{i}", min_value=16, max_value=128, step=8),
                activation=hp.Choice(f"activation_{i}", values=["relu", "tanh", "sigmoid"])
            ))
        model.add(BatchNormalization())
        model.add(Dropout(rate=hp.Float(f"dropout_{i}", min_value=0.0, max_value=0.9, step=0.1)))
    
    model.add(Dense(1, activation='sigmoid'))  
    model.compile(
        optimizer=hp.Choice("optimizer", values=["adam", "rmsprop", "sgd", "nadam", "adadelta"]),
        loss="binary_crossentropy",  
        metrics=["accuracy"]
    )
    return model


In [15]:
tuner = kt.RandomSearch(
    build_model,
    objective="val_accuracy",
    max_trials=3,
    directory=tuner_dir,
    project_name='my_project'
)

In [16]:
tuner.search(X_train_trf,y_train,epochs=5,validation_data=(X_test_trf,y_test))

Trial 3 Complete [00h 00m 03s]
val_accuracy: 0.8233333230018616

Best val_accuracy So Far: 0.8233333230018616
Total elapsed time: 00h 00m 13s


In [17]:
tuner.get_best_hyperparameters()[0].values

{'num_layers': 3,
 'units_0': 120,
 'activation_0': 'relu',
 'dropout_0': 0.30000000000000004,
 'optimizer': 'rmsprop',
 'units_1': 40,
 'activation_1': 'sigmoid',
 'dropout_1': 0.0,
 'units_2': 16,
 'activation_2': 'relu',
 'dropout_2': 0.5,
 'units_3': 120,
 'activation_3': 'relu',
 'dropout_3': 0.8,
 'units_4': 40,
 'activation_4': 'sigmoid',
 'dropout_4': 0.30000000000000004}

In [18]:
model=tuner.get_best_models(num_models=1)[0]

In [43]:
model.fit(X_train_trf,y_train,epochs=100,validation_split=0.2,callbacks=callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100


<keras.src.callbacks.History at 0x2600851a0b0>

## Training ML algorithms 

In [39]:
lr=LogisticRegression(penalty=None)
lr.fit(X_train_trf,y_train)
y_pred=lr.predict(X_test_trf)
accuracy=accuracy_score(y_test,y_pred)
precision=precision_score(y_test,y_pred)
recall=recall_score(y_test,y_pred)
f1=f1_score(y_test,y_pred)

print(f"accuracy:{accuracy}")
print(f"precison:{precision}")
print(f"recall:{recall}")
print(f"f1:{f1}")
print("Test accuracy:",np.mean(cross_val_score(lr,X_test_trf,y_test,scoring="accuracy",cv=5)))
print("Training accuracy:",np.mean(cross_val_score(lr,X_train_trf,y_train,scoring="accuracy",cv=5)))

accuracy:0.8066666666666666
precison:0.7844827586206896
recall:0.7338709677419355
f1:0.7583333333333333
Test accuracy: 0.79
Training accuracy: 0.8166666666666668


In [31]:
rf=RandomForestClassifier(n_estimators=300)
rf.fit(X_train_trf,y_train)
y_pred=rf.predict(X_test_trf)
accuracy=accuracy_score(y_test,y_pred)
precision=precision_score(y_test,y_pred)
recall=recall_score(y_test,y_pred)
f1=f1_score(y_test,y_pred)

print(f"accuracy:{accuracy}")
print(f"precison:{precision}")
print(f"recall:{recall}")
print(f"f1:{f1}")
print("Test accuracy:",np.mean(cross_val_score(rf,X_test_trf,y_test,scoring="accuracy",cv=5)))
print("Training accuracy:",np.mean(cross_val_score(rf,X_train_trf,y_train,scoring="accuracy",cv=5)))

accuracy:0.9333333333333333
precison:0.9482758620689655
recall:0.8870967741935484
f1:0.9166666666666666
Test accuracy: 0.85
Training accuracy: 0.9266666666666667


In [32]:
svm=SVC(C=0.5,kernel="rbf")
svm.fit(X_train_trf,y_train)
y_pred=svm.predict(X_test_trf)
accuracy=accuracy_score(y_test,y_pred)
precision=precision_score(y_test,y_pred)
recall=recall_score(y_test,y_pred)
f1=f1_score(y_test,y_pred)

print(f"accuracy:{accuracy}")
print(f"precison:{precision}")
print(f"recall:{recall}")
print(f"f1:{f1}")
print("Test accuracy:",np.mean(cross_val_score(svm,X_test_trf,y_test,scoring="accuracy",cv=5)))
print("Training accuracy:",np.mean(cross_val_score(svm,X_train_trf,y_train,scoring="accuracy",cv=5)))

accuracy:0.86
precison:0.8596491228070176
recall:0.7903225806451613
f1:0.8235294117647058
Test accuracy: 0.79
Training accuracy: 0.8558333333333333


In [None]:
# Creating Pipeline for app deployment on streamlit
pipeline = Pipeline([
    ('scaler', scaler),
    ('classifier', rf)
])

In [None]:
with open("Customer_Purchase_model2.pkl","wb") as file:
    pkl.dump(pipeline,file)