In [1]:
# tensorflow neural network deep learning
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# Check if TensorFlow can access GPU hardware acceleration
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)
else:
    print("No GPU found, using CPU for training.")

In [2]:
tf_df = pd.read_csv("./Resources/online_shoppers_intention.csv")
tf_df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [3]:
tf_df.nunique()

Administrative               27
Administrative_Duration    3335
Informational                17
Informational_Duration     1258
ProductRelated              311
ProductRelated_Duration    9551
BounceRates                1872
ExitRates                  4777
PageValues                 2704
SpecialDay                    6
Month                        10
OperatingSystems              8
Browser                      13
Region                        9
TrafficType                  20
VisitorType                   3
Weekend                       2
Revenue                       2
dtype: int64

In [4]:
tf_df["TrafficType"].value_counts()
traffic_type_to_replace = list(tf_df['TrafficType'].value_counts()[tf_df['TrafficType'].value_counts() < 200].index)

# Replace in dataframe
for type in traffic_type_to_replace:
    tf_df['TrafficType'] = tf_df['TrafficType'].replace(type,"Other")

# Check to make sure replacement was successful
tf_df['TrafficType'].value_counts()

TrafficType
2        3913
1        2451
3        2052
4        1069
13        738
10        450
6         444
Other     363
8         343
5         260
11        247
Name: count, dtype: int64

In [5]:
tf_df['ProductRelated'].value_counts()[tf_df['ProductRelated'].value_counts() > 1]
product_to_replace = list(tf_df['ProductRelated'].value_counts()[tf_df['ProductRelated'].value_counts() < 200].index)

# Replace in dataframe
for type in product_to_replace:
    tf_df['ProductRelated'] = tf_df['ProductRelated'].replace(type,"Other")

# Check to make sure replacement was successful
tf_df['ProductRelated'].value_counts()

ProductRelated
Other    5422
1         622
2         465
3         458
4         404
6         396
7         391
5         382
8         370
10        330
9         317
12        313
11        308
13        289
15        270
16        260
14        251
17        226
20        225
19        218
22        213
18        200
Name: count, dtype: int64

In [6]:
# Convert categorical data to numeric with `pd.get_dummies`
dummy_df = pd.get_dummies(tf_df, columns=["ProductRelated", "Month", "OperatingSystems", "Browser", "Region", "TrafficType", "VisitorType"], drop_first=True)
dummy_df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Weekend,...,TrafficType_4,TrafficType_5,TrafficType_6,TrafficType_8,TrafficType_10,TrafficType_11,TrafficType_13,TrafficType_Other,VisitorType_Other,VisitorType_Returning_Visitor
0,0,0.0,0,0.0,0.0,0.2,0.2,0.0,0.0,False,...,False,False,False,False,False,False,False,False,False,True
1,0,0.0,0,0.0,64.0,0.0,0.1,0.0,0.0,False,...,False,False,False,False,False,False,False,False,False,True
2,0,0.0,0,0.0,0.0,0.2,0.2,0.0,0.0,False,...,False,False,False,False,False,False,False,False,False,True
3,0,0.0,0,0.0,2.666667,0.05,0.14,0.0,0.0,False,...,True,False,False,False,False,False,False,False,False,True
4,0,0.0,0,0.0,627.5,0.02,0.05,0.0,0.0,True,...,True,False,False,False,False,False,False,False,False,True


In [7]:
# Split our preprocessed data into our features and target arrays
X = dummy_df.drop(columns=["Revenue"]).values
y = dummy_df["Revenue"]
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)

In [8]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
tf_model = tf.keras.models.Sequential()

In [10]:
tf_model.add(tf.keras.Input(shape=((X_train_scaled.shape[1],))))
tf_model.add(tf.keras.layers.Dense(64, activation='relu'))
tf_model.add(tf.keras.layers.Dense(32, activation='relu'))
tf_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
tf_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                5120      
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 7,233
Trainable params: 7,233
Non-trainable params: 0
_________________________________________________________________


In [11]:
tf_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [12]:
tf_model.fit(X_train_scaled, y_train, epochs=200, batch_size=1024)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x275654cace0>

In [13]:
# Evaluate the model using the test data
model_loss, model_accuracy = tf_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

97/97 - 0s - loss: 0.5154 - accuracy: 0.8625 - 413ms/epoch - 4ms/step
Loss: 0.5154005885124207, Accuracy: 0.8624716401100159


In [14]:
# Classification report
from sklearn.metrics import classification_report
import numpy as np

# Get predicted probabilities and convert to class labels (0 or 1)
y_pred_probs = tf_model.predict(X_test_scaled)
y_pred = np.round(y_pred_probs).astype(int)  # Apply 0.5 threshold

# Generate classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

       False       0.92      0.92      0.92      2606
        True       0.56      0.55      0.55       477

    accuracy                           0.86      3083
   macro avg       0.74      0.73      0.73      3083
weighted avg       0.86      0.86      0.86      3083

