In [168]:
# Import findspark
import findspark
findspark.init()

In [169]:
# Start Spark session
from pyspark.sql import SparkSession
from sklearn.decomposition import PCA
spark = SparkSession.builder.appName("knnLearning").getOrCreate()

In [170]:

from pyspark import SparkFiles
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler
file = "./Resources/online_shoppers_intention.csv"
spark.sparkContext.addFile(file)
df = spark.read.csv(SparkFiles.get("online_shoppers_intention.csv"), sep=",", header=True)

# Show DataFrame
df.show()

+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------------+-------+-------+
|Administrative|Administrative_Duration|Informational|Informational_Duration|ProductRelated|ProductRelated_Duration|BounceRates|  ExitRates|PageValues|SpecialDay|Month|OperatingSystems|Browser|Region|TrafficType|      VisitorType|Weekend|Revenue|
+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------------+-------+-------+
|             0|                      0|            0|                     0|             1|                      0|        0.2|        0.2|         0|         0|  Feb|               1|      1|     1|          1|Returning_Visitor|  FALSE|  FALSE|
|           

In [171]:
df.columns

['Administrative',
 'Administrative_Duration',
 'Informational',
 'Informational_Duration',
 'ProductRelated',
 'ProductRelated_Duration',
 'BounceRates',
 'ExitRates',
 'PageValues',
 'SpecialDay',
 'Month',
 'OperatingSystems',
 'Browser',
 'Region',
 'TrafficType',
 'VisitorType',
 'Weekend',
 'Revenue']

In [172]:
df.createOrReplaceTempView("online_shoppers_intention")

In [173]:
spark.sql(
"""
SELECT 
* 
FROM 
online_shoppers_intention 
LIMIT 10
""").show()

+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------------+-------+-------+
|Administrative|Administrative_Duration|Informational|Informational_Duration|ProductRelated|ProductRelated_Duration|BounceRates|  ExitRates|PageValues|SpecialDay|Month|OperatingSystems|Browser|Region|TrafficType|      VisitorType|Weekend|Revenue|
+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------------+-------+-------+
|             0|                      0|            0|                     0|             1|                      0|        0.2|        0.2|         0|         0|  Feb|               1|      1|     1|          1|Returning_Visitor|  FALSE|  FALSE|
|           

In [174]:
df = df.toPandas()
y = df["Revenue"]
X = df.drop(columns=["Revenue"])

In [175]:
X[:5]

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0,0,0,1,0.0,0.2,0.2,0,0,Feb,1,1,1,1,Returning_Visitor,False
1,0,0,0,0,2,64.0,0.0,0.1,0,0,Feb,2,2,1,2,Returning_Visitor,False
2,0,0,0,0,1,0.0,0.2,0.2,0,0,Feb,4,1,9,3,Returning_Visitor,False
3,0,0,0,0,2,2.666666667,0.05,0.14,0,0,Feb,3,2,2,4,Returning_Visitor,False
4,0,0,0,0,10,627.5,0.02,0.05,0,0,Feb,3,3,1,4,Returning_Visitor,True


In [176]:
y[:5]

0    FALSE
1    FALSE
2    FALSE
3    FALSE
4    FALSE
Name: Revenue, dtype: object

In [177]:
pca_cols = [col for col in X.columns if col not in ["ProductRelated", "ProductRelated_Duration",
                                                     "SpecialDay", "Month",	"OperatingSystems",	"Browser", "Region",	
                                                     "TrafficType",	"VisitorType", "Weekend"
                                                     ]]
pca_df = X[pca_cols]
scaler = StandardScaler()
pca_scaled = scaler.fit_transform(pca_df)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(pca_scaled)
pca.explained_variance_ratio_

array([0.37119637, 0.22898014])

In [178]:
pca_cols = [f'PC{i+1}' for i in range(X_pca.shape[1])]
df_pca = pd.DataFrame(X_pca, columns=pca_cols, index=df.index)

In [179]:
cols_to_keep = ["ProductRelated", "ProductRelated_Duration",
                "SpecialDay", "Month",	"OperatingSystems",	"Browser", "Region",	
                "TrafficType",	"VisitorType", "Weekend"
              ] 
X = pd.concat([df_pca, X[cols_to_keep]], axis=1)

In [180]:
X = pd.get_dummies(X, drop_first=True)

In [181]:
X.head(5)

Unnamed: 0,PC1,PC2,ProductRelated_1,ProductRelated_10,ProductRelated_100,ProductRelated_101,ProductRelated_102,ProductRelated_103,ProductRelated_104,ProductRelated_105,ProductRelated_106,ProductRelated_107,ProductRelated_108,ProductRelated_109,ProductRelated_11,ProductRelated_110,ProductRelated_111,ProductRelated_112,ProductRelated_113,ProductRelated_114,ProductRelated_115,ProductRelated_116,ProductRelated_117,ProductRelated_118,ProductRelated_119,ProductRelated_12,ProductRelated_120,ProductRelated_121,ProductRelated_122,ProductRelated_123,ProductRelated_124,ProductRelated_125,ProductRelated_126,ProductRelated_127,ProductRelated_128,ProductRelated_129,ProductRelated_13,ProductRelated_130,ProductRelated_131,ProductRelated_132,...,Browser_12,Browser_13,Browser_2,Browser_3,Browser_4,Browser_5,Browser_6,Browser_7,Browser_8,Browser_9,Region_2,Region_3,Region_4,Region_5,Region_6,Region_7,Region_8,Region_9,TrafficType_10,TrafficType_11,TrafficType_12,TrafficType_13,TrafficType_14,TrafficType_15,TrafficType_16,TrafficType_17,TrafficType_18,TrafficType_19,TrafficType_2,TrafficType_20,TrafficType_3,TrafficType_4,TrafficType_5,TrafficType_6,TrafficType_7,TrafficType_8,TrafficType_9,VisitorType_Other,VisitorType_Returning_Visitor,Weekend_TRUE
0,3.715813,3.166276,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
1,1.108562,-0.105538,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False
2,3.715813,3.166276,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False
3,1.89892,0.868236,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False
4,0.815107,-0.401637,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,True


In [182]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [183]:
# Create the StandardScaler instance
scaler = StandardScaler()
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [184]:
# Train KNN with custom weights
model = KNeighborsClassifier(n_neighbors=3, weights="distance")


In [185]:
# Train the model
model.fit(X_train_scaled, y_train)

In [186]:
# Create predictions
y_pred = model.predict(X_test_scaled)

# Review the predictions
y_pred

array(['FALSE', 'FALSE', 'FALSE', ..., 'FALSE', 'FALSE', 'FALSE'],
      dtype=object)

In [187]:
# Print confusion matrix
confusion_matrix(y_pred,y_test)

array([[2046,  379],
       [  38,    3]], dtype=int64)

In [188]:
# Print classification report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       FALSE       0.98      0.84      0.91      2425
        TRUE       0.01      0.07      0.01        41

    accuracy                           0.83      2466
   macro avg       0.49      0.46      0.46      2466
weighted avg       0.97      0.83      0.89      2466



In [189]:
# tensorflow neural network deep learning
from sklearn.metrics import accuracy_score
import tensorflow as tf

# Check if TensorFlow can access GPU hardware acceleration
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)
else:
    print("No GPU found, using CPU for training.")

In [190]:
tf_df = pd.read_csv("./Resources/online_shoppers_intention.csv")
tf_df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [191]:
# tf_df.drop(columns=["Administrative", "Administrative_Duration", "Informational", "Informational_Duration",
#                     "BounceRates", "ExitRates", "PageValues"], inplace=True)

In [192]:
tf_df.nunique()

Administrative               27
Administrative_Duration    3335
Informational                17
Informational_Duration     1258
ProductRelated              311
ProductRelated_Duration    9551
BounceRates                1872
ExitRates                  4777
PageValues                 2704
SpecialDay                    6
Month                        10
OperatingSystems              8
Browser                      13
Region                        9
TrafficType                  20
VisitorType                   3
Weekend                       2
Revenue                       2
dtype: int64

In [193]:
tf_df["TrafficType"].value_counts()
traffic_type_to_replace = list(tf_df['TrafficType'].value_counts()[tf_df['TrafficType'].value_counts() < 200].index)

# Replace in dataframe
for type in traffic_type_to_replace:
    tf_df['TrafficType'] = tf_df['TrafficType'].replace(type,"Other")

# Check to make sure replacement was successful
tf_df['TrafficType'].value_counts()

TrafficType
2        3913
1        2451
3        2052
4        1069
13        738
10        450
6         444
Other     363
8         343
5         260
11        247
Name: count, dtype: int64

In [194]:
tf_df['ProductRelated'].value_counts()[tf_df['ProductRelated'].value_counts() > 1]
product_to_replace = list(tf_df['ProductRelated'].value_counts()[tf_df['ProductRelated'].value_counts() < 200].index)

# Replace in dataframe
for type in product_to_replace:
    tf_df['ProductRelated'] = tf_df['ProductRelated'].replace(type,"Other")

# Check to make sure replacement was successful
tf_df['ProductRelated'].value_counts()

ProductRelated
Other    5422
1         622
2         465
3         458
4         404
6         396
7         391
5         382
8         370
10        330
9         317
12        313
11        308
13        289
15        270
16        260
14        251
17        226
20        225
19        218
22        213
18        200
Name: count, dtype: int64

In [195]:
# Convert categorical data to numeric with `pd.get_dummies`
dummy_df = pd.get_dummies(tf_df, columns=["ProductRelated", "Month", "OperatingSystems", "Browser", "Region", "TrafficType", "VisitorType"], drop_first=True)
dummy_df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Weekend,Revenue,ProductRelated_2,ProductRelated_3,ProductRelated_4,ProductRelated_5,ProductRelated_6,ProductRelated_7,ProductRelated_8,ProductRelated_9,ProductRelated_10,ProductRelated_11,ProductRelated_12,ProductRelated_13,ProductRelated_14,ProductRelated_15,ProductRelated_16,ProductRelated_17,ProductRelated_18,ProductRelated_19,ProductRelated_20,ProductRelated_22,ProductRelated_Other,Month_Dec,Month_Feb,Month_Jul,Month_June,Month_Mar,Month_May,Month_Nov,Month_Oct,Month_Sep,OperatingSystems_2,OperatingSystems_3,OperatingSystems_4,OperatingSystems_5,OperatingSystems_6,OperatingSystems_7,OperatingSystems_8,Browser_2,Browser_3,Browser_4,Browser_5,Browser_6,Browser_7,Browser_8,Browser_9,Browser_10,Browser_11,Browser_12,Browser_13,Region_2,Region_3,Region_4,Region_5,Region_6,Region_7,Region_8,Region_9,TrafficType_2,TrafficType_3,TrafficType_4,TrafficType_5,TrafficType_6,TrafficType_8,TrafficType_10,TrafficType_11,TrafficType_13,TrafficType_Other,VisitorType_Other,VisitorType_Returning_Visitor
0,0,0.0,0,0.0,0.0,0.2,0.2,0.0,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
1,0,0.0,0,0.0,64.0,0.0,0.1,0.0,0.0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True
2,0,0.0,0,0.0,0.0,0.2,0.2,0.0,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,True
3,0,0.0,0,0.0,2.666667,0.05,0.14,0.0,0.0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True
4,0,0.0,0,0.0,627.5,0.02,0.05,0.0,0.0,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True


In [196]:
# Split our preprocessed data into our features and target arrays
X = dummy_df.drop(columns=["Revenue"]).values
y = dummy_df["Revenue"]
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)

In [197]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [198]:
tf_model = tf.keras.models.Sequential()

In [199]:
tf_model.add(tf.keras.Input(shape=((X_train_scaled.shape[1],))))
tf_model.add(tf.keras.layers.Dense(64, activation='relu'))
tf_model.add(tf.keras.layers.Dense(32, activation='relu'))
tf_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
tf_model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_13 (Dense)            (None, 64)                5120      
                                                                 
 dense_14 (Dense)            (None, 32)                2080      
                                                                 
 dense_15 (Dense)            (None, 1)                 33        
                                                                 
Total params: 7,233
Trainable params: 7,233
Non-trainable params: 0
_________________________________________________________________


In [200]:
tf_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [201]:
tf_model.fit(X_train_scaled, y_train, epochs=200, batch_size=1024)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x1bf4ec56e90>

In [202]:
# Evaluate the model using the test data
model_loss, model_accuracy = tf_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

97/97 - 0s - loss: 0.4528 - accuracy: 0.8667 - 352ms/epoch - 4ms/step
Loss: 0.45277291536331177, Accuracy: 0.8666883111000061


In [None]:
# Classification report
from sklearn.metrics import classification_report
import numpy as np

# Get predicted probabilities and convert to class labels (0 or 1)
y_pred_probs = tf_model.predict(X_test_scaled)
y_pred = np.round(y_pred_probs).astype(int)  # Apply 0.5 threshold

# Generate classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

       False       0.91      0.94      0.92      2606
        True       0.59      0.47      0.52       477

    accuracy                           0.87      3083
   macro avg       0.75      0.70      0.72      3083
weighted avg       0.86      0.87      0.86      3083

