In [84]:
# Import findspark
import findspark
findspark.init()

In [85]:
# Start Spark session
from pyspark.sql import SparkSession
from sklearn.decomposition import PCA
spark = SparkSession.builder.appName("knnLearning").getOrCreate()

In [86]:

from pyspark import SparkFiles
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler
file = "./Resources/online_shoppers_intention.csv"
spark.sparkContext.addFile(file)
df = spark.read.csv(SparkFiles.get("online_shoppers_intention.csv"), sep=",", header=True)

# Show DataFrame
df.show()

+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------------+-------+-------+
|Administrative|Administrative_Duration|Informational|Informational_Duration|ProductRelated|ProductRelated_Duration|BounceRates|  ExitRates|PageValues|SpecialDay|Month|OperatingSystems|Browser|Region|TrafficType|      VisitorType|Weekend|Revenue|
+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------------+-------+-------+
|             0|                      0|            0|                     0|             1|                      0|        0.2|        0.2|         0|         0|  Feb|               1|      1|     1|          1|Returning_Visitor|  FALSE|  FALSE|
|           

In [87]:
df.columns

['Administrative',
 'Administrative_Duration',
 'Informational',
 'Informational_Duration',
 'ProductRelated',
 'ProductRelated_Duration',
 'BounceRates',
 'ExitRates',
 'PageValues',
 'SpecialDay',
 'Month',
 'OperatingSystems',
 'Browser',
 'Region',
 'TrafficType',
 'VisitorType',
 'Weekend',
 'Revenue']

In [88]:
df.createOrReplaceTempView("online_shoppers_intention")

In [89]:
spark.sql(
"""
SELECT 
* 
FROM 
online_shoppers_intention 
LIMIT 10
""").show()

+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------------+-------+-------+
|Administrative|Administrative_Duration|Informational|Informational_Duration|ProductRelated|ProductRelated_Duration|BounceRates|  ExitRates|PageValues|SpecialDay|Month|OperatingSystems|Browser|Region|TrafficType|      VisitorType|Weekend|Revenue|
+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------------+-------+-------+
|             0|                      0|            0|                     0|             1|                      0|        0.2|        0.2|         0|         0|  Feb|               1|      1|     1|          1|Returning_Visitor|  FALSE|  FALSE|
|           

In [90]:
df = df.toPandas()
y = df["Revenue"]
X = df.drop(columns=["Revenue"])

In [91]:
X[:5]

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0,0,0,1,0.0,0.2,0.2,0,0,Feb,1,1,1,1,Returning_Visitor,False
1,0,0,0,0,2,64.0,0.0,0.1,0,0,Feb,2,2,1,2,Returning_Visitor,False
2,0,0,0,0,1,0.0,0.2,0.2,0,0,Feb,4,1,9,3,Returning_Visitor,False
3,0,0,0,0,2,2.666666667,0.05,0.14,0,0,Feb,3,2,2,4,Returning_Visitor,False
4,0,0,0,0,10,627.5,0.02,0.05,0,0,Feb,3,3,1,4,Returning_Visitor,True


In [92]:
y[:5]

0    FALSE
1    FALSE
2    FALSE
3    FALSE
4    FALSE
Name: Revenue, dtype: object

In [93]:
pca_cols = [col for col in X.columns if col not in ["ProductRelated", "ProductRelated_Duration",
                                                     "SpecialDay", "Month",	"OperatingSystems",	"Browser", "Region",	
                                                     "TrafficType",	"VisitorType", "Weekend"
                                                     ]]
pca_df = X[pca_cols]
scaler = StandardScaler()
pca_scaled = scaler.fit_transform(pca_df)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(pca_scaled)
pca.explained_variance_ratio_

array([0.37119637, 0.22898014])

In [94]:
pca_cols = [f'PC{i+1}' for i in range(X_pca.shape[1])]
df_pca = pd.DataFrame(X_pca, columns=pca_cols, index=df.index)

In [95]:
cols_to_keep = ["ProductRelated", "ProductRelated_Duration",
                "SpecialDay", "Month",	"OperatingSystems",	"Browser", "Region",	
                "TrafficType",	"VisitorType", "Weekend"
              ] 
X = pd.concat([df_pca, X[cols_to_keep]], axis=1)

In [96]:
X = pd.get_dummies(X, drop_first=True)

In [97]:
X.head(5)

Unnamed: 0,PC1,PC2,ProductRelated_1,ProductRelated_10,ProductRelated_100,ProductRelated_101,ProductRelated_102,ProductRelated_103,ProductRelated_104,ProductRelated_105,...,TrafficType_3,TrafficType_4,TrafficType_5,TrafficType_6,TrafficType_7,TrafficType_8,TrafficType_9,VisitorType_Other,VisitorType_Returning_Visitor,Weekend_TRUE
0,3.715813,3.166276,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,1.108562,-0.105538,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,3.715813,3.166276,True,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False
3,1.89892,0.868236,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,True,False
4,0.815107,-0.401637,False,True,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,True,True


In [98]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [99]:
# Create the StandardScaler instance
scaler = StandardScaler()
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [100]:
# Train KNN with custom weights
model = KNeighborsClassifier(n_neighbors=3, weights="distance")


In [101]:
# Train the model
model.fit(X_train_scaled, y_train)

In [102]:
# Create predictions
y_pred = model.predict(X_test_scaled)

# Review the predictions
y_pred

array(['FALSE', 'FALSE', 'FALSE', ..., 'FALSE', 'FALSE', 'FALSE'],
      dtype=object)

In [103]:
# Print confusion matrix
confusion_matrix(y_pred,y_test)

array([[2046,  379],
       [  38,    3]], dtype=int64)

In [104]:
# Print classification report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       FALSE       0.98      0.84      0.91      2425
        TRUE       0.01      0.07      0.01        41

    accuracy                           0.83      2466
   macro avg       0.49      0.46      0.46      2466
weighted avg       0.97      0.83      0.89      2466

