In [1]:
import csv
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

In [2]:
## 2. Load Data
def load_data(filename):
    """
    Load shopping data from a CSV file and convert it into evidence and labels.

    Each CSV row becomes one evidence list of 17 numeric values, in this order:
    Administrative (int), Administrative_Duration (float), Informational (int),
    Informational_Duration (float), ProductRelated (int),
    ProductRelated_Duration (float), BounceRates (float), ExitRates (float),
    PageValues (float), SpecialDay (float), Month (int, 0 = Jan ... 11 = Dec),
    OperatingSystems (int), Browser (int), Region (int), TrafficType (int),
    VisitorType (1 for "Returning_Visitor", else 0), Weekend (1 for true, else 0).

    The label for a row is 1 when its Revenue column is true, otherwise 0.

    Returns:
        A tuple (evidence, labels) of two equally long lists.

    Raises:
        KeyError: if an expected column is missing or the month name is unknown.
        ValueError: if a numeric column cannot be parsed.
    """
    evidence = []
    labels = []

    # Both "Jun" and "June" map to 5 so that either spelling of the month is accepted.
    month_mapping = {
        "Jan": 0, "Feb": 1, "Mar": 2, "Apr": 3, "May": 4, "Jun": 5, "June": 5,
        "Jul": 6, "Aug": 7, "Sep": 8, "Oct": 9, "Nov": 10, "Dec": 11
    }

    def is_true(value):
        # Accept "TRUE", "True", "true" (optionally padded); anything else,
        # including a missing value, counts as false.
        return isinstance(value, str) and value.strip().upper() == "TRUE"

    # newline='' lets the csv module do its own newline handling, as its documentation requires.
    with open(filename, 'r', newline='') as file:
        reader = csv.DictReader(file)
        for row in reader:
            evidence.append([
                int(row["Administrative"]),
                float(row["Administrative_Duration"]),
                int(row["Informational"]),
                float(row["Informational_Duration"]),
                int(row["ProductRelated"]),
                float(row["ProductRelated_Duration"]),
                float(row["BounceRates"]),
                float(row["ExitRates"]),
                float(row["PageValues"]),
                float(row["SpecialDay"]),
                month_mapping[row["Month"]],
                int(row["OperatingSystems"]),
                int(row["Browser"]),
                int(row["Region"]),
                int(row["TrafficType"]),
                1 if row["VisitorType"] == "Returning_Visitor" else 0,
                1 if is_true(row["Weekend"]) else 0
            ])
            labels.append(1 if is_true(row["Revenue"]) else 0)

    return evidence, labels

# Parse shopping.csv (resolved relative to the notebook's working directory)
# into per-row feature lists (`evidence`) and 0/1 purchase labels (`labels`),
# then report how many rows were read.
evidence, labels = load_data("shopping.csv")
print(f"Loaded {len(evidence)} data points.")

Loaded 12330 data points.


In [3]:
## 3. Split Data
# Split data into training and testing sets
# TEST_SIZE is the fraction of rows held out for testing.
TEST_SIZE = 0.4
# Fixed seed so that Restart & Run All reproduces the same shuffle, and
# therefore the same downstream metrics, on every run.
RANDOM_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    evidence, labels, test_size=TEST_SIZE, random_state=RANDOM_SEED
)

print(f"Training data points: {len(X_train)}")
print(f"Testing data points: {len(X_test)}")

Training data points: 7398
Testing data points: 4932


In [4]:
## 4. Train the Model
def train_model(evidence, labels, n_neighbors=1):
    """
    Train a k-nearest neighbors model on the given evidence and labels.

    Args:
        evidence: sequence of feature rows, one per sample.
        labels: sequence of target labels, aligned with `evidence`.
        n_neighbors: the k used by the classifier. Defaults to 1, which
            matches the original hard-coded behaviour.

    Returns:
        The fitted KNeighborsClassifier.
    """
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(evidence, labels)
    return model

# Fit the nearest-neighbour classifier on the training split only; the test
# split is kept aside for the evaluation cells below.
model = train_model(X_train, y_train)

In [5]:
## 5. Evaluate the Model

def evaluate(labels, predictions):
    """
    Compute (sensitivity, specificity) for binary labels.

    Sensitivity is the share of actual positives (label 1) that were predicted
    as 1; specificity is the share of actual negatives (label 0) that were
    predicted as 0. A rate whose denominator is zero is reported as 0.
    """
    # Count the correctly identified positives and negatives pair by pair.
    hits_pos = 0
    hits_neg = 0
    for truth, guess in zip(labels, predictions):
        if truth == 1 and guess == 1:
            hits_pos += 1
        elif truth == 0 and guess == 0:
            hits_neg += 1

    # Count how many positives and negatives exist among the true labels.
    n_pos = 0
    n_neg = 0
    for truth in labels:
        if truth == 1:
            n_pos += 1
        elif truth == 0:
            n_neg += 1

    if n_pos:
        sensitivity = hits_pos / n_pos
    else:
        sensitivity = 0

    if n_neg:
        specificity = hits_neg / n_neg
    else:
        specificity = 0

    return sensitivity, specificity

# Predict labels for the held-out test features, then score those predictions
# against the true test labels.
predictions = model.predict(X_test)
sensitivity, specificity = evaluate(y_test, predictions)

# Both rates are printed rounded to two decimal places.
print(f"Sensitivity (True Positive Rate): {sensitivity:.2f}")
print(f"Specificity (True Negative Rate): {specificity:.2f}")

Sensitivity (True Positive Rate): 0.37
Specificity (True Negative Rate): 0.91


In [6]:
## 6. Report Results

# Element-wise comparison of the true test labels with the model's predictions.
y_true = np.array(y_test)
matches = y_true == predictions

# Every test sample is either a match or a miss.
correct = matches.sum()
incorrect = (~matches).sum()

print(f"Correct: {correct}")
print(f"Incorrect: {incorrect}")
print(f"Accuracy: {correct / len(y_test):.2f}")

Correct: 4100
Incorrect: 832
Accuracy: 0.83
