In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw shopping-session data; the path is relative to the notebook's
# working directory.
df = pd.read_csv('shopping.csv')

# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [3]:
df.VisitorType.value_counts()

Returning_Visitor    10551
New_Visitor           1694
Other                   85
Name: VisitorType, dtype: int64

In [4]:
# Pull the target column out of the frame and encode it as a plain list of
# 0/1 integers (1 when Revenue is truthy).
# NOTE: `pop` removes 'Revenue' from `df` in place, so this cell cannot be
# re-run without first reloading the CSV (a second run raises KeyError).
labels = df.pop('Revenue').map(lambda x: 1 if x else 0).values.tolist()

In [5]:
type(labels)

list

In [6]:
df.dtypes

Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
VisitorType                 object
Weekend                       bool
dtype: object

In [7]:
# Weekend, an integer 0 (if false) or 1 (if true)
df.Weekend = df.Weekend.map(lambda x: 1 if x else 0)

# VisitorType, an integer 0 (not returning) or 1 (returning).
# Test for 'Returning_Visitor' explicitly instead of excluding the other two
# categories: with the exclusion form, a missing value or any unexpected
# category would silently be counted as a returning visitor.
# NOTE: this expects the raw string column, so reload `df` before re-running.
df.VisitorType = (df.VisitorType == 'Returning_Visitor').astype('int64')

In [8]:
df.dtypes

Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
VisitorType                  int64
Weekend                      int64
dtype: object

In [9]:
df.VisitorType.value_counts()

1    10551
0     1779
Name: VisitorType, dtype: int64

In [10]:
df.Month.value_counts()

May     3364
Nov     2998
Mar     1907
Dec     1727
Oct      549
Sep      448
Aug      433
Jul      432
June     288
Feb      184
Name: Month, dtype: int64

In [11]:
# Month, an index from 0 (January) to 11 (December).
# Keys follow the spelling used in the CSV's Month column, which is why June
# is written out in full while the other months are three-letter abbreviations.
months = dict(zip(
    ["Jan", "Feb", "Mar", "Apr", "May", "June",
     "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
    range(12),
))

In [12]:
# Swap each month name for its index from `months`, then make the dtype
# explicit. Values that are not keys of `months` pass through `replace`
# unchanged, so re-running this cell on the already-encoded column is a no-op.
df.Month = df.Month.replace(months).astype('int64')

In [13]:
df.dtypes

Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                        int64
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
VisitorType                  int64
Weekend                      int64
dtype: object

In [14]:
df.Month.value_counts()

4     3364
10    2998
2     1907
11    1727
9      549
8      448
7      433
6      432
5      288
1      184
Name: Month, dtype: int64

In [15]:
# Turn every row of the frame into its own list. Going through itertuples
# (rather than df.values) keeps each column's own type, so integer columns
# stay integers next to the float columns.
evidences = [list(row) for row in df.itertuples(index=False)]

In [16]:
evidences[0], labels[0]

([0, 0.0, 0, 0.0, 1, 0.0, 0.2, 0.2, 0.0, 0.0, 1, 1, 1, 1, 1, 1, 0], 0)

In [17]:
def load_data(filename):
    """
    Load shopping data from a CSV file `filename` and convert into a list of
    evidence lists and a list of labels. Return a tuple (evidence, labels).

    evidence is a list of lists, one per CSV row, holding every column
    except Revenue in the order the columns appear in the file. For the
    shopping dataset that is:
        - Administrative, an integer
        - Administrative_Duration, a floating point number
        - Informational, an integer
        - Informational_Duration, a floating point number
        - ProductRelated, an integer
        - ProductRelated_Duration, a floating point number
        - BounceRates, a floating point number
        - ExitRates, a floating point number
        - PageValues, a floating point number
        - SpecialDay, a floating point number
        - Month, an index from 0 (January) to 11 (December)
        - OperatingSystems, an integer
        - Browser, an integer
        - Region, an integer
        - TrafficType, an integer
        - VisitorType, an integer 0 (not returning) or 1 (returning)
        - Weekend, an integer 0 (if false) or 1 (if true)

    labels is the corresponding list of labels, where each label
    is 1 if Revenue is true, and 0 otherwise.

    Raises ValueError if the Month column holds a value that is not a
    recognised month name.
    """

    # Month name -> index from 0 (January) to 11 (December). The dataset
    # spells June out in full; the three-letter form is accepted as well.
    month_index = {
        "Jan": 0, "Feb": 1, "Mar": 2, "Apr": 3, "May": 4,
        "Jun": 5, "June": 5,
        "Jul": 6, "Aug": 7, "Sep": 8, "Oct": 9, "Nov": 10, "Dec": 11,
    }

    # load dataframe from .csv file
    df = pd.read_csv(filename)

    # Remove the target column from the frame and encode it as 0/1 ints.
    labels = df.pop('Revenue').map(lambda x: 1 if x else 0).tolist()

    # Weekend, an integer 0 (if false) or 1 (if true)
    df['Weekend'] = df['Weekend'].map(lambda x: 1 if x else 0)

    # VisitorType, an integer 0 (not returning) or 1 (returning).
    # Compare against 'Returning_Visitor' directly so that missing values and
    # unexpected categories fall into "not returning" instead of being
    # silently counted as returning visitors.
    df['VisitorType'] = (df['VisitorType'] == 'Returning_Visitor').astype('int64')

    # Month names -> indices. `map` leaves NaN wherever a value is not in
    # the table, which lets us report the offending values by name.
    month_codes = df['Month'].map(month_index)
    unknown = month_codes.isna()
    if unknown.any():
        bad_values = sorted(set(df.loc[unknown, 'Month'].astype(str)))
        raise ValueError(
            f"Unrecognized Month value(s) in {filename!r}: {bad_values}"
        )
    df['Month'] = month_codes.astype('int64')

    # Convert rows to lists; itertuples keeps each column's own type
    # (ints stay ints) instead of upcasting the whole row to float.
    evidence = [list(row) for row in df.itertuples(index=False, name=None)]

    return evidence, labels

In [18]:
# Rebuild the dataset through load_data; this rebinds `labels`, which was
# first created in the exploratory cell that popped 'Revenue' from `df`.
evidence, labels = load_data('shopping.csv')

In [19]:
print(f"Evidence: {evidence[0]}\nLabel: {labels[0]}")

Evidence: [0, 0.0, 0, 0.0, 1, 0.0, 0.2, 0.2, 0.0, 0.0, 1, 1, 1, 1, 1, 1, 0]
Label: 0


In [20]:
from sklearn.neighbors import KNeighborsClassifier

In [21]:
def train_model(evidence, labels):
    """
    Fit and return a k-nearest-neighbour classifier with k=1.

    `evidence` is a list of evidence lists and `labels` the matching list
    of labels; the returned model has been trained on exactly that data.
    """
    classifier = KNeighborsClassifier(n_neighbors=1)
    # fit() trains in place and hands back the same estimator instance.
    fitted = classifier.fit(evidence, labels)
    return fitted

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
# Hold out 40% of the sessions for evaluation. Pinning `random_state` makes
# the shuffle, and therefore the sensitivity/specificity reported below,
# reproducible across "Restart & Run All"; without it every run produces a
# different split.
X_train, X_test, y_train, y_test = train_test_split(
        evidence, labels, test_size=0.4, random_state=42
    )

In [24]:
type(y_test)

list

In [25]:
# Fit the k=1 nearest-neighbour classifier on the training split only.
model = train_model(X_train, y_train)

In [26]:
# Predict a label for every held-out row; compared against y_test below.
predictions = model.predict(X_test)

In [27]:
def evaluate(labels, predictions):
    """
    Given a list of actual labels and a list of predicted labels,
    return a tuple (sensitivity, specificity).

    Assume each label is either a 1 (positive) or 0 (negative).

    `sensitivity` is a floating-point value from 0 to 1
    representing the "true positive rate": the proportion of
    actual positive labels that were accurately identified.

    `specificity` is a floating-point value from 0 to 1
    representing the "true negative rate": the proportion of
    actual negative labels that were accurately identified.

    When `labels` contains no positives (or no negatives) the matching
    rate is undefined; 0.0 is returned for it instead of raising
    ZeroDivisionError.
    """
    positives = negatives = 0
    true_positives = true_negatives = 0

    # Single pass over the (actual, predicted) pairs, tallying each class
    # and how many of its members were predicted correctly.
    for label, prediction in zip(labels, predictions):
        if label == 1:
            positives += 1
            if prediction == 1:
                true_positives += 1
        elif label == 0:
            negatives += 1
            if prediction == 0:
                true_negatives += 1

    # Guard the divisions: a test split can end up with only one class.
    sensitivity = true_positives / positives if positives else 0.0
    specificity = true_negatives / negatives if negatives else 0.0

    return sensitivity, specificity

In [28]:
evaluate(y_test, predictions)

(0.4018087855297158, 0.9064454064454065)