!pip3 uninstall -y torch torchvision

!pip3 install torch torchvision

In [None]:
import torch
# import the data
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import RandomOverSampler

In [None]:
import os

# Choose the PyTorch backend for Keras; this is set ahead of the
# `import keras` that appears in the next cell.
os.environ.update({"KERAS_BACKEND": "torch"})

In [None]:
import numpy as np
import keras
from keras import layers

In [None]:
def genResults(real, predicted):
    """Score binary predictions against the ground truth.

    Both arguments are 2-D arrays with one column per class (one-hot labels
    or per-class scores); every row is reduced to a class index with argmax.
    Class 1 is treated as the positive class.

    Prints the four confusion-matrix cells and returns the tuple
    (accuracy, true-positive rate, true-negative rate, F1).
    A ratio whose denominator is zero is reported as NaN.
    """
    real = np.argmax(real, axis=1)
    predicted = np.argmax(predicted, axis=1)

    # labels=[0, 1] pins a 2x2 matrix even when only one class occurs, so the
    # four-way unpacking below cannot fail on a degenerate prediction set.
    conf_matrix = confusion_matrix(y_true=real, y_pred=predicted, labels=[0, 1])
    tn, fp, fn, tp = conf_matrix.ravel()

    def ratio(numerator, denominator):
        # Avoid a divide-by-zero warning; an undefined rate is NaN.
        return numerator / denominator if denominator else float("nan")

    f1 = ratio(2 * tp, (2 * tp) + fp + fn)

    acc = ratio(tp + tn, tp + tn + fn + fp)

    # TPR (recall) is measured over the actual positives, TNR (specificity)
    # over the actual negatives. Dividing by tp + fp / tn + fn would instead
    # give precision and negative predictive value.
    tpr = ratio(tp, tp + fn)
    tnr = ratio(tn, tn + fp)

    print("true negative: ", tn)
    print("false positive: ", fp)
    print("false negative: ", fn)
    print("true positive: ", tp)

    return (acc, tpr, tnr, f1)

In [None]:
# split it like above

# do the shaping if necessary
# change the type of the columns that are characters to one-hot-encodings or something like that

# make a model with one layer
# run the model and see what happens

In [None]:
# Build the path with os.path.join so it resolves on every OS; a literal
# backslash is only a path separator on Windows.
df = pd.read_csv(os.path.join('data', 'online_shoppers_intention.csv'))

print(df)

# hyperparameters used by the model cells below
batch_size = 128
epochs = 30
dropOut = 0.5

In [None]:
# Derived features: total time on site and the share of it spent on
# product-related pages.
admin_time = df['Administrative_Duration']
info_time = df['Informational_Duration']
product_time = df['ProductRelated_Duration']

df['total_duration'] = admin_time + info_time + product_time
# A zero total yields a missing/undefined ratio here; NaNs are filled in a
# later cell.
df['proportion_of_duration'] = product_time / df['total_duration']

In [None]:
# Preview only: the result is not assigned, so `df` itself keeps these
# columns. The same columns are dropped from `df_copy` further down.
columns_to_preview_drop = [
    'Informational_Duration', 'Administrative_Duration',
    'ProductRelated_Duration', 'Administrative', 'Informational',
    'ProductRelated', 'OperatingSystems', 'Browser',
]
df.drop(columns=columns_to_preview_drop)

In [None]:
# List the distinct month labels before they are encoded
month_labels = df['Month'].unique()
print(month_labels)

In [None]:
# Encode month names as 1..12. The result is assigned back to the column:
# calling replace(..., inplace=True) on `df['Month']` is a chained in-place
# update, which does not modify `df` under pandas Copy-on-Write.
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'June',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
# astype(int) guarantees a numeric column and fails loudly if an unexpected
# month label is left over.
df['Month'] = df['Month'].replace(month_names, month_numbers).astype(int)

In [None]:
# Show the month values again after encoding
encoded_months = df['Month'].unique()
print(encoded_months)

In [None]:
# List the distinct visitor categories before they are encoded
visitor_labels = df['VisitorType'].unique()
print(visitor_labels)

In [None]:
# Encode visitor categories as integers. Assign the result back instead of
# using a chained replace(..., inplace=True), which does not update `df`
# under pandas Copy-on-Write. astype(int) guarantees a numeric column and
# fails loudly on an unexpected category.
df['VisitorType'] = df['VisitorType'].replace(
    ['Returning_Visitor', 'New_Visitor', 'Other'], [0, 1, 2]).astype(int)

In [None]:
# Work on an independent copy (DataFrame.copy is deep by default) so `df`
# stays as loaded/encoded above.
df_copy = df.copy()

In [None]:
# False/True -> 0/1. Assigned back rather than using a chained
# replace(..., inplace=True), which does not update `df_copy` under pandas
# Copy-on-Write; astype(int) also guarantees an integer dtype.
df_copy['Weekend'] = df_copy['Weekend'].astype(int)

In [None]:
# False/True -> 0/1 for the target column. Assigned back rather than using a
# chained replace(..., inplace=True), which does not update `df_copy` under
# pandas Copy-on-Write; astype(int) also guarantees an integer dtype.
df_copy['Revenue'] = df_copy['Revenue'].astype(int)

In [None]:
# Remove the raw columns that are not used as model features
unused_columns = [
    'Informational_Duration', 'Administrative_Duration',
    'ProductRelated_Duration', 'Administrative', 'Informational',
    'ProductRelated', 'OperatingSystems', 'Browser',
]
df_copy = df_copy.drop(columns=unused_columns)

In [None]:
# Replace every remaining missing value in the frame with 1, in place
df_copy.fillna(value=1, inplace=True)

In [None]:
# Plain NumPy view of the whole cleaned frame (features and the Revenue
# column together); used only by the manual split experiment below.
df_numpy = df_copy.to_numpy()

In [None]:

# Draw 70% of the row indices at random, without repetition.
# Despite its name, `testSample` is used below to select the training rows.
row_count = df_numpy.shape[0]
sample_size = math.floor(0.7 * row_count)
testSample = np.random.choice(row_count, sample_size, replace=False)

In [None]:
# Show the sampled row indices
print(testSample)

In [None]:
print(df_numpy[testSample])

train = df_numpy[testSample]

# The held-out rows are the ones NOT sampled. `~testSample` cannot be used
# for this: on an integer index array `~` is bitwise NOT (i -> -i-1), which
# picks rows counted from the end instead of the complement. Use a boolean
# mask over all rows instead.
held_out = np.ones(df_numpy.shape[0], dtype=bool)
held_out[testSample] = False
test = df_numpy[held_out]

print("Train length: " + str(train.shape[0]))
print("Test lenght: " + str(test.shape[0]))

In [None]:
# Target is the Revenue column; every other column is a feature
target_column = 'Revenue'
df_copy_y = df_copy[target_column]
df_copy_x = df_copy.drop(columns=[target_column])

In [None]:
# 70/30 train/test split with a fixed seed for repeatability
split_parts = train_test_split(
    df_copy_x, df_copy_y, test_size=0.3, random_state=42)
df_copy_x_train, df_copy_x_test, df_copy_y_train, df_copy_y_test = split_parts

In [None]:
numpy_x_train = df_copy_x_train.to_numpy()
numpy_x_test = df_copy_x_test.to_numpy()
numpy_y_train = df_copy_y_train.to_numpy()
numpy_y_test = df_copy_y_test.to_numpy()

num_classes = 2

# Add a trailing axis so every sample has shape (n_features, 1)
numpy_x_train = np.expand_dims(numpy_x_train, -1)
numpy_x_test = np.expand_dims(numpy_x_test, -1)

# Take the per-sample shape from the data instead of hard-coding (11, 1), so
# the models keep working if the feature columns change.
input_shape = numpy_x_train.shape[1:]

print("x_train shape:", numpy_x_train.shape)
print(numpy_x_train.shape[0], "train samples")
print(numpy_x_test.shape[0], "test samples")

# convert class vectors to binary class matrices
numpy_y_train = keras.utils.to_categorical(numpy_y_train, num_classes)
numpy_y_test = keras.utils.to_categorical(numpy_y_test, num_classes)

print("y_train shape: ", numpy_y_train.shape)

In [None]:
# Baseline network: one hidden Dense layer of width 10, then flatten,
# dropout and a softmax output over the classes.
# NOTE(review): with a (n_features, 1) input the hidden Dense presumably acts
# on the trailing axis only - confirm against the summary output.
baseline_layers = [
    keras.Input(shape=input_shape),
    layers.Dense(units=10, activation="relu"),
    layers.Flatten(),
    layers.Dropout(rate=dropOut),
    layers.Dense(units=num_classes, activation="softmax"),
]
model = keras.Sequential(baseline_layers)

model.summary()

In [None]:
# Train the baseline model; 10% of the training rows are held back for
# validation.
# NOTE(review): mean squared error on softmax outputs is unusual for
# classification - confirm this loss is intended.
model.compile(optimizer="adam", loss="mean_squared_error", metrics=["accuracy"])

model.fit(
    x=numpy_x_train,
    y=numpy_y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

In [None]:
# Class scores of the baseline model on the test set
predicted = model.predict(x=numpy_x_test)

In [None]:
# Dump shape and contents of the true labels and of the predictions
for caption, values in (("numpy_y_test shape: ", numpy_y_test),
                        ("predicted: ", predicted)):
    print(caption, values.shape)
    print(values)

In [None]:
# Evaluate the baseline model (the "10" refers to the hidden Dense width)
acc, tpr, tnr, f1 = genResults(numpy_y_test, predicted)

print("10 layers:")
for caption, value in (("Accuracy: ", acc),
                       ("True-positive rate: ", tpr),
                       ("True-negative rate: ", tnr),
                       ("F1 Score: ", f1)):
    print(caption, value)

In [None]:
# Same architecture as the baseline, with a hidden Dense width of 20
layers_20 = [
    keras.Input(shape=input_shape),
    layers.Dense(units=20, activation="relu"),
    layers.Flatten(),
    layers.Dropout(rate=dropOut),
    layers.Dense(units=num_classes, activation="softmax"),
]
model20 = keras.Sequential(layers_20)

model20.summary()

In [None]:
# Train the width-20 model with the same settings as the baseline
model20.compile(optimizer="adam", loss="mean_squared_error", metrics=["accuracy"])

model20.fit(
    x=numpy_x_train,
    y=numpy_y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

In [None]:
# Class scores of the width-20 model on the test set
predicted20 = model20.predict(x=numpy_x_test)

In [None]:
# Dump shape and contents of the true labels and of the width-20 predictions
print("numpy_y_test shape: ", numpy_y_test.shape)
print(numpy_y_test)
print("predicted: ", predicted20.shape)
# Print the same array whose shape was just reported; this previously
# printed the baseline model's `predicted` by mistake.
print(predicted20)

In [None]:
# Evaluate the width-20 model
acc20, tpr20, tnr20, f120 = genResults(numpy_y_test, predicted20)

print("20 layers:")
for caption, value in (("Accuracy: ", acc20),
                       ("True-positive rate: ", tpr20),
                       ("True-negative rate: ", tnr20),
                       ("F1 Score: ", f120)):
    print(caption, value)

In [None]:
# Same architecture as the baseline, with a hidden Dense width of 40
layers_40 = [
    keras.Input(shape=input_shape),
    layers.Dense(units=40, activation="relu"),
    layers.Flatten(),
    layers.Dropout(rate=dropOut),
    layers.Dense(units=num_classes, activation="softmax"),
]
model40 = keras.Sequential(layers_40)

model40.summary()

In [None]:
# Train the width-40 model with the same settings as the baseline
model40.compile(optimizer="adam", loss="mean_squared_error", metrics=["accuracy"])

model40.fit(
    x=numpy_x_train,
    y=numpy_y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

In [None]:
# Class scores of the width-40 model on the test set
predicted40 = model40.predict(x=numpy_x_test)

In [None]:
# Evaluate the width-40 model
acc40, tpr40, tnr40, f140 = genResults(numpy_y_test, predicted40)

print("40 layers:")
for caption, value in (("Accuracy: ", acc40),
                       ("True-positive rate: ", tpr40),
                       ("True-negative rate: ", tnr40),
                       ("F1 Score: ", f140)):
    print(caption, value)

In [None]:
# now we try oversampling
ros = RandomOverSampler(random_state=42)

# Whole-dataset resample, kept so any later cell that reads these names
# still finds them.
df_copy_x_resampled, df_copy_y_resampled = ros.fit_resample(df_copy_x, df_copy_y)

# Oversample the TRAINING split only. Resampling before splitting puts
# duplicated minority rows into both train and test, which leaks test data
# into training and inflates every metric. The test set stays the original,
# un-resampled 30% split, so results are comparable with the models above.
df_copy_x_resampled_train, df_copy_y_resampled_train = ros.fit_resample(
    df_copy_x_train, df_copy_y_train)
df_copy_x__resampled_test = df_copy_x_test
df_copy_y_resampled_test = df_copy_y_test

In [None]:
numpy_x_resampled_train = df_copy_x_resampled_train.to_numpy()
numpy_x_resampled_test = df_copy_x__resampled_test.to_numpy()
numpy_y_resampled_train = df_copy_y_resampled_train.to_numpy()
numpy_y_resampled_test = df_copy_y_resampled_test.to_numpy()

num_classes = 2

# Add a trailing axis so every sample has shape (n_features, 1)
numpy_x_resampled_train = np.expand_dims(numpy_x_resampled_train, -1)
numpy_x_resampled_test = np.expand_dims(numpy_x_resampled_test, -1)

# Take the per-sample shape from the data instead of hard-coding (11, 1), so
# the models keep working if the feature columns change.
input_shape = numpy_x_resampled_train.shape[1:]

print("x_train shape:", numpy_x_resampled_train.shape)
print(numpy_x_resampled_train.shape[0], "train samples")
print(numpy_x_resampled_test.shape[0], "test samples")

# convert class vectors to binary class matrices
numpy_y_resampled_train = keras.utils.to_categorical(numpy_y_resampled_train, num_classes)
numpy_y_resampled_test = keras.utils.to_categorical(numpy_y_resampled_test, num_classes)

print("y_train shape: ", numpy_y_resampled_train.shape)

In [None]:
# Width-10 network for the oversampled data
resampled_layers_10 = [
    keras.Input(shape=input_shape),
    layers.Dense(units=10, activation="relu"),
    layers.Flatten(),
    layers.Dropout(rate=dropOut),
    layers.Dense(units=num_classes, activation="softmax"),
]
model_r = keras.Sequential(resampled_layers_10)

model_r.summary()

In [None]:
# Train the width-10 model on the oversampled training data
model_r.compile(optimizer="adam", loss="mean_squared_error", metrics=["accuracy"])

model_r.fit(
    x=numpy_x_resampled_train,
    y=numpy_y_resampled_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

In [None]:
# Class scores of the oversampled width-10 model on its test set
predicted_r = model_r.predict(x=numpy_x_resampled_test)

In [None]:
# Evaluate the oversampled width-10 model
acc_r, tpr_r, tnr_r, f1_r = genResults(numpy_y_resampled_test, predicted_r)

print("10 layers:")
for caption, value in (("Accuracy: ", acc_r),
                       ("True-positive rate: ", tpr_r),
                       ("True-negative rate: ", tnr_r),
                       ("F1 Score: ", f1_r)):
    print(caption, value)

In [None]:
# Width-20 network for the oversampled data
resampled_layers_20 = [
    keras.Input(shape=input_shape),
    layers.Dense(units=20, activation="relu"),
    layers.Flatten(),
    layers.Dropout(rate=dropOut),
    layers.Dense(units=num_classes, activation="softmax"),
]
model_r_20 = keras.Sequential(resampled_layers_20)

model_r_20.summary()

In [None]:
# Train the width-20 model on the oversampled training data
model_r_20.compile(optimizer="adam", loss="mean_squared_error", metrics=["accuracy"])

model_r_20.fit(
    x=numpy_x_resampled_train,
    y=numpy_y_resampled_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

In [None]:
# Class scores of the oversampled width-20 model on its test set
predicted_r_20 = model_r_20.predict(x=numpy_x_resampled_test)

In [None]:
# Evaluate the oversampled width-20 model
acc_r_20, tpr_r_20, tnr_r_20, f1_r_20 = genResults(numpy_y_resampled_test, predicted_r_20)

print("20 layers:")
for caption, value in (("Accuracy: ", acc_r_20),
                       ("True-positive rate: ", tpr_r_20),
                       ("True-negative rate: ", tnr_r_20),
                       ("F1 Score: ", f1_r_20)):
    print(caption, value)

In [None]:
# Width-40 network for the oversampled data
resampled_layers_40 = [
    keras.Input(shape=input_shape),
    layers.Dense(units=40, activation="relu"),
    layers.Flatten(),
    layers.Dropout(rate=dropOut),
    layers.Dense(units=num_classes, activation="softmax"),
]
model_r_40 = keras.Sequential(resampled_layers_40)

model_r_40.summary()

In [None]:
# Train the width-40 model on the oversampled training data
model_r_40.compile(optimizer="adam", loss="mean_squared_error", metrics=["accuracy"])

model_r_40.fit(
    x=numpy_x_resampled_train,
    y=numpy_y_resampled_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

In [None]:
# Class scores of the oversampled width-40 model on its test set
predicted_r_40 = model_r_40.predict(x=numpy_x_resampled_test)

In [None]:
# Evaluate the oversampled width-40 model
acc_r_40, tpr_r_40, tnr_r_40, f1_r_40 = genResults(numpy_y_resampled_test, predicted_r_40)

print("40 layers:")
for caption, value in (("Accuracy: ", acc_r_40),
                       ("True-positive rate: ", tpr_r_40),
                       ("True-negative rate: ", tnr_r_40),
                       ("F1 Score: ", f1_r_40)):
    print(caption, value)

In [None]:
# Repeat the oversampling experiment (20-node model) with a much smaller
# training fraction: 20% train, 80% test.
# Split the ORIGINAL data first and oversample the training part only.
# Splitting an already-oversampled dataset lets duplicated minority rows land
# in both train and test, which leaks test data and inflates the metrics.
small_x_train_raw, small_x_resampled_test, small_y_train_raw, small_y_resampled_test = train_test_split(
    df_copy_x, df_copy_y, test_size=0.8, random_state=42)
small_x_resampled_train, small_y_resampled_train = RandomOverSampler(
    random_state=42).fit_resample(small_x_train_raw, small_y_train_raw)

numpy_small_x_train = small_x_resampled_train.to_numpy()
numpy_small_x_test = small_x_resampled_test.to_numpy()
numpy_small_y_train = small_y_resampled_train.to_numpy()
numpy_small_y_test = small_y_resampled_test.to_numpy()

num_classes = 2

# Add a trailing axis so every sample has shape (n_features, 1)
numpy_small_x_train = np.expand_dims(numpy_small_x_train, -1)
numpy_small_x_test = np.expand_dims(numpy_small_x_test, -1)

# Take the per-sample shape from the data instead of hard-coding (11, 1), so
# the model keeps working if the feature columns change.
input_shape = numpy_small_x_train.shape[1:]

print("x_train shape:", numpy_small_x_train.shape)
print(numpy_small_x_train.shape[0], "train samples")
print(numpy_small_x_test.shape[0], "test samples")

# convert class vectors to binary class matrices
numpy_small_y_train = keras.utils.to_categorical(numpy_small_y_train, num_classes)
numpy_small_y_test = keras.utils.to_categorical(numpy_small_y_test, num_classes)

print("y_train shape: ", numpy_small_y_train.shape)

In [None]:
# Width-20 network for the small-training-fraction experiment
small_layers_20 = [
    keras.Input(shape=input_shape),
    layers.Dense(units=20, activation="relu"),
    layers.Flatten(),
    layers.Dropout(rate=dropOut),
    layers.Dense(units=num_classes, activation="softmax"),
]
model_small_r_20 = keras.Sequential(small_layers_20)

model_small_r_20.summary()

In [None]:
# Train the width-20 model on the small training fraction
model_small_r_20.compile(optimizer="adam", loss="mean_squared_error", metrics=["accuracy"])

model_small_r_20.fit(
    x=numpy_small_x_train,
    y=numpy_small_y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

In [None]:
# Class scores of the small-fraction model on its (large) test set
predicted_r_small_20 = model_small_r_20.predict(x=numpy_small_x_test)

In [None]:
# Compare the number of test labels with the number of predictions
for values in (small_y_resampled_test, predicted_r_small_20):
    print(values.shape)

In [None]:
# Evaluate the small-fraction width-20 model
acc_r_small_20, tpr_r_small_20, tnr_r_small_20, f1_r_small_20 = genResults(
    numpy_small_y_test, predicted_r_small_20)

print("Small resampled 20 layers:")
for caption, value in (("Accuracy: ", acc_r_small_20),
                       ("True-positive rate: ", tpr_r_small_20),
                       ("True-negative rate: ", tnr_r_small_20),
                       ("F1 Score: ", f1_r_small_20)):
    print(caption, value)