In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

import random

from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

from numpy import mean, std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from env import *

In [2]:
pd.set_option('display.max_columns', None)

# Loading

In [3]:
df = pd.read_json(DATASET_ONE_PV_PER_ONE_ROW, orient ='records')

In [4]:
df[:5]

Unnamed: 0,Session,indicator_fg,Page,Likert_value,Big5,time,duration,distance,x_axis_distance,y_axis_distance,real_ideal_trajectory_diff,max_deviation,velocity,x_axis_vel,y_axis_vel,acc,x_axis_acc,y_axis_acc,auc_diff,clicks,x_flips,y_flips,visits,scrolling,init_time,react_time
0,gsd4FIhsnwaOIMOtoSqX4geVy,1,3,4,1,14236,14893,4099.440429,3635.15625,1076.0,3434.874661,539.568132,0.287963,0.25535,0.075583,2e-05,1.8e-05,5e-06,50776.6875,2,8,8,1,False,261,7861
1,gsd4FIhsnwaOIMOtoSqX4geVy,1,4,5,1,2448,4529,1298.103346,1235.4375,354.0,456.372879,123.64276,0.530271,0.504672,0.144608,0.000217,0.000206,5.9e-05,161737.125,2,2,1,1,False,470,2520
2,gsd4FIhsnwaOIMOtoSqX4geVy,1,5,5,1,4051,4355,976.613565,885.84375,329.0,241.107925,18.563896,0.24108,0.218673,0.081215,6e-05,5.4e-05,2e-05,74979.609375,2,2,4,1,False,281,3316
3,gsd4FIhsnwaOIMOtoSqX4geVy,1,6,3,1,1432,6039,478.125316,256.78125,300.0,65.496732,81.348974,0.333886,0.179317,0.209497,0.000233,0.000125,0.000146,18163.40625,2,2,3,1,False,2264,3262
4,gsd4FIhsnwaOIMOtoSqX4geVy,1,7,4,1,3648,4513,645.698909,551.71875,306.0,177.745514,71.096966,0.177001,0.151239,0.083882,4.9e-05,4.1e-05,2.3e-05,91599.75,2,2,7,1,False,417,3670


In [5]:
len(df)

6944

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6944 entries, 0 to 6943
Data columns (total 26 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Session                     6944 non-null   object 
 1   indicator_fg                6944 non-null   int64  
 2   Page                        6944 non-null   int64  
 3   Likert_value                6944 non-null   object 
 4   Big5                        6944 non-null   int64  
 5   time                        6944 non-null   int64  
 6   duration                    6944 non-null   int64  
 7   distance                    6944 non-null   float64
 8   x_axis_distance             6944 non-null   float64
 9   y_axis_distance             6944 non-null   float64
 10  real_ideal_trajectory_diff  6898 non-null   float64
 11  max_deviation               6898 non-null   float64
 12  velocity                    6944 non-null   float64
 13  x_axis_vel                  6944 

In [7]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

df = df[df['Big5'] == 1]

columns_metrics = ['Likert_value', 'time', 'duration', 'distance',
       'x_axis_distance', 'y_axis_distance', 'real_ideal_trajectory_diff',
       'max_deviation', 'velocity', 'x_axis_vel', 'y_axis_vel', 'acc',
       'x_axis_acc', 'y_axis_acc', 'auc_diff', 'clicks', 'x_flips', 'y_flips',
       'visits', 'init_time', 'react_time']

# scaler = MinMaxScaler()
# scaler = StandardScaler()

# df[columns_metrics] = scaler.fit_transform(df[columns_metrics])
# df[:5]

# Random state

In [8]:
# Seed must be between 0 and 2**32 - 1

# random_state = random.randint(0, 2**32 - 1)

random_state = 569229619

print(random_state)
random.seed(random_state)

569229619


# Divide: Test & Train

In [9]:
df_to_train = df

## Select

In [10]:
def select_sessions(lst, n = 18):
    i = 0
    selected = []
    while i < n:
        element = random.choice(lst)
        if element not in selected:
            selected.append(element)
            i += 1
    return selected

In [11]:
# random test selection

fg_list_sessions = list(set(list(df[df['indicator_fg'] == 1]['Session'])))
fg_list_sessions.sort()
h_list_sessions = list(set(list(df[df['indicator_fg'] == 0]['Session'])))
h_list_sessions.sort()

fg_selected = select_sessions(fg_list_sessions)
h_selected = select_sessions(h_list_sessions)

print(fg_selected)
print(h_selected)
all_selected = fg_selected + h_selected

['rBwaeMLAX9nPiQy9BieuiFiAO', 'WX6bamzNklg6PlFfKupotk2FW', '34OsjvpQRQYY2xLKiMczMVFRE', 'VajALjCapZgxxkAzTWNDVWXOB', '903G6972fkr5z0n04P3NfFjIS', '93npiDWQK6VipNjEWLfWafqsr', 'QJpzM76mOzzx41jlZ2cD68ZVr', 'oHWOYzMTNwQN7gGNuwwrc2q58', 'aGWX89ZStczYODlw7byX9cSce', 'ynePoKNeqa1CHW0m54pd86SY1', 'bFWJvlSjZLbgznUva0d86yGwS', 'KOq8m7kgSimUpV4gORbAX574C', 'TZcocph2t67jhKfcZSdo0BBaU', 'MU3B7xrha3Nr3DmLVeeCy7vGG', 'J84AyKOw3eXAhQAE9iZYAYCxB', 'kDtZ55pukriLSh8VOnV6D5h3N', 'niEXFrlkmqE1vexhOLRNtF7uY', 'GIszPEY36M0Y6Zaq2mJQYE2Ev']
['lFkmX9UFPQu87VmX3QOtjZ3zK', 'UAytqC2udrP0JcwjdcjhhCNw8', 'KDWak467FmFZN321qc8Qumwhc', 'z5R93zFJCrmP8NAk3yypRkpiz', '8aOLee8TtAaJYBkJgJrQbqw6g', 'D5fYqE7ofhnJGZih8TLcEhdyD', 'n0OYGf6ieNDHT94qgjog5peBJ', 'B9f22juxHTJ70W0mINyPifnCJ', 'OK0LALqyDvRa2oBp6nnkjdks9', '99TUULTfZzD3H9F0bzzHus3c4', '05b1eCn35mhtHiwd3sQNGd0am', 'tseRJkAbZhFUXHnhjPWHBhHmF', 'L6IhgGIgpITGDZBBihX1d92Uv', 'pjbcxDZTIqBgwJId1zjBVZQbS', 'kWXMrfT5jl0TFxhgstlFq9PkE', 'uwMvzBSDsgBHsj50M3MLQmOaw', 'RBPuYafOHOI

In [12]:
df_to_test = df_to_train[df_to_train['Session'].isin(all_selected)]
df_to_train = df_to_train[~df_to_train['Session'].isin(all_selected)]
print(len(df_to_test), len(df_to_train), len(df), len(df_to_test)+len(df_to_train))

1728 3648 5376 5376


In [13]:
df_to_train = df_to_train[df_to_train['Big5'] == 1]
df_to_train = df_to_train.drop(["Session", "scrolling", "Big5"], axis=1)
df_to_train = df_to_train.dropna()

df_to_test = df_to_test[df_to_test['Big5'] == 1]
df_to_test = df_to_test.dropna()

result_test = df_to_test.to_json(orient="records")
with open(CONSTANT_DATASET_FOR_VALIDATION + 'df_to_test.json', 'w') as outfile:
    outfile.write(result_test)
    
result_train = df_to_train.to_json(orient="records")
with open(CONSTANT_DATASET_FOR_VALIDATION + 'df_to_train.json', 'w') as outfile:
    outfile.write(result_train) 

## Feature selection

The following code is insipred by official documentation.

In [14]:
X_train_lasso = df_to_train.drop(['indicator_fg', 'Page'], axis=1)
y_train_lasso = df_to_train['indicator_fg']

lsvc = LinearSVC(C=0.03, penalty="l1", dual=False).fit(X_train_lasso, y_train_lasso)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X_train_lasso)
X_new.shape



(3630, 12)

In [15]:
selected_features = X_train_lasso.columns[(model.get_support())]
selected_features = list(selected_features)
selected_features

['Likert_value',
 'time',
 'duration',
 'distance',
 'x_axis_distance',
 'y_axis_distance',
 'real_ideal_trajectory_diff',
 'max_deviation',
 'y_flips',
 'visits',
 'init_time',
 'react_time']

In [16]:
selected_features = ['Likert_value', 'max_deviation', 'velocity', 'x_axis_vel', 'x_axis_acc', 'auc_diff', 'y_flips', 'visits', 'init_time']

## Only t test passed features

In [17]:
df_to_test2 = df_to_test.drop(["Session", "scrolling", "Big5"], axis=1)
df_to_test2 = df_to_test[selected_features + ["indicator_fg"]]
df_to_train = df_to_train[selected_features + ["indicator_fg"]]

## Divide

In [18]:
# df_to_train = df_to_train[selected_features + ["indicator_fg"]]
# X_train, X_test, y_train, y_test = train_test_split(df_to_train.drop(["indicator_fg"], axis=1), 
#                                                     df_to_train['indicator_fg'], test_size=0.10,
#                                                    random_state=random_state)

In [19]:
X_train = df_to_train.drop(["indicator_fg"], axis=1)
X_test = df_to_test2.drop(["indicator_fg"], axis=1)
y_train = df_to_train['indicator_fg']
y_test = df_to_test2['indicator_fg']

In [20]:
result_X_train = X_train.to_json(orient="records")
with open(CONSTANT_DATASET_FOR_VALIDATION + 'X_train.json', 'w') as outfile:
    outfile.write(result_X_train)
    
result_X_test = X_test.to_json(orient="records")
with open(CONSTANT_DATASET_FOR_VALIDATION + 'X_test.json', 'w') as outfile:
    outfile.write(result_X_test)
    
result_y_train = y_train.to_json(orient="records")
with open(CONSTANT_DATASET_FOR_VALIDATION + 'y_train.json', 'w') as outfile:
    outfile.write(result_y_train)

result_y_test = y_test.to_json(orient="records")
with open(CONSTANT_DATASET_FOR_VALIDATION + 'y_test.json', 'w') as outfile:
    outfile.write(result_y_test)

# Machine Learning for metrics over **1 row = 1 pageview** dataframe

The following function is taken from my project developed on the subject Intelligent Data Analysis 2021/2022.

In [21]:
def report_generator(pred_train, pred_test, y_train, y_test, driver_silent, zero_division='warn'):
    if not driver_silent:
        print("Predicting for train dataset:")
        print(classification_report(y_train, pred_train, zero_division=zero_division))

        print("Predicting for test dataset:")
        print(classification_report(y_test, pred_test, zero_division=zero_division))
    
    report_train = classification_report(y_train, pred_train, output_dict=True, zero_division=zero_division)
    report_test = classification_report(y_test, pred_test, output_dict=True, zero_division=zero_division)
    
    return report_train, report_test

## Logistic Regression

In [22]:
X_train

Unnamed: 0,Likert_value,max_deviation,velocity,x_axis_vel,x_axis_acc,auc_diff,y_flips,visits,init_time
0,4,539.568132,0.287963,0.255350,0.000018,50776.687500,8,1,261
1,5,123.642760,0.530271,0.504672,0.000206,161737.125000,1,1,470
2,5,18.563896,0.241080,0.218673,0.000054,74979.609375,4,1,281
3,3,81.348974,0.333886,0.179317,0.000125,18163.406250,3,1,2264
4,4,71.096966,0.177001,0.151239,0.000041,91599.750000,7,1,417
...,...,...,...,...,...,...,...,...,...
6925,4,224.092544,0.227982,0.201875,0.000038,82819.848633,1,1,2174
6926,4,84.937991,0.220351,0.194159,0.000044,126346.655273,3,1,539
6927,4,107.518036,0.280394,0.225623,0.000065,127453.637695,6,1,336
6928,4,282.290524,0.160082,0.128325,0.000013,60848.583984,10,1,2201


In [23]:
from sklearn.linear_model import LogisticRegression

def LogisticRegressionDriver(X_train, X_test, y_train, y_test, driver_silent=True):
    clf = LogisticRegression(max_iter=2000000, penalty='none', random_state=random_state)
    clf.fit(X_train, y_train)
    
    pred_train = clf.predict(X_train)
    pred_test = clf.predict(X_test)
    
    return clf, *report_generator(pred_train, pred_test, y_train, y_test, driver_silent)

In [24]:
clf5, train_report5, test_report5 = LogisticRegressionDriver(X_train, X_test, y_train, y_test, driver_silent=False)

Predicting for train dataset:
              precision    recall  f1-score   support

           0       0.49      0.19      0.28      1864
           1       0.48      0.79      0.60      1766

    accuracy                           0.48      3630
   macro avg       0.49      0.49      0.44      3630
weighted avg       0.49      0.48      0.43      3630

Predicting for test dataset:
              precision    recall  f1-score   support

           0       0.49      0.20      0.29       858
           1       0.50      0.79      0.61       856

    accuracy                           0.50      1714
   macro avg       0.49      0.50      0.45      1714
weighted avg       0.49      0.50      0.45      1714



In [25]:
params = {
    'penalty' : ['l1', 'l2', 'elasticnet', 'none'], 
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'fit_intercept': [True, False]
}

In [26]:
cls = LogisticRegression(max_iter=2000000)

In [27]:
# df_cross_val = df[df['Big5'] == 1]
# df_cross_val = df_cross_val[selected_features + ["indicator_fg"]]
# df_cross_val = df_cross_val.dropna()

X = df_to_train.drop(["indicator_fg"], axis=1)
y = df_to_train["indicator_fg"]

In [28]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
gridCV = GridSearchCV(cls, params, cv=3, verbose=4, n_jobs=-1)

In [29]:
gridCV.fit(X, y)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


54 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\PeterSmrecek\Documents\BP\BP-venv\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\PeterSmrecek\Documents\BP\BP-venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\PeterSmrecek\Documents\BP\BP-venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none

GridSearchCV(cv=3, estimator=LogisticRegression(max_iter=2000000), n_jobs=-1,
             param_grid={'fit_intercept': [True, False],
                         'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga']},
             verbose=4)

In [30]:
gridCV.best_score_

0.6044077134986225

In [31]:
bestGrid = gridCV.best_estimator_
bestGrid

LogisticRegression(max_iter=2000000, penalty='l1', solver='liblinear')

In [32]:
gridPred_train = bestGrid.predict(X_train)
gridPred = bestGrid.predict(X_test)

In [33]:
print(classification_report(y_train, gridPred_train)) 

              precision    recall  f1-score   support

           0       0.64      0.59      0.62      1864
           1       0.60      0.64      0.62      1766

    accuracy                           0.62      3630
   macro avg       0.62      0.62      0.62      3630
weighted avg       0.62      0.62      0.62      3630



In [34]:
print(classification_report(y_test, gridPred)) 

              precision    recall  f1-score   support

           0       0.65      0.62      0.64       858
           1       0.64      0.67      0.65       856

    accuracy                           0.64      1714
   macro avg       0.64      0.64      0.64      1714
weighted avg       0.64      0.64      0.64      1714

