In [1]:

import pandas as pd
import os
from dotenv import load_dotenv
import plotly.express as px
from collections import Counter
from scipy.stats import chi2_contingency
import scipy.stats as stats
from urllib.parse import urlparse
from scipy.stats import kruskal
from scipy.stats import levene

In [2]:
# Load environment variables via python-dotenv, then read the folder that
# holds the transformed Excel workbooks.
load_dotenv()
data_folder = os.getenv('OUTPUT_FOLDER')

# Fail fast: os.getenv returns None for an unset variable, which would only
# surface later as a TypeError when the folder is concatenated with a file name.
if data_folder is None:
    raise RuntimeError(
        "OUTPUT_FOLDER is not set; define it in the environment or in a .env file."
    )

In [3]:
def import_data(data_folder, file, user):
    """Load every sheet of a transformed Excel workbook into a dict of DataFrames.

    Parameters
    ----------
    data_folder : str
        Folder that contains the workbook.
    file : str
        Workbook file name; it is concatenated directly onto ``data_folder``,
        so it is expected to carry its own leading path separator.
    user : str
        Label of the user group. When it is one of ``'human'``, ``'adv_bot'``
        or ``'mod_bot'`` it is written to a ``target`` column on every sheet;
        any other value leaves the frames without a ``target`` column.

    Returns
    -------
    dict
        Maps each sheet name (exactly as stored in the workbook) to its DataFrame.
    """
    known_users = ('human', 'adv_bot', 'mod_bot')

    # Create an empty dictionary to store the DataFrames
    dct = {}

    # Plain concatenation is kept on purpose: callers pass `file` with a leading
    # '/', which os.path.join would interpret as an absolute path.
    # The context manager guarantees the workbook handle is closed, even on error.
    with pd.ExcelFile(data_folder + file) as workbook:
        for sheet in workbook.sheet_names:
            # Look the sheet up by its exact name; stripping whitespace here
            # would make a sheet with a padded name impossible to find.
            df = pd.read_excel(workbook, sheet_name=sheet)
            # Remove the stored index column when present, without failing on
            # sheets that were written without it.
            df = df.drop(columns=['Unnamed: 0'], errors='ignore')
            # add target column
            if user in known_users:
                df['target'] = user
            # Store the DataFrame in the dictionary with the sheet name as the key
            dct[sheet] = df
    return dct

In [4]:
# Workbook file name for each user-group label, in load order
# (humans, moderate bots, advanced bots).
workbook_by_user = {
    'human': '/humans_transformed_data.xlsx',
    'mod_bot': '/mod_bot_transformed_data.xlsx',
    'adv_bot': '/adv_bot_transformed_data.xlsx',
}

# One {sheet name: DataFrame} dictionary per user group.
humans_dct, mod_dct, adv_dct = (
    import_data(data_folder, workbook, user_label)
    for user_label, workbook in workbook_by_user.items()
)

In [5]:
# Preview the frame loaded for one human workbook sheet; the dictionary key is
# the sheet name under which import_data stored it.
humans_dct['5f0221209dbc2279e9e1db9d']

Unnamed: 0,_id,session_id,unique_id,mousemove_client_height_width,mousemove_times,mousemove_type,mousemove_height_width,mousemove_visited_urls,target
0,5f0221209dbc2279e9e1db9d,koftcfkosigp4f2ibgfi0dm0co,694842305985,"(938, 1920)",1593975071137,m,"(456,701)",http://160.40.52.164/,human
1,5f0221209dbc2279e9e1db9d,koftcfkosigp4f2ibgfi0dm0co,694842305985,"(938, 1920)",1593975071228,m,"(451,700)",http://160.40.52.164/,human
2,5f0221209dbc2279e9e1db9d,koftcfkosigp4f2ibgfi0dm0co,694842305985,"(938, 1920)",1593975071247,m,"(436,707)",http://160.40.52.164/,human
3,5f0221209dbc2279e9e1db9d,koftcfkosigp4f2ibgfi0dm0co,694842305985,"(938, 1920)",1593975071261,m,"(414,718)",http://160.40.52.164/,human
4,5f0221209dbc2279e9e1db9d,koftcfkosigp4f2ibgfi0dm0co,694842305985,"(938, 1920)",1593975071278,m,"(404,731)",http://160.40.52.164/,human
...,...,...,...,...,...,...,...,...,...
34334,5f0221209dbc2279e9e1db9d,koftcfkosigp4f2ibgfi0dm0co,694842305985,"(938, 1920)",1593976648176,m,"(454,919)",http://160.40.52.164/content/research.php,human
34335,5f0221209dbc2279e9e1db9d,koftcfkosigp4f2ibgfi0dm0co,694842305985,"(938, 1920)",1593976648193,m,"(454,921)",http://160.40.52.164/content/research.php,human
34336,5f0221209dbc2279e9e1db9d,koftcfkosigp4f2ibgfi0dm0co,694842305985,"(938, 1920)",1593976648210,m,"(454,924)",http://160.40.52.164/content/research.php,human
34337,5f0221209dbc2279e9e1db9d,koftcfkosigp4f2ibgfi0dm0co,694842305985,"(938, 1920)",1593976648227,m,"(454,932)",http://160.40.52.164/content/research.php,human


In [20]:
# Keep only the rows of the same sheet whose event type is 'cl'
# (presumably click events -- confirm against the data dictionary).
# NOTE(review): in the saved outputs these rows show mousemove_times values such
# as 975082250 while the 'm' rows show 1593975071137 -- the two event types look
# like they use different timestamp scales; verify upstream.
session_events = humans_dct['5f0221209dbc2279e9e1db9d']
session_events.loc[session_events['mousemove_type'] == 'cl']

Unnamed: 0,_id,session_id,unique_id,mousemove_client_height_width,mousemove_times,mousemove_type,mousemove_height_width,mousemove_visited_urls,target
238,5f0221209dbc2279e9e1db9d,koftcfkosigp4f2ibgfi0dm0co,694842305985,"(938, 1920)",975082250,cl,"(104,190)",http://160.40.52.164/,human
2409,5f0221209dbc2279e9e1db9d,koftcfkosigp4f2ibgfi0dm0co,694842305985,"(938, 1920)",975201717,cl,"(275,579)",http://160.40.52.164/content/big_data.php,human
2410,5f0221209dbc2279e9e1db9d,koftcfkosigp4f2ibgfi0dm0co,694842305985,"(938, 1920)",975201897,cl,"(275,579)",http://160.40.52.164/content/big_data.php,human
2453,5f0221209dbc2279e9e1db9d,koftcfkosigp4f2ibgfi0dm0co,694842305985,"(938, 1920)",975203008,cl,"(576,520)",http://160.40.52.164/content/big_data.php,human
11988,5f0221209dbc2279e9e1db9d,koftcfkosigp4f2ibgfi0dm0co,694842305985,"(938, 1920)",975708559,cl,"(215,598)",http://160.40.52.164/content/big_data/data_ana...,human
...,...,...,...,...,...,...,...,...,...
32399,5f0221209dbc2279e9e1db9d,koftcfkosigp4f2ibgfi0dm0co,694842305985,"(938, 1920)",976595803,cl,"(281,584)",http://160.40.52.164/content/research.php,human
33657,5f0221209dbc2279e9e1db9d,koftcfkosigp4f2ibgfi0dm0co,694842305985,"(938, 1920)",976629976,cl,"(996,262)",http://160.40.52.164/content/research/operatio...,human
33933,5f0221209dbc2279e9e1db9d,koftcfkosigp4f2ibgfi0dm0co,694842305985,"(938, 1920)",976635743,cl,"(1582,277)",http://160.40.52.164/content/research/operatio...,human
34221,5f0221209dbc2279e9e1db9d,koftcfkosigp4f2ibgfi0dm0co,694842305985,"(938, 1920)",976641519,cl,"(1191,282)",http://160.40.52.164/content/research/operatio...,human


We strip the domain from every URL and keep only the path, because the domain must not influence the judgement of whether a URL visit was made by a bot or not: in this experiment, different users were already assigned different domains by design.

In [7]:
def merge_url_columns(dct):
    """Collect the visited URL paths of all DataFrames into one flat list.

    For every frame in ``dct`` the 'mousemove_visited_urls' column is read,
    each URL is reduced to its path component (scheme, host, query and
    fragment are discarded) and leading slashes are removed.

    Parameters
    ----------
    dct : dict
        Maps arbitrary keys to DataFrames that have a
        'mousemove_visited_urls' column.

    Returns
    -------
    list of str
        One path per row, in dictionary and row order. Rows with a missing URL
        and rows that point only at the root of the domain are left out.
    """
    merged_list = []
    for df in dct.values():  # Iterate over DataFrames in the dictionary
        # Missing values (NaN/None) cannot be parsed as URLs, so skip them
        # instead of letting urlparse fail on a non-string.
        for url in df['mousemove_visited_urls'].dropna():
            path = urlparse(url).path.lstrip('/')
            # We remove instances where only the root domain is visited.
            if path != '':
                merged_list.append(path)
    return merged_list

In [8]:
# Flatten each user group's sheets into a single list of visited URL paths.
humans_url_lst, mod_url_lst, adv_url_lst = map(
    merge_url_columns, (humans_dct, mod_dct, adv_dct)
)

In [9]:
# Tally how often each URL path occurs across the human sessions
human_counts = Counter(humans_url_lst)
human_total_occurrences = sum(human_counts.values())

# One row per URL path: raw count plus its share of all human occurrences (in %)
human_url_cnt_df = pd.DataFrame(human_counts.items(), columns=['URL', 'Count'])
human_url_cnt_df['% of All Visits'] = (
    human_url_cnt_df['Count'] / human_total_occurrences * 100
).round(2)

# Show the ten most frequent paths
human_url_cnt_df.sort_values(by='Count', ascending=False).head(10)



Unnamed: 0,URL,Count,% of All Visits
0,content/big_data.php,58039,9.53
13,content/cryptocurrency.php,41603,6.83
19,content/web_bots.php,37434,6.14
14,content/machine_learning.php,31456,5.16
42,content/computer_security/vulnerabilities.php,30875,5.07
2,content/big_data/data_analysis.php,28459,4.67
17,content/research.php,22952,3.77
36,content/big_data/rdbms.php,20201,3.32
12,content/cryptography.php,19567,3.21
9,content/data_management.php,18453,3.03


In [10]:
# Tally how often each URL path occurs across the moderate-bot sessions
mod_counts = Counter(mod_url_lst)
mod_total_occurrences = sum(mod_counts.values())

# One row per URL path: raw count plus its share of all moderate-bot occurrences (in %)
mod_url_cnt_df = pd.DataFrame(mod_counts.items(), columns=['URL', 'Count'])
mod_url_cnt_df['% of All Visits'] = (
    mod_url_cnt_df['Count'] / mod_total_occurrences * 100
).round(2)

# Show the ten most frequent paths
mod_url_cnt_df.sort_values(by='Count', ascending=False).head(10)

Unnamed: 0,URL,Count,% of All Visits
19,content/big_data.php,66353,9.28
7,content/web_bots.php,58805,8.22
5,content/computer_security.php,56640,7.92
12,content/machine_learning.php,49190,6.88
8,content/cryptography.php,41497,5.8
14,content/cryptocurrency.php,38309,5.36
11,content/computer_networks.php,32428,4.53
3,content/data_management.php,31777,4.44
0,index.php,25607,3.58
1,content/software_engineering.php,23707,3.31


In [11]:
# Tally how often each URL path occurs across the advanced-bot sessions
adv_counts = Counter(adv_url_lst)
adv_total_occurrences = sum(adv_counts.values())

# One row per URL path: raw count plus its share of all advanced-bot occurrences (in %)
adv_url_cnt_df = pd.DataFrame(adv_counts.items(), columns=['URL', 'Count'])
adv_url_cnt_df['% of All Visits'] = (
    adv_url_cnt_df['Count'] / adv_total_occurrences * 100
).round(2)

# Show the ten most frequent paths
adv_url_cnt_df.sort_values(by='Count', ascending=False).head(10)

Unnamed: 0,URL,Count,% of All Visits
5,content/cryptocurrency.php,95751,8.2
11,content/web_bots.php,94135,8.06
6,content/machine_learning.php,91989,7.88
0,content/big_data.php,86542,7.41
18,content/cryptography.php,85446,7.32
8,content/research.php,77875,6.67
14,content/data_management.php,75867,6.5
2,content/computer_networks.php,73279,6.28
23,content/software_engineering.php,67490,5.78
1,content/computer_security.php,39698,3.4


In [12]:
# Tag every per-URL count table with the user group it belongs to, so the
# tables can still be told apart once they are stacked.
for cnt_df, user_label in (
    (human_url_cnt_df, 'human'),
    (mod_url_cnt_df, 'mod_bot'),
    (adv_url_cnt_df, 'adv_bot'),
):
    cnt_df['user'] = user_label

In [13]:
# Stack the three per-group count tables row-wise; each table keeps its own
# row index, so index labels repeat across groups.
per_group_counts = [human_url_cnt_df, mod_url_cnt_df, adv_url_cnt_df]
all_cnt = pd.concat(per_group_counts, axis=0)

We test whether the per-URL visit counts differ significantly between the user groups using a one-way ANOVA.

In [14]:
# Collect the per-URL 'Count' values of each 'user' group as one array per group
grouped = [counts.values for _, counts in all_cnt.groupby('user')['Count']]

# One-way ANOVA across the groups
f_statistic, p_value = stats.f_oneway(*grouped)

print("F-statistic:", f_statistic)
print("P-value:", p_value)

# Report the outcome at the 5% significance level
alpha = 0.05
message = (
    "There is a significant difference between user groups."
    if p_value < alpha
    else "There is no significant difference between user groups."
)
print(message)

F-statistic: 3.5500139418902164
P-value: 0.030108714253593766
There is a significant difference between user groups.


However, because ANOVA assumes that the counts in each group are normally distributed, which may not hold here, we repeat the comparison with the non-parametric Kruskal-Wallis test.

In [15]:
# Per-URL counts of each user group as separate Series:
# group1 = humans, group2 = moderate bots, group3 = advanced bots
group1, group2, group3 = (
    all_cnt.loc[all_cnt['user'] == user_label, 'Count']
    for user_label in ('human', 'mod_bot', 'adv_bot')
)

# Kruskal-Wallis H-test across the three groups
stat, p_value = kruskal(group1, group2, group3)
print(f"Kruskal-Wallis p-value: {p_value}")

Kruskal-Wallis p-value: 0.05599479891304758


We use Levene's test to check whether the variances of the three groups differ significantly.

In [16]:
# Levene's test for equal variances across the three groups.
# center='median' is scipy's default, spelled out here for the reader.
stat, p_value = levene(group1, group2, group3, center='median')
print(f"Levene's test p-value: {p_value}")

Levene's test p-value: 0.03552386543388538


In [17]:

from scipy.stats import ttest_ind

# Pairwise comparisons of the per-URL counts. ttest_ind with equal_var=False is
# Welch's t-test (two groups, unequal variances), not Welch's ANOVA, so the
# printed labels name it accordingly.
# Note: the three p-values are not corrected for multiple comparisons.
pairwise_comparisons = (
    ("Humans vs Moderate Bots", group1, group2),
    ("Humans vs Advanced Bots", group1, group3),
    ("Moderate Bots vs Advanced Bots", group2, group3),
)

for comparison_label, first_group, second_group in pairwise_comparisons:
    stat, p_value = ttest_ind(first_group, second_group, equal_var=False)
    print(f"Welch's t-test ({comparison_label}): p-value = {p_value}")

Welch's ANOVA (Humans vs Moderate Bots): p-value = 0.06773562623978616
Welch's ANOVA (Humans vs Advanced Bots): p-value = 0.015926313236221838
Welch's ANOVA (Moderate Bots vs Advanced Bots): p-value = 0.3194659178696738


In [18]:
from scipy.stats import f_oneway

# One-way ANOVA over the three per-group count Series; the saved output shows
# the same p-value as the earlier stats.f_oneway cell.
anova_result = f_oneway(group1, group2, group3)
f_stat, p_value = anova_result

print(f"One-way ANOVA across all groups: p-value = {p_value}")

One-way ANOVA across all groups: p-value = 0.030108714253593766
