# Import Libraries

In [1]:
import zipfile
import re
import os
import json
import numpy as np
import pandas as pd


# 1. Unzip zip folder containing raw data 

In [2]:
# Define the zip file and destination folder
zip_path = r'data/raw/web_bot_detection_dataset.zip'  
extract_folder = "data/raw/web_bot_detection_dataset/"  

# Unzip the folder
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

print(f"Extracted to {extract_folder}")

Extracted to data/raw/web_bot_detection_dataset/


# 2. Data preprocessing and cleaning

Data generated and saved in interim folder: 
- mousemovement_data.csv
- weblog_data.csv

## Mouse movement data

### Phase 1

Reading in mousemovement data and converting to dataframe

In [3]:
# reading in mouse movement data
base_dir_moderate_phase1 = "data/raw/web_bot_detection_dataset/web_bot_detection_dataset/phase1/data/mouse_movements/humans_and_moderate_bots"
base_dir_advanced_phase1 = "data/raw/web_bot_detection_dataset/web_bot_detection_dataset/phase1/data/mouse_movements/humans_and_advanced_bots"

def load_mouse_data(base_dir):
    subfolders = [f for f in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, f))]

    all_mouse_data = []

    for subfolder in subfolders:
        json_file_path = os.path.join(base_dir, subfolder, "mouse_movements.json")
        if os.path.exists(json_file_path):
            with open(json_file_path, 'r') as f:
                try:
                    data = json.load(f)
                    all_mouse_data.append(data)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON for file {json_file_path}: {e}")

    return all_mouse_data

moderate_data = load_mouse_data(base_dir_moderate_phase1)
advanced_data = load_mouse_data(base_dir_advanced_phase1)

all_mouse_data = moderate_data + advanced_data

table_data = []


for entry in all_mouse_data:
    session_id = entry.get('session_id', '')
    total_behaviour = entry.get('total_behaviour', '')
    mousemove_times = entry.get('mousemove_times', '')
    mousemove_total_behaviour = entry.get('mousemove_total_behaviour', '')

    movements = []
    clicks = {'left': 0, 'right': 0, 'middle': 0}

    if total_behaviour:
        total_behaviour_split = total_behaviour.split('][')

        for part in total_behaviour_split:
            if part.startswith('m('):  # coords
                movements.append(part[2:-1].split(','))
            elif part.startswith('c('):  # clicks
                click_type = part[2:-1]
                if click_type == 'l':
                    clicks['left'] += 1
                elif click_type == 'r':
                    clicks['right'] += 1
                elif click_type == 'm':
                    clicks['middle'] += 1

    table_data.append({
        'session_id': session_id,
        'total_behaviour': total_behaviour,
        'mousemove_times': mousemove_times,
        'mousemove_total_behaviour': mousemove_total_behaviour,
        'mouse_movements': movements,
        'clicks': clicks
    })

mousemovement_phase1_df = pd.DataFrame(table_data)

mousemovement_phase1_df.head()

Unnamed: 0,session_id,total_behaviour,mousemove_times,mousemove_total_behaviour,mouse_movements,clicks
0,os9be9s5er1ud8569gm26htp0j,"[m(0,4)][m(0,5)][m(0,6)][m(0,7)][m(1,8)][m(1,9...","444694243,444694260,444694276,444694293,444694...","[0,4][0,5][0,6][0,7][1,8][1,9][1,10][1,11][2,1...","[[0, 5], [0, 6], [0, 7], [1, 8], [1, 9], [1, 1...","{'left': 10, 'right': 0, 'middle': 0}"
1,7onurvslijk8fm97iohvhcoq52,"[m(0,4)][m(0,5)][m(0,6)][m(0,7)][m(1,8)][m(1,9...","440256452,440256468,440256485,440256502,440256...","[0,4][0,5][0,6][0,7][1,8][1,9][1,10][1,11][2,1...","[[0, 5], [0, 6], [0, 7], [1, 8], [1, 9], [1, 1...","{'left': 17, 'right': 0, 'middle': 0}"
2,7f2bg9n48opuf4ligvtc1dsopr,"[m(582,572)][m(580,571)][m(579,568)][m(574,565...","217622919,217622927,217622940,217622956,217622...","[582,572][580,571][579,568][574,565][572,560][...","[[580, 571], [579, 568], [574, 565], [572, 560...","{'left': 14, 'right': 0, 'middle': 0}"
3,4mae49mh8va4g03vjatfn6np9b,"[m(646,569)][m(646,563)][m(646,557)][m(645,550...","214931615,214931634,214931646,214931663,214931...","[646,569][646,563][646,557][645,550][643,543][...","[[646, 563], [646, 557], [645, 550], [643, 543...","{'left': 12, 'right': 0, 'middle': 0}"
4,jfmilo33fin84baeh3k6bcnh3v,"[m(0,4)][m(0,5)][m(0,6)][m(0,7)][m(1,8)][m(1,9...","445720265,445720281,445720298,445720315,445720...","[0,4][0,5][0,6][0,7][1,8][1,9][1,10][1,11][2,1...","[[0, 5], [0, 6], [0, 7], [1, 8], [1, 9], [1, 1...","{'left': 15, 'right': 0, 'middle': 0}"


Reading in Annotation data

In [4]:
# Annotations data consisting of label mapping of phase 1 mousemovement data

# merge all the train test files together
base_path_phase1 = "data/raw/web_bot_detection_dataset/web_bot_detection_dataset/phase1/annotations"

mod_train_phase1 = pd.read_csv(base_path_phase1 + "/humans_and_moderate_bots/train", sep=" ", header=None, names=["id", "label"])
mod_test_phase1 = pd.read_csv(base_path_phase1 + "/humans_and_moderate_bots/test", sep=" ", header=None, names=["id", "label"])
adv_train_phase1 = pd.read_csv(base_path_phase1 + "/humans_and_advanced_bots/train", sep=" ", header=None, names=["id", "label"])
adv_test_phase1 = pd.read_csv(base_path_phase1 + "/humans_and_advanced_bots/test", sep=" ", header=None, names=["id", "label"])

label_df_phase1 = pd.concat([mod_train_phase1, mod_test_phase1, adv_train_phase1, adv_test_phase1], ignore_index=True)

label_df_phase1.head()

Unnamed: 0,id,label
0,dr09rk5eagjuu87gedvdqmq3gl,human
1,gq715ms79515gcq39vf91mli6t,human
2,hrbko2t4t14q3pahqltndlolb5,human
3,nvmlnfhs5v6hehsd81e9mf75cn,human
4,brrlh9tmiodt2ekkjvn7kcsps0,human


Mapping labels to each record in mousemovement data

In [5]:
# mapping labels to mousemovement data
mousemovement_phase1_df = mousemovement_phase1_df.merge(label_df_phase1, left_on="session_id", right_on="id", how="left")
mousemovement_phase1_df.drop(columns=["id"], inplace=True)

mousemovement_phase1_df.head()

Unnamed: 0,session_id,total_behaviour,mousemove_times,mousemove_total_behaviour,mouse_movements,clicks,label
0,os9be9s5er1ud8569gm26htp0j,"[m(0,4)][m(0,5)][m(0,6)][m(0,7)][m(1,8)][m(1,9...","444694243,444694260,444694276,444694293,444694...","[0,4][0,5][0,6][0,7][1,8][1,9][1,10][1,11][2,1...","[[0, 5], [0, 6], [0, 7], [1, 8], [1, 9], [1, 1...","{'left': 10, 'right': 0, 'middle': 0}",moderate_bot
1,7onurvslijk8fm97iohvhcoq52,"[m(0,4)][m(0,5)][m(0,6)][m(0,7)][m(1,8)][m(1,9...","440256452,440256468,440256485,440256502,440256...","[0,4][0,5][0,6][0,7][1,8][1,9][1,10][1,11][2,1...","[[0, 5], [0, 6], [0, 7], [1, 8], [1, 9], [1, 1...","{'left': 17, 'right': 0, 'middle': 0}",moderate_bot
2,7f2bg9n48opuf4ligvtc1dsopr,"[m(582,572)][m(580,571)][m(579,568)][m(574,565...","217622919,217622927,217622940,217622956,217622...","[582,572][580,571][579,568][574,565][572,560][...","[[580, 571], [579, 568], [574, 565], [572, 560...","{'left': 14, 'right': 0, 'middle': 0}",human
3,7f2bg9n48opuf4ligvtc1dsopr,"[m(582,572)][m(580,571)][m(579,568)][m(574,565...","217622919,217622927,217622940,217622956,217622...","[582,572][580,571][579,568][574,565][572,560][...","[[580, 571], [579, 568], [574, 565], [572, 560...","{'left': 14, 'right': 0, 'middle': 0}",human
4,4mae49mh8va4g03vjatfn6np9b,"[m(646,569)][m(646,563)][m(646,557)][m(645,550...","214931615,214931634,214931646,214931663,214931...","[646,569][646,563][646,557][645,550][643,543][...","[[646, 563], [646, 557], [645, 550], [643, 543...","{'left': 12, 'right': 0, 'middle': 0}",human


In [6]:
# change mousemove_times to arr
mousemovement_phase1_df["mousemove_times"] = mousemovement_phase1_df["mousemove_times"].apply(lambda x: [int(t) for t in x.split(",") if t.strip()])

### Phase 2

### Combined

## Web log data

Columns in web log data after slicing
1. datetime - date and time of request
2. method - HTTP request method 
3. url - specific resource or endpoint being requested in the HTTP request
4. protocol - specifies the protocol used for communication between the client and the server
5. status - HTTP response status code sent by the server to indicate the result of the request
6. byte_size - size (in bytes) of the response body returned by the server
7. referrer - URL of the web page that linked to the requested resource
8. Session_ID - unique identifier for the user’s session
9. user_agent - details about the user's device, operating system, and browser

In [7]:
# directory to raw web log data
base_dir_phase1_weblog = 'data/raw/web_bot_detection_dataset/web_bot_detection_dataset/phase1/data/web_logs'
base_dir_phase2_weblog = 'data/raw/web_bot_detection_dataset/web_bot_detection_dataset/phase2/data/web_logs'

In [8]:
# combined all weblog json files and separate them into columns
def web_log_df(directory):
    humans_web_log_dir = directory + '/humans'
    bots_web_log_dir = directory + '/bots'
    # Apache log format
    log_pattern = re.compile(
        r'- - \[(?P<datetime>[^]]+)\] "(?P<method>\S+) (?P<url>\S+) (?P<protocol>\S+)" (?P<status>\d+) (?P<byte_size>\d+) "(?P<referrer>[^"]*)" (?P<Session_ID>\S+) "(?P<user_agent>[^"]+)"'
    )
    parsed_logs = []
    for i in [humans_web_log_dir,bots_web_log_dir]:
        # label the records based on folder name
        if i == humans_web_log_dir:
            label = "humans" 
        else:
            label = "bots"
        for log in os.listdir(i):
            with open(i + '/' + log, 'r') as file:
                log_entries = file.readlines()
            for log in log_entries:
                match = log_pattern.match(log)
                if match:
                    log_data = match.groupdict()
                    log_data["category"] = label 
                    parsed_logs.append(log_data)

    df = pd.DataFrame(parsed_logs)
    df['datetime'] = pd.to_datetime(df['datetime'], format="%d/%b/%Y:%H:%M:%S %z").dt.tz_localize(None)
    return df

In [9]:
# generate the processed web log data for each phase
web_log_phase1 = web_log_df(base_dir_phase1_weblog)
web_log_phase2 = web_log_df(base_dir_phase2_weblog)

# merge data from both phases together
weblog_data = pd.concat([web_log_phase1, web_log_phase2], ignore_index=True)

weblog_data.head()

Unnamed: 0,datetime,method,url,protocol,status,byte_size,referrer,Session_ID,user_agent,category
0,2019-10-24 07:45:52,GET,/,HTTP/1.1,200,2712,-,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans
1,2019-10-24 07:45:52,GET,/css/main.css,HTTP/1.1,200,764,https://160.40.52.164/,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans
2,2019-10-24 07:45:52,GET,/js/initialise_vars.js,HTTP/1.1,200,770,https://160.40.52.164/,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans
3,2019-10-24 07:45:52,GET,/js/cookies_functions.js,HTTP/1.1,200,1011,https://160.40.52.164/,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans
4,2019-10-24 07:45:52,GET,/js/mousemove_onclick.js,HTTP/1.1,200,2167,https://160.40.52.164/,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans


# 3. Feature Engineering

Data generated and saved in interim folder: 
- final_mousemovement_data.csv
- final_weblog_data.csv

# 4. Merging of data

Data generated and saved in processed folder: 
- data.csv