# mouse p1 initial code

In [None]:
from google.colab import drive
import zipfile
import os
import json
import numpy as np
import pandas as pd
import ast
import re

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive; the data paths used by
# the following cells all live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the project folder on the mounted Drive the notebook's working directory.
%cd /content/drive/MyDrive/DSA4263


/content/drive/MyDrive/DSA4263


In [None]:
# reading in mouse movement data
# Root folders of the two mouse-movement datasets. load_mouse_data scans each one for
# sub-folders that contain a mouse_movements.json file.
base_dir_moderate = "/content/drive/MyDrive/DSA4263/mouse_movements/humans_and_moderate_bots"
base_dir_advanced = "/content/drive/MyDrive/DSA4263/mouse_movements/humans_and_advanced_bots"

def load_mouse_data(base_dir, category_label):
    """Load every ``mouse_movements.json`` found one level below ``base_dir``.

    Args:
        base_dir: Directory whose immediate sub-folders may each contain a
            ``mouse_movements.json`` file.
        category_label: Value stored under the ``'category'`` key of every
            record that is loaded.

    Returns:
        A list of dicts, one per successfully parsed file, ordered by
        sub-folder name. Sub-folders without the JSON file, files that are not
        valid JSON, and files whose top-level value is not a JSON object are
        skipped (the latter two with a printed message).
    """
    all_mouse_data = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of everything built from this list reproducible between runs.
    for subfolder in sorted(os.listdir(base_dir)):
        session_dir = os.path.join(base_dir, subfolder)
        if not os.path.isdir(session_dir):
            continue

        json_file_path = os.path.join(session_dir, "mouse_movements.json")
        if not os.path.exists(json_file_path):
            continue

        try:
            with open(json_file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON for file {json_file_path}: {e}")
            continue

        # The record is tagged by key assignment below, which only works on a
        # JSON object; report anything else instead of raising TypeError.
        if not isinstance(data, dict):
            print(f"Skipping {json_file_path}: expected a JSON object, got {type(data).__name__}")
            continue

        data['category'] = category_label
        all_mouse_data.append(data)

    return all_mouse_data

moderate_data = load_mouse_data(base_dir_moderate, 'moderate')
advanced_data = load_mouse_data(base_dir_advanced, 'advanced')

all_mouse_data = moderate_data + advanced_data

# Single-letter button codes used by click events -> key in the per-session counter.
CLICK_BUTTONS = {'l': 'left', 'r': 'right', 'm': 'middle'}

table_data = []

# Build one table row per loaded session record.
for entry in all_mouse_data:
    session_id = entry.get('session_id', '')
    total_behaviour = entry.get('total_behaviour', '')
    mousemove_times = entry.get('mousemove_times', '')
    mousemove_total_behaviour = entry.get('mousemove_total_behaviour', '')
    category = entry.get('category', '')

    movements = []
    clicks = {'left': 0, 'right': 0, 'middle': 0}

    if total_behaviour:
        # total_behaviour is a run of bracketed events, e.g. "[m(0,5)][m(0,6)][c(l)]".
        # Remove the outermost brackets before splitting on "][": otherwise the first
        # event keeps its leading "[" (and is skipped by the startswith checks) and the
        # last event keeps its trailing "]" (corrupting its payload).
        events = total_behaviour.strip().strip('[]').split('][')

        for part in events:
            if part.startswith('m('):  # coords
                # "m(x,y)" -> ["x", "y"] (kept as strings)
                movements.append(part[2:-1].split(','))
            elif part.startswith('c('):  # clicks
                button = CLICK_BUTTONS.get(part[2:-1])
                if button is not None:  # unknown button codes are not counted
                    clicks[button] += 1

    table_data.append({
        'session_id': session_id,
        'total_behaviour': total_behaviour,
        'mousemove_times': mousemove_times,
        'mousemove_total_behaviour': mousemove_total_behaviour,
        'mouse_movements': movements,
        'clicks': clicks,
        'category': category
    })

df = pd.DataFrame(table_data)

print(df.head())

                   session_id  \
0  0ht0u328t4mkgi01sp7mm07e01   
1  0p0kaqfmftrtfg0u12rkshrdi9   
2  0ogg5l1i4nh37ek8mjpn8q9fc0   
3  0sejj3er3n0v1sfjv2jbnhaav4   
4  1aqgqrcuurlmvvbbpirvsh7e53   

                                     total_behaviour  \
0  [m(0,5)][m(0,6)][m(0,7)][m(0,8)][m(0,9)][m(1,1...   
1  [m(544,545)][m(544,544)][m(543,543)][m(541,542...   
2  [m(24,0)][m(24,1)][m(24,2)][m(24,3)][m(24,4)][...   
3  [m(0,4)][m(0,5)][m(0,6)][m(0,7)][m(1,8)][m(1,9...   
4  [m(589,553)][m(589,545)][m(589,537)][m(590,530...   

                                     mousemove_times  \
0  447717248,447717265,447717281,447717298,447717...   
1  215772898,215772921,215772936,215772953,215772...   
2  443682927,443682939,443682955,443682972,443682...   
3  448128668,448128685,448128702,448128718,448128...   
4  341919607,341919621,341919637,341919654,341919...   

                           mousemove_total_behaviour  \
0  [0,5][0,6][0,7][0,8][0,9][1,10][1,11][1,12][1,...   
1  [544,545][54

In [None]:
# Display the per-session table (one row per loaded mouse_movements.json record).
df

Unnamed: 0,session_id,total_behaviour,mousemove_times,mousemove_total_behaviour,mouse_movements,clicks,category
0,0ht0u328t4mkgi01sp7mm07e01,"[m(0,5)][m(0,6)][m(0,7)][m(0,8)][m(0,9)][m(1,1...","447717248,447717265,447717281,447717298,447717...","[0,5][0,6][0,7][0,8][0,9][1,10][1,11][1,12][1,...","[[0, 6], [0, 7], [0, 8], [0, 9], [1, 10], [1, ...","{'left': 10, 'right': 0, 'middle': 0}",moderate
1,0p0kaqfmftrtfg0u12rkshrdi9,"[m(544,545)][m(544,544)][m(543,543)][m(541,542...","215772898,215772921,215772936,215772953,215772...","[544,545][544,544][543,543][541,542][540,541][...","[[544, 544], [543, 543], [541, 542], [540, 541...","{'left': 10, 'right': 0, 'middle': 0}",moderate
2,0ogg5l1i4nh37ek8mjpn8q9fc0,"[m(24,0)][m(24,1)][m(24,2)][m(24,3)][m(24,4)][...","443682927,443682939,443682955,443682972,443682...","[24,0][24,1][24,2][24,3][24,4][24,5][24,6][24,...","[[24, 1], [24, 2], [24, 3], [24, 4], [24, 5], ...","{'left': 8, 'right': 0, 'middle': 0}",moderate
3,0sejj3er3n0v1sfjv2jbnhaav4,"[m(0,4)][m(0,5)][m(0,6)][m(0,7)][m(1,8)][m(1,9...","448128668,448128685,448128702,448128718,448128...","[0,4][0,5][0,6][0,7][1,8][1,9][1,10][1,11][2,1...","[[0, 5], [0, 6], [0, 7], [1, 8], [1, 9], [1, 1...","{'left': 10, 'right': 0, 'middle': 0}",moderate
4,1aqgqrcuurlmvvbbpirvsh7e53,"[m(589,553)][m(589,545)][m(589,537)][m(590,530...","341919607,341919621,341919637,341919654,341919...","[589,553][589,545][589,537][590,530][592,524][...","[[589, 545], [589, 537], [590, 530], [592, 524...","{'left': 12, 'right': 0, 'middle': 0}",moderate
...,...,...,...,...,...,...,...
195,tdm49rlqprtglqndl74j4cekj0,"[m(8,6)][m(16,14)][m(24,22)][m(32,30)][m(40,38...","616752013,616752022,616752037,616752054,616752...","[8,6][16,14][24,22][32,30][40,38][48,46][56,54...","[[16, 14], [24, 22], [32, 30], [40, 38], [48, ...","{'left': 12, 'right': 0, 'middle': 0}",advanced
196,v6g56j1moj87j95r7n37hjcjqa,"[m(32,6)][m(40,14)][m(48,22)][m(56,30)][m(64,3...","615740494,615740502,615740518,615740535,615740...","[32,6][40,14][48,22][56,30][64,38][72,46][80,5...","[[40, 14], [48, 22], [56, 30], [64, 38], [72, ...","{'left': 17, 'right': 0, 'middle': 0}",advanced
197,v7qok0dv5ekcaoefquadgka6me,"[m(599,558)][m(581,545)][m(560,529)][m(536,510...","215301883,215301899,215301915,215301932,215301...","[599,558][581,545][560,529][536,510][513,492][...","[[581, 545], [560, 529], [536, 510], [513, 492...","{'left': 15, 'right': 0, 'middle': 0}",advanced
198,trga4mrvjvogdgoi9vrqkmi92j,"[m(6,7)][m(14,15)][m(22,23)][m(29,31)][m(37,39...","619080239,619080248,619080262,619080278,619080...","[6,7][14,15][22,23][29,31][37,39][45,47][52,55...","[[14, 15], [22, 23], [29, 31], [37, 39], [45, ...","{'left': 12, 'right': 0, 'middle': 0}",advanced


In [None]:
# the train test files
# NOTE(review): this path spells the Drive folder "My Drive" while the other cells use
# "MyDrive" — confirm both spellings resolve on the current Colab mount.
base_path = "/content/drive/My Drive/DSA4263/train test"


def _read_label_file(relative_path):
    """Read one header-less, space-separated label file located under ``base_path``.

    The two columns are named ``id`` and ``human_or_bot``.
    """
    return pd.read_csv(
        base_path + relative_path,
        sep=" ",
        header=None,
        names=["id", "human_or_bot"],
    )


mod_train = _read_label_file("/humans_and_moderate_bots/train")
mod_test = _read_label_file("/humans_and_moderate_bots/test")
adv_train = _read_label_file("/humans_and_advanced_bots/train")
adv_test = _read_label_file("/humans_and_advanced_bots/test")

# Stack all four label files into one lookup table with a fresh 0..n-1 index.
label_frames = [mod_train, mod_test, adv_train, adv_test]
merged_df = pd.concat(label_frames, ignore_index=True)

merged_df.head()

Unnamed: 0,id,human_or_bot
0,dr09rk5eagjuu87gedvdqmq3gl,human
1,gq715ms79515gcq39vf91mli6t,human
2,hrbko2t4t14q3pahqltndlolb5,human
3,nvmlnfhs5v6hehsd81e9mf75cn,human
4,brrlh9tmiodt2ekkjvn7kcsps0,human


In [None]:
# joining mouseclicker files with train test files
# merged_df stacks four label files, so an id that is listed in more than one of them
# occurs several times. A left merge then emits the matching session once per
# occurrence, producing duplicated session rows. Keep one label row per id first.
# NOTE(review): assumes an id carries the same label wherever it is listed — confirm.
session_labels = merged_df.drop_duplicates(subset="id")

# Left merge keeps every session row even when no label is found (label is then NaN);
# the redundant join key "id" is dropped afterwards.
overall_mouseclicker_df = (
    df.merge(session_labels, left_on="session_id", right_on="id", how="left")
      .drop(columns=["id"])
)

overall_mouseclicker_df.to_csv(base_path + "/overall_mouseclicker_df.csv", index=False)

overall_mouseclicker_df.head()

Unnamed: 0,session_id,total_behaviour,mousemove_times,mousemove_total_behaviour,mouse_movements,clicks,category,human_or_bot
0,0ht0u328t4mkgi01sp7mm07e01,"[m(0,5)][m(0,6)][m(0,7)][m(0,8)][m(0,9)][m(1,1...","447717248,447717265,447717281,447717298,447717...","[0,5][0,6][0,7][0,8][0,9][1,10][1,11][1,12][1,...","[[0, 6], [0, 7], [0, 8], [0, 9], [1, 10], [1, ...","{'left': 10, 'right': 0, 'middle': 0}",moderate,moderate_bot
1,0p0kaqfmftrtfg0u12rkshrdi9,"[m(544,545)][m(544,544)][m(543,543)][m(541,542...","215772898,215772921,215772936,215772953,215772...","[544,545][544,544][543,543][541,542][540,541][...","[[544, 544], [543, 543], [541, 542], [540, 541...","{'left': 10, 'right': 0, 'middle': 0}",moderate,human
2,0p0kaqfmftrtfg0u12rkshrdi9,"[m(544,545)][m(544,544)][m(543,543)][m(541,542...","215772898,215772921,215772936,215772953,215772...","[544,545][544,544][543,543][541,542][540,541][...","[[544, 544], [543, 543], [541, 542], [540, 541...","{'left': 10, 'right': 0, 'middle': 0}",moderate,human
3,0ogg5l1i4nh37ek8mjpn8q9fc0,"[m(24,0)][m(24,1)][m(24,2)][m(24,3)][m(24,4)][...","443682927,443682939,443682955,443682972,443682...","[24,0][24,1][24,2][24,3][24,4][24,5][24,6][24,...","[[24, 1], [24, 2], [24, 3], [24, 4], [24, 5], ...","{'left': 8, 'right': 0, 'middle': 0}",moderate,moderate_bot
4,0sejj3er3n0v1sfjv2jbnhaav4,"[m(0,4)][m(0,5)][m(0,6)][m(0,7)][m(1,8)][m(1,9...","448128668,448128685,448128702,448128718,448128...","[0,4][0,5][0,6][0,7][1,8][1,9][1,10][1,11][2,1...","[[0, 5], [0, 6], [0, 7], [1, 8], [1, 9], [1, 1...","{'left': 10, 'right': 0, 'middle': 0}",moderate,moderate_bot


In [None]:
# Drop the per-session click counters. The result is assigned back to the same name,
# so errors='ignore' keeps this cell re-runnable (a second run would otherwise raise
# KeyError because 'clicks' is already gone).
overall_mouseclicker_df = overall_mouseclicker_df.drop(columns=['clicks'], errors='ignore')
overall_mouseclicker_df

Unnamed: 0,session_id,total_behaviour,mousemove_times,mousemove_total_behaviour,mouse_movements,category,human_or_bot
0,0ht0u328t4mkgi01sp7mm07e01,"[m(0,5)][m(0,6)][m(0,7)][m(0,8)][m(0,9)][m(1,1...","447717248,447717265,447717281,447717298,447717...","[0,5][0,6][0,7][0,8][0,9][1,10][1,11][1,12][1,...","[[0, 6], [0, 7], [0, 8], [0, 9], [1, 10], [1, ...",moderate,moderate_bot
1,0p0kaqfmftrtfg0u12rkshrdi9,"[m(544,545)][m(544,544)][m(543,543)][m(541,542...","215772898,215772921,215772936,215772953,215772...","[544,545][544,544][543,543][541,542][540,541][...","[[544, 544], [543, 543], [541, 542], [540, 541...",moderate,human
2,0p0kaqfmftrtfg0u12rkshrdi9,"[m(544,545)][m(544,544)][m(543,543)][m(541,542...","215772898,215772921,215772936,215772953,215772...","[544,545][544,544][543,543][541,542][540,541][...","[[544, 544], [543, 543], [541, 542], [540, 541...",moderate,human
3,0ogg5l1i4nh37ek8mjpn8q9fc0,"[m(24,0)][m(24,1)][m(24,2)][m(24,3)][m(24,4)][...","443682927,443682939,443682955,443682972,443682...","[24,0][24,1][24,2][24,3][24,4][24,5][24,6][24,...","[[24, 1], [24, 2], [24, 3], [24, 4], [24, 5], ...",moderate,moderate_bot
4,0sejj3er3n0v1sfjv2jbnhaav4,"[m(0,4)][m(0,5)][m(0,6)][m(0,7)][m(1,8)][m(1,9...","448128668,448128685,448128702,448128718,448128...","[0,4][0,5][0,6][0,7][1,8][1,9][1,10][1,11][2,1...","[[0, 5], [0, 6], [0, 7], [1, 8], [1, 9], [1, 1...",moderate,moderate_bot
...,...,...,...,...,...,...,...
295,v6g56j1moj87j95r7n37hjcjqa,"[m(32,6)][m(40,14)][m(48,22)][m(56,30)][m(64,3...","615740494,615740502,615740518,615740535,615740...","[32,6][40,14][48,22][56,30][64,38][72,46][80,5...","[[40, 14], [48, 22], [56, 30], [64, 38], [72, ...",advanced,advanced_bot
296,v7qok0dv5ekcaoefquadgka6me,"[m(599,558)][m(581,545)][m(560,529)][m(536,510...","215301883,215301899,215301915,215301932,215301...","[599,558][581,545][560,529][536,510][513,492][...","[[581, 545], [560, 529], [536, 510], [513, 492...",advanced,human
297,v7qok0dv5ekcaoefquadgka6me,"[m(599,558)][m(581,545)][m(560,529)][m(536,510...","215301883,215301899,215301915,215301932,215301...","[599,558][581,545][560,529][536,510][513,492][...","[[581, 545], [560, 529], [536, 510], [513, 492...",advanced,human
298,trga4mrvjvogdgoi9vrqkmi92j,"[m(6,7)][m(14,15)][m(22,23)][m(29,31)][m(37,39...","619080239,619080248,619080262,619080278,619080...","[6,7][14,15][22,23][29,31][37,39][45,47][52,55...","[[14, 15], [22, 23], [29, 31], [37, 39], [45, ...",advanced,advanced_bot


# mouse p1 additional code

In [None]:
# Continue on a copy so the cells below do not modify overall_mouseclicker_df.
# Note: this rebinds `df`, which earlier in the notebook held the unlabelled session table.
df = overall_mouseclicker_df.copy()

In [None]:
# Display the labelled per-session table that the following cells transform.
df

Unnamed: 0,session_id,total_behaviour,mousemove_times,mousemove_total_behaviour,mouse_movements,category,human_or_bot
0,0ht0u328t4mkgi01sp7mm07e01,"[m(0,5)][m(0,6)][m(0,7)][m(0,8)][m(0,9)][m(1,1...","447717248,447717265,447717281,447717298,447717...","[0,5][0,6][0,7][0,8][0,9][1,10][1,11][1,12][1,...","[[0, 6], [0, 7], [0, 8], [0, 9], [1, 10], [1, ...",moderate,moderate_bot
1,0p0kaqfmftrtfg0u12rkshrdi9,"[m(544,545)][m(544,544)][m(543,543)][m(541,542...","215772898,215772921,215772936,215772953,215772...","[544,545][544,544][543,543][541,542][540,541][...","[[544, 544], [543, 543], [541, 542], [540, 541...",moderate,human
2,0p0kaqfmftrtfg0u12rkshrdi9,"[m(544,545)][m(544,544)][m(543,543)][m(541,542...","215772898,215772921,215772936,215772953,215772...","[544,545][544,544][543,543][541,542][540,541][...","[[544, 544], [543, 543], [541, 542], [540, 541...",moderate,human
3,0ogg5l1i4nh37ek8mjpn8q9fc0,"[m(24,0)][m(24,1)][m(24,2)][m(24,3)][m(24,4)][...","443682927,443682939,443682955,443682972,443682...","[24,0][24,1][24,2][24,3][24,4][24,5][24,6][24,...","[[24, 1], [24, 2], [24, 3], [24, 4], [24, 5], ...",moderate,moderate_bot
4,0sejj3er3n0v1sfjv2jbnhaav4,"[m(0,4)][m(0,5)][m(0,6)][m(0,7)][m(1,8)][m(1,9...","448128668,448128685,448128702,448128718,448128...","[0,4][0,5][0,6][0,7][1,8][1,9][1,10][1,11][2,1...","[[0, 5], [0, 6], [0, 7], [1, 8], [1, 9], [1, 1...",moderate,moderate_bot
...,...,...,...,...,...,...,...
295,v6g56j1moj87j95r7n37hjcjqa,"[m(32,6)][m(40,14)][m(48,22)][m(56,30)][m(64,3...","615740494,615740502,615740518,615740535,615740...","[32,6][40,14][48,22][56,30][64,38][72,46][80,5...","[[40, 14], [48, 22], [56, 30], [64, 38], [72, ...",advanced,advanced_bot
296,v7qok0dv5ekcaoefquadgka6me,"[m(599,558)][m(581,545)][m(560,529)][m(536,510...","215301883,215301899,215301915,215301932,215301...","[599,558][581,545][560,529][536,510][513,492][...","[[581, 545], [560, 529], [536, 510], [513, 492...",advanced,human
297,v7qok0dv5ekcaoefquadgka6me,"[m(599,558)][m(581,545)][m(560,529)][m(536,510...","215301883,215301899,215301915,215301932,215301...","[599,558][581,545][560,529][536,510][513,492][...","[[581, 545], [560, 529], [536, 510], [513, 492...",advanced,human
298,trga4mrvjvogdgoi9vrqkmi92j,"[m(6,7)][m(14,15)][m(22,23)][m(29,31)][m(37,39...","619080239,619080248,619080262,619080278,619080...","[6,7][14,15][22,23][29,31][37,39][45,47][52,55...","[[14, 15], [22, 23], [29, 31], [37, 39], [45, ...",advanced,advanced_bot


In [None]:
# change mousemove_times to arr
def _to_time_list(raw_times):
    """Convert a comma-separated timestamp string into a list of ints.

    Empty or whitespace-only fields are skipped. A value that is already a list is
    returned unchanged, so re-running this cell (which overwrites the column in
    place) does not fail on the already-converted values.
    """
    if isinstance(raw_times, list):
        return raw_times
    return [int(t) for t in raw_times.split(",") if t.strip()]


df["mousemove_times"] = df["mousemove_times"].apply(_to_time_list)
df

Unnamed: 0,session_id,total_behaviour,mousemove_times,mousemove_total_behaviour,mouse_movements,category,human_or_bot
0,0ht0u328t4mkgi01sp7mm07e01,"[m(0,5)][m(0,6)][m(0,7)][m(0,8)][m(0,9)][m(1,1...","[447717248, 447717265, 447717281, 447717298, 4...","[0,5][0,6][0,7][0,8][0,9][1,10][1,11][1,12][1,...","[[0, 6], [0, 7], [0, 8], [0, 9], [1, 10], [1, ...",moderate,moderate_bot
1,0p0kaqfmftrtfg0u12rkshrdi9,"[m(544,545)][m(544,544)][m(543,543)][m(541,542...","[215772898, 215772921, 215772936, 215772953, 2...","[544,545][544,544][543,543][541,542][540,541][...","[[544, 544], [543, 543], [541, 542], [540, 541...",moderate,human
2,0p0kaqfmftrtfg0u12rkshrdi9,"[m(544,545)][m(544,544)][m(543,543)][m(541,542...","[215772898, 215772921, 215772936, 215772953, 2...","[544,545][544,544][543,543][541,542][540,541][...","[[544, 544], [543, 543], [541, 542], [540, 541...",moderate,human
3,0ogg5l1i4nh37ek8mjpn8q9fc0,"[m(24,0)][m(24,1)][m(24,2)][m(24,3)][m(24,4)][...","[443682927, 443682939, 443682955, 443682972, 4...","[24,0][24,1][24,2][24,3][24,4][24,5][24,6][24,...","[[24, 1], [24, 2], [24, 3], [24, 4], [24, 5], ...",moderate,moderate_bot
4,0sejj3er3n0v1sfjv2jbnhaav4,"[m(0,4)][m(0,5)][m(0,6)][m(0,7)][m(1,8)][m(1,9...","[448128668, 448128685, 448128702, 448128718, 4...","[0,4][0,5][0,6][0,7][1,8][1,9][1,10][1,11][2,1...","[[0, 5], [0, 6], [0, 7], [1, 8], [1, 9], [1, 1...",moderate,moderate_bot
...,...,...,...,...,...,...,...
295,v6g56j1moj87j95r7n37hjcjqa,"[m(32,6)][m(40,14)][m(48,22)][m(56,30)][m(64,3...","[615740494, 615740502, 615740518, 615740535, 6...","[32,6][40,14][48,22][56,30][64,38][72,46][80,5...","[[40, 14], [48, 22], [56, 30], [64, 38], [72, ...",advanced,advanced_bot
296,v7qok0dv5ekcaoefquadgka6me,"[m(599,558)][m(581,545)][m(560,529)][m(536,510...","[215301883, 215301899, 215301915, 215301932, 2...","[599,558][581,545][560,529][536,510][513,492][...","[[581, 545], [560, 529], [536, 510], [513, 492...",advanced,human
297,v7qok0dv5ekcaoefquadgka6me,"[m(599,558)][m(581,545)][m(560,529)][m(536,510...","[215301883, 215301899, 215301915, 215301932, 2...","[599,558][581,545][560,529][536,510][513,492][...","[[581, 545], [560, 529], [536, 510], [513, 492...",advanced,human
298,trga4mrvjvogdgoi9vrqkmi92j,"[m(6,7)][m(14,15)][m(22,23)][m(29,31)][m(37,39...","[619080239, 619080248, 619080262, 619080278, 6...","[6,7][14,15][22,23][29,31][37,39][45,47][52,55...","[[14, 15], [22, 23], [29, 31], [37, 39], [45, ...",advanced,advanced_bot


In [None]:
def parse_behavior(behavior_str):
    """Tokenise a behaviour string into ordered ``(kind, token)`` pairs.

    Recognised events are ``[m(<digits>,<digits>)]`` (movement) and
    ``[c(<one word character>)]`` (click). Each yields ``("m", "m(x,y)")`` or
    ``("c", "c(b)")`` respectively, in order of appearance. Anything that
    matches neither form is ignored.
    """
    event_pattern = r'\[(m\(\d+,\d+\))\]|\[(c\(\w\))\]'

    # findall yields one (move_token, click_token) tuple per match; the group that
    # did not take part in the match is an empty string.
    return [
        ("m", move_token) if move_token else ("c", click_token)
        for move_token, click_token in re.findall(event_pattern, behavior_str)
        if move_token or click_token
    ]

# Tokenise every session's total_behaviour string into a list of (kind, token) pairs
# and keep it in a new column for the event expansion that follows.
df["parsed_behaviour"] = df["total_behaviour"].apply(parse_behavior)

df

Unnamed: 0,session_id,total_behaviour,mousemove_times,mousemove_total_behaviour,mouse_movements,category,human_or_bot,parsed_behaviour
0,0ht0u328t4mkgi01sp7mm07e01,"[m(0,5)][m(0,6)][m(0,7)][m(0,8)][m(0,9)][m(1,1...","[447717248, 447717265, 447717281, 447717298, 4...","[0,5][0,6][0,7][0,8][0,9][1,10][1,11][1,12][1,...","[[0, 6], [0, 7], [0, 8], [0, 9], [1, 10], [1, ...",moderate,moderate_bot,"[(m, m(0,5)), (m, m(0,6)), (m, m(0,7)), (m, m(..."
1,0p0kaqfmftrtfg0u12rkshrdi9,"[m(544,545)][m(544,544)][m(543,543)][m(541,542...","[215772898, 215772921, 215772936, 215772953, 2...","[544,545][544,544][543,543][541,542][540,541][...","[[544, 544], [543, 543], [541, 542], [540, 541...",moderate,human,"[(m, m(544,545)), (m, m(544,544)), (m, m(543,5..."
2,0p0kaqfmftrtfg0u12rkshrdi9,"[m(544,545)][m(544,544)][m(543,543)][m(541,542...","[215772898, 215772921, 215772936, 215772953, 2...","[544,545][544,544][543,543][541,542][540,541][...","[[544, 544], [543, 543], [541, 542], [540, 541...",moderate,human,"[(m, m(544,545)), (m, m(544,544)), (m, m(543,5..."
3,0ogg5l1i4nh37ek8mjpn8q9fc0,"[m(24,0)][m(24,1)][m(24,2)][m(24,3)][m(24,4)][...","[443682927, 443682939, 443682955, 443682972, 4...","[24,0][24,1][24,2][24,3][24,4][24,5][24,6][24,...","[[24, 1], [24, 2], [24, 3], [24, 4], [24, 5], ...",moderate,moderate_bot,"[(m, m(24,0)), (m, m(24,1)), (m, m(24,2)), (m,..."
4,0sejj3er3n0v1sfjv2jbnhaav4,"[m(0,4)][m(0,5)][m(0,6)][m(0,7)][m(1,8)][m(1,9...","[448128668, 448128685, 448128702, 448128718, 4...","[0,4][0,5][0,6][0,7][1,8][1,9][1,10][1,11][2,1...","[[0, 5], [0, 6], [0, 7], [1, 8], [1, 9], [1, 1...",moderate,moderate_bot,"[(m, m(0,4)), (m, m(0,5)), (m, m(0,6)), (m, m(..."
...,...,...,...,...,...,...,...,...
295,v6g56j1moj87j95r7n37hjcjqa,"[m(32,6)][m(40,14)][m(48,22)][m(56,30)][m(64,3...","[615740494, 615740502, 615740518, 615740535, 6...","[32,6][40,14][48,22][56,30][64,38][72,46][80,5...","[[40, 14], [48, 22], [56, 30], [64, 38], [72, ...",advanced,advanced_bot,"[(m, m(32,6)), (m, m(40,14)), (m, m(48,22)), (..."
296,v7qok0dv5ekcaoefquadgka6me,"[m(599,558)][m(581,545)][m(560,529)][m(536,510...","[215301883, 215301899, 215301915, 215301932, 2...","[599,558][581,545][560,529][536,510][513,492][...","[[581, 545], [560, 529], [536, 510], [513, 492...",advanced,human,"[(m, m(599,558)), (m, m(581,545)), (m, m(560,5..."
297,v7qok0dv5ekcaoefquadgka6me,"[m(599,558)][m(581,545)][m(560,529)][m(536,510...","[215301883, 215301899, 215301915, 215301932, 2...","[599,558][581,545][560,529][536,510][513,492][...","[[581, 545], [560, 529], [536, 510], [513, 492...",advanced,human,"[(m, m(599,558)), (m, m(581,545)), (m, m(560,5..."
298,trga4mrvjvogdgoi9vrqkmi92j,"[m(6,7)][m(14,15)][m(22,23)][m(29,31)][m(37,39...","[619080239, 619080248, 619080262, 619080278, 6...","[6,7][14,15][22,23][29,31][37,39][45,47][52,55...","[[14, 15], [22, 23], [29, 31], [37, 39], [45, ...",advanced,advanced_bot,"[(m, m(6,7)), (m, m(14,15)), (m, m(22,23)), (m..."


In [None]:
def _expand_session_events(session_id, human_or_bot, times, behaviors):
    """Expand one session into a list of per-event rows.

    Args:
        session_id: Identifier copied into every row.
        human_or_bot: Label copied into every row.
        times: List of movement timestamps; the i-th movement uses ``times[i]``.
        behaviors: Ordered ``(action_type, token)`` pairs, where a movement token
            has the form ``"m(x,y)"``.

    Returns:
        A list of ``[session_id, event_time, action_type, coord, human_or_bot]``
        rows in event order. A click has no timestamp or coordinate of its own: it
        takes the coordinate of the latest movement (``None`` if there was none yet)
        and the timestamp at the current movement index, i.e. that of the next
        movement, clamped to the last available timestamp.

    Raises:
        IndexError: If there are more movements than timestamps.
    """
    rows = []
    last_coord = None
    time_index = 0

    for action_type, action in behaviors:
        if action_type == "m":  # movement
            coord = action[2:-1]  # "m(x,y)" -> "x,y"
            last_coord = coord
            event_time = times[time_index]
            time_index += 1
        else:
            # click
            coord = last_coord  # click gets last movement coord
            event_time = times[min(time_index, len(times) - 1)]

        rows.append([session_id, event_time, action_type, coord, human_or_bot])

    return rows


final_rows = []

# A session_id can occur on several rows of df; expanding each occurrence would
# repeat every event of that session in the event-level table. Expand each session once.
# NOTE(review): assumes repeated rows of a session carry the same events and label — confirm.
unique_sessions = df.drop_duplicates(subset="session_id")

for _, row in unique_sessions.iterrows():
    final_rows.extend(
        _expand_session_events(
            row["session_id"],
            row["human_or_bot"],
            row["mousemove_times"],
            row["parsed_behaviour"],
        )
    )

df_final = pd.DataFrame(final_rows, columns=["session_id", "mousemove_times", "mousemove_type", "mousemove_behaviour", "human_or_bot"])

df_final


Unnamed: 0,session_id,mousemove_times,mousemove_type,mousemove_behaviour,human_or_bot
0,0ht0u328t4mkgi01sp7mm07e01,447717248,m,05,moderate_bot
1,0ht0u328t4mkgi01sp7mm07e01,447717265,m,06,moderate_bot
2,0ht0u328t4mkgi01sp7mm07e01,447717281,m,07,moderate_bot
3,0ht0u328t4mkgi01sp7mm07e01,447717298,m,08,moderate_bot
4,0ht0u328t4mkgi01sp7mm07e01,447717315,m,09,moderate_bot
...,...,...,...,...,...
2714564,uojjl06k7ee9lk3pc48ppaan0a,620638428,m,49269,advanced_bot
2714565,uojjl06k7ee9lk3pc48ppaan0a,620638445,m,48269,advanced_bot
2714566,uojjl06k7ee9lk3pc48ppaan0a,620638461,m,47268,advanced_bot
2714567,uojjl06k7ee9lk3pc48ppaan0a,620638478,m,47268,advanced_bot


In [None]:
# Display the event-level table (one row per movement or click event).
df_final

Unnamed: 0,session_id,mousemove_times,mousemove_type,mousemove_behaviour,human_or_bot
0,0ht0u328t4mkgi01sp7mm07e01,447717248,m,05,moderate_bot
1,0ht0u328t4mkgi01sp7mm07e01,447717265,m,06,moderate_bot
2,0ht0u328t4mkgi01sp7mm07e01,447717281,m,07,moderate_bot
3,0ht0u328t4mkgi01sp7mm07e01,447717298,m,08,moderate_bot
4,0ht0u328t4mkgi01sp7mm07e01,447717315,m,09,moderate_bot
...,...,...,...,...,...
2714564,uojjl06k7ee9lk3pc48ppaan0a,620638428,m,49269,advanced_bot
2714565,uojjl06k7ee9lk3pc48ppaan0a,620638445,m,48269,advanced_bot
2714566,uojjl06k7ee9lk3pc48ppaan0a,620638461,m,47268,advanced_bot
2714567,uojjl06k7ee9lk3pc48ppaan0a,620638478,m,47268,advanced_bot


In [None]:
# df_final.to_csv('/content/drive/MyDrive/DSA4263/df_final_p1.csv', index=False)

In [None]:
# Phase-1 working copy; the cells below reshape it without touching df_final.
df_p1 = df_final.copy()
df_p1

Unnamed: 0,session_id,mousemove_times,mousemove_type,mousemove_behaviour,human_or_bot
0,0ht0u328t4mkgi01sp7mm07e01,447717248,m,05,moderate_bot
1,0ht0u328t4mkgi01sp7mm07e01,447717265,m,06,moderate_bot
2,0ht0u328t4mkgi01sp7mm07e01,447717281,m,07,moderate_bot
3,0ht0u328t4mkgi01sp7mm07e01,447717298,m,08,moderate_bot
4,0ht0u328t4mkgi01sp7mm07e01,447717315,m,09,moderate_bot
...,...,...,...,...,...
2714564,uojjl06k7ee9lk3pc48ppaan0a,620638428,m,49269,advanced_bot
2714565,uojjl06k7ee9lk3pc48ppaan0a,620638445,m,48269,advanced_bot
2714566,uojjl06k7ee9lk3pc48ppaan0a,620638461,m,47268,advanced_bot
2714567,uojjl06k7ee9lk3pc48ppaan0a,620638478,m,47268,advanced_bot


In [None]:
# Rename the label column to "label", the name used by the phase-2 table (df_p2)
# that df_p1 is later concatenated with.
df_p1 = df_p1.rename(columns={"human_or_bot": "label"})
df_p1

Unnamed: 0,session_id,mousemove_times,mousemove_type,mousemove_behaviour,label
0,0ht0u328t4mkgi01sp7mm07e01,447717248,m,05,moderate_bot
1,0ht0u328t4mkgi01sp7mm07e01,447717265,m,06,moderate_bot
2,0ht0u328t4mkgi01sp7mm07e01,447717281,m,07,moderate_bot
3,0ht0u328t4mkgi01sp7mm07e01,447717298,m,08,moderate_bot
4,0ht0u328t4mkgi01sp7mm07e01,447717315,m,09,moderate_bot
...,...,...,...,...,...
2714564,uojjl06k7ee9lk3pc48ppaan0a,620638428,m,49269,advanced_bot
2714565,uojjl06k7ee9lk3pc48ppaan0a,620638445,m,48269,advanced_bot
2714566,uojjl06k7ee9lk3pc48ppaan0a,620638461,m,47268,advanced_bot
2714567,uojjl06k7ee9lk3pc48ppaan0a,620638478,m,47268,advanced_bot


In [None]:
def _wrap_in_parens(coord):
    """Format an ``"x,y"`` coordinate string as ``"(x,y)"``.

    Non-string values (e.g. the ``None`` coordinate of a click that precedes any
    movement) are returned unchanged instead of becoming the literal text
    ``"(None)"``. A value that already starts with ``"("`` is returned as-is, so
    re-running this cell does not wrap it twice.
    """
    if not isinstance(coord, str):
        return coord
    if coord.startswith("("):
        return coord
    return f"({coord})"


df_p1['mousemove_behaviour'] = df_p1['mousemove_behaviour'].apply(_wrap_in_parens)
df_p1

Unnamed: 0,session_id,mousemove_times,mousemove_type,mousemove_behaviour,label
0,0ht0u328t4mkgi01sp7mm07e01,447717248,m,"(0,5)",moderate_bot
1,0ht0u328t4mkgi01sp7mm07e01,447717265,m,"(0,6)",moderate_bot
2,0ht0u328t4mkgi01sp7mm07e01,447717281,m,"(0,7)",moderate_bot
3,0ht0u328t4mkgi01sp7mm07e01,447717298,m,"(0,8)",moderate_bot
4,0ht0u328t4mkgi01sp7mm07e01,447717315,m,"(0,9)",moderate_bot
...,...,...,...,...,...
2714564,uojjl06k7ee9lk3pc48ppaan0a,620638428,m,"(49,269)",advanced_bot
2714565,uojjl06k7ee9lk3pc48ppaan0a,620638445,m,"(48,269)",advanced_bot
2714566,uojjl06k7ee9lk3pc48ppaan0a,620638461,m,"(47,268)",advanced_bot
2714567,uojjl06k7ee9lk3pc48ppaan0a,620638478,m,"(47,268)",advanced_bot


In [None]:
# Add an all-empty placeholder column so df_p1 has the same column set as the
# phase-2 table (df_p2) before the two are concatenated.
df_p1['mousemove_client_height_width'] = None
df_p1

Unnamed: 0,session_id,mousemove_times,mousemove_type,mousemove_behaviour,label,mousemove_client_height_width
0,0ht0u328t4mkgi01sp7mm07e01,447717248,m,"(0,5)",moderate_bot,
1,0ht0u328t4mkgi01sp7mm07e01,447717265,m,"(0,6)",moderate_bot,
2,0ht0u328t4mkgi01sp7mm07e01,447717281,m,"(0,7)",moderate_bot,
3,0ht0u328t4mkgi01sp7mm07e01,447717298,m,"(0,8)",moderate_bot,
4,0ht0u328t4mkgi01sp7mm07e01,447717315,m,"(0,9)",moderate_bot,
...,...,...,...,...,...,...
2714564,uojjl06k7ee9lk3pc48ppaan0a,620638428,m,"(49,269)",advanced_bot,
2714565,uojjl06k7ee9lk3pc48ppaan0a,620638445,m,"(48,269)",advanced_bot,
2714566,uojjl06k7ee9lk3pc48ppaan0a,620638461,m,"(47,268)",advanced_bot,
2714567,uojjl06k7ee9lk3pc48ppaan0a,620638478,m,"(47,268)",advanced_bot,


# mouse p2

In [None]:
# Location of the phase-2 workbook on the mounted Drive.
directory_p2 = "/content/drive/MyDrive/DSA4263"
excel_path = f"{directory_p2}/transformed_data_phase2.xlsx"

# sheet_name=None loads every sheet and returns a dict of sheet name -> DataFrame.
sheets = pd.read_excel(excel_path, sheet_name=None)

# Stack all sheets into one frame with a fresh 0..n-1 index.
df_p2 = pd.concat(sheets.values(), ignore_index=True)

df_p2

Unnamed: 0.1,Unnamed: 0,_id,session_id,mousemove_client_height_width,mousemove_times,mousemove_type,mousemove_height_width,label
0,0,5fbe4cf16b027f7d73668432,h2jo3uq1om8vp81nopaa17kegu,"(847, 1813)",1606307057465,m,"(7,6)",advanced_bot
1,1,5fbe4cf16b027f7d73668432,h2jo3uq1om8vp81nopaa17kegu,"(847, 1813)",1606307057476,m,"(15,14)",advanced_bot
2,2,5fbe4cf16b027f7d73668432,h2jo3uq1om8vp81nopaa17kegu,"(847, 1813)",1606307057489,m,"(23,21)",advanced_bot
3,3,5fbe4cf16b027f7d73668432,h2jo3uq1om8vp81nopaa17kegu,"(847, 1813)",1606307057505,m,"(31,29)",advanced_bot
4,4,5fbe4cf16b027f7d73668432,h2jo3uq1om8vp81nopaa17kegu,"(847, 1813)",1606307057521,m,"(39,37)",advanced_bot
...,...,...,...,...,...,...,...,...
2536133,4899,5fc630e6cb842de741405adb,rh5fn2njcglhb2avkv30id017u,"(639, 1354)",1595505964115,m,"(1082,94)",human
2536134,4900,5fc630e6cb842de741405adb,rh5fn2njcglhb2avkv30id017u,"(639, 1354)",1595505964132,m,"(1140,54)",human
2536135,4901,5fc630e6cb842de741405adb,rh5fn2njcglhb2avkv30id017u,"(639, 1354)",1595505964149,m,"(1182,26)",human
2536136,4902,5fc630e6cb842de741405adb,rh5fn2njcglhb2avkv30id017u,"(639, 1354)",1595505964166,m,"(1214,8)",human


In [None]:
# Drop the two columns that have no counterpart in df_p1; errors="ignore" makes the
# call a no-op for columns that are already absent, so the cell can be re-run.
df_p2 = df_p2.drop(columns=["Unnamed: 0", "_id"], errors="ignore")
df_p2

Unnamed: 0,session_id,mousemove_client_height_width,mousemove_times,mousemove_type,mousemove_height_width,label
0,h2jo3uq1om8vp81nopaa17kegu,"(847, 1813)",1606307057465,m,"(7,6)",advanced_bot
1,h2jo3uq1om8vp81nopaa17kegu,"(847, 1813)",1606307057476,m,"(15,14)",advanced_bot
2,h2jo3uq1om8vp81nopaa17kegu,"(847, 1813)",1606307057489,m,"(23,21)",advanced_bot
3,h2jo3uq1om8vp81nopaa17kegu,"(847, 1813)",1606307057505,m,"(31,29)",advanced_bot
4,h2jo3uq1om8vp81nopaa17kegu,"(847, 1813)",1606307057521,m,"(39,37)",advanced_bot
...,...,...,...,...,...,...
2536133,rh5fn2njcglhb2avkv30id017u,"(639, 1354)",1595505964115,m,"(1082,94)",human
2536134,rh5fn2njcglhb2avkv30id017u,"(639, 1354)",1595505964132,m,"(1140,54)",human
2536135,rh5fn2njcglhb2avkv30id017u,"(639, 1354)",1595505964149,m,"(1182,26)",human
2536136,rh5fn2njcglhb2avkv30id017u,"(639, 1354)",1595505964166,m,"(1214,8)",human


In [None]:
# Rename the coordinate column to the name used in df_p1 so the two tables line up.
df_p2 = df_p2.rename(columns={"mousemove_height_width": "mousemove_behaviour"})
df_p2

Unnamed: 0,session_id,mousemove_client_height_width,mousemove_times,mousemove_type,mousemove_behaviour,label
0,h2jo3uq1om8vp81nopaa17kegu,"(847, 1813)",1606307057465,m,"(7,6)",advanced_bot
1,h2jo3uq1om8vp81nopaa17kegu,"(847, 1813)",1606307057476,m,"(15,14)",advanced_bot
2,h2jo3uq1om8vp81nopaa17kegu,"(847, 1813)",1606307057489,m,"(23,21)",advanced_bot
3,h2jo3uq1om8vp81nopaa17kegu,"(847, 1813)",1606307057505,m,"(31,29)",advanced_bot
4,h2jo3uq1om8vp81nopaa17kegu,"(847, 1813)",1606307057521,m,"(39,37)",advanced_bot
...,...,...,...,...,...,...
2536133,rh5fn2njcglhb2avkv30id017u,"(639, 1354)",1595505964115,m,"(1082,94)",human
2536134,rh5fn2njcglhb2avkv30id017u,"(639, 1354)",1595505964132,m,"(1140,54)",human
2536135,rh5fn2njcglhb2avkv30id017u,"(639, 1354)",1595505964149,m,"(1182,26)",human
2536136,rh5fn2njcglhb2avkv30id017u,"(639, 1354)",1595505964166,m,"(1214,8)",human


In [None]:
# Select the columns in the same order as df_p1 (client size last).
df_p2 = df_p2[['session_id', 'mousemove_times', 'mousemove_type', 'mousemove_behaviour','label', 'mousemove_client_height_width' ]]
df_p2

Unnamed: 0,session_id,mousemove_times,mousemove_type,mousemove_behaviour,label,mousemove_client_height_width
0,h2jo3uq1om8vp81nopaa17kegu,1606307057465,m,"(7,6)",advanced_bot,"(847, 1813)"
1,h2jo3uq1om8vp81nopaa17kegu,1606307057476,m,"(15,14)",advanced_bot,"(847, 1813)"
2,h2jo3uq1om8vp81nopaa17kegu,1606307057489,m,"(23,21)",advanced_bot,"(847, 1813)"
3,h2jo3uq1om8vp81nopaa17kegu,1606307057505,m,"(31,29)",advanced_bot,"(847, 1813)"
4,h2jo3uq1om8vp81nopaa17kegu,1606307057521,m,"(39,37)",advanced_bot,"(847, 1813)"
...,...,...,...,...,...,...
2536133,rh5fn2njcglhb2avkv30id017u,1595505964115,m,"(1082,94)",human,"(639, 1354)"
2536134,rh5fn2njcglhb2avkv30id017u,1595505964132,m,"(1140,54)",human,"(639, 1354)"
2536135,rh5fn2njcglhb2avkv30id017u,1595505964149,m,"(1182,26)",human,"(639, 1354)"
2536136,rh5fn2njcglhb2avkv30id017u,1595505964166,m,"(1214,8)",human,"(639, 1354)"


# df_mouse_p1p2 merge

In [None]:
# merge p1 and p2
# Stack the phase-1 and phase-2 event tables row-wise with a fresh 0..n-1 index.
# NOTE(review): the displayed phase-1 mousemove_times values have 9 digits while the
# phase-2 values have 13 digits, so the two phases may use different time origins or
# units — confirm before comparing timestamps across phases.
df_mouse_p1p2 = pd.concat([df_p1, df_p2], ignore_index=True)
df_mouse_p1p2

In [None]:
# df_mouse_p1p2.to_csv("/content/drive/MyDrive/DSA4263/df_mouse_p1p2.csv", index=False)

# weblog_p1p2 merge

In [None]:
# Read the web-log CSV of each phase from the mounted Drive.
weblog_p1 = pd.read_csv("/content/drive/MyDrive/DSA4263/web_log/web_log_phase1.csv")
weblog_p2 = pd.read_csv("/content/drive/MyDrive/DSA4263/web_log/web_log_phase2.csv")

# Stack both phases row-wise with a fresh 0..n-1 index.
weblog_p1p2 = pd.concat([weblog_p1, weblog_p2], ignore_index=True)

weblog_p1p2

Unnamed: 0,datetime,method,url,protocol,status,byte_size,referrer,Session_ID,user_agent,category
0,2019-10-24 07:45:52,GET,/,HTTP/1.1,200,2712,-,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans
1,2019-10-24 07:45:52,GET,/css/main.css,HTTP/1.1,200,764,https://160.40.52.164/,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans
2,2019-10-24 07:45:52,GET,/js/initialise_vars.js,HTTP/1.1,200,770,https://160.40.52.164/,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans
3,2019-10-24 07:45:52,GET,/js/cookies_functions.js,HTTP/1.1,200,1011,https://160.40.52.164/,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans
4,2019-10-24 07:45:52,GET,/js/mousemove_onclick.js,HTTP/1.1,200,2167,https://160.40.52.164/,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans
...,...,...,...,...,...,...,...,...,...,...
305367,2020-11-25 21:16:22,POST,/storage/store_sess_total_mousemv_db.php,HTTP/1.1,200,316,http://192.168.1.120/content/cryptocurrency.php,cs2g8i2bgh2nc7i5eiq23jp9lv,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,bots
305368,2020-11-25 21:16:23,POST,/storage/store_sess_total_mousemv_db.php,HTTP/1.1,200,316,http://192.168.1.120/content/cryptocurrency.php,cs2g8i2bgh2nc7i5eiq23jp9lv,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,bots
305369,2020-11-25 21:16:24,POST,/storage/store_sess_total_mousemv_db.php,HTTP/1.1,200,316,http://192.168.1.120/content/cryptocurrency.php,cs2g8i2bgh2nc7i5eiq23jp9lv,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,bots
305370,2020-11-25 21:16:24,POST,/storage/store_sess_total_mousemv_db.php,HTTP/1.1,200,316,http://192.168.1.120/content/cryptocurrency.php,cs2g8i2bgh2nc7i5eiq23jp9lv,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,bots


In [None]:
# Persist the combined web log to Drive without the row index.
weblog_p1p2.to_csv(
    path_or_buf="/content/drive/MyDrive/DSA4263/weblog_p1p2.csv",
    index=False,
)

# mouse_weblog_merge: mouse df with some columns from weblog df


If you have enough RAM, you can try running this merge.

In [None]:
# Reload the combined web log from its CSV export on Drive.
weblog_p1p2 = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/DSA4263/weblog_p1p2.csv"
)

weblog_p1p2

Unnamed: 0,datetime,method,url,protocol,status,byte_size,referrer,Session_ID,user_agent,category
0,2019-10-24 07:45:52,GET,/,HTTP/1.1,200,2712,-,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans
1,2019-10-24 07:45:52,GET,/css/main.css,HTTP/1.1,200,764,https://160.40.52.164/,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans
2,2019-10-24 07:45:52,GET,/js/initialise_vars.js,HTTP/1.1,200,770,https://160.40.52.164/,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans
3,2019-10-24 07:45:52,GET,/js/cookies_functions.js,HTTP/1.1,200,1011,https://160.40.52.164/,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans
4,2019-10-24 07:45:52,GET,/js/mousemove_onclick.js,HTTP/1.1,200,2167,https://160.40.52.164/,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans
...,...,...,...,...,...,...,...,...,...,...
305367,2020-11-25 21:16:22,POST,/storage/store_sess_total_mousemv_db.php,HTTP/1.1,200,316,http://192.168.1.120/content/cryptocurrency.php,cs2g8i2bgh2nc7i5eiq23jp9lv,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,bots
305368,2020-11-25 21:16:23,POST,/storage/store_sess_total_mousemv_db.php,HTTP/1.1,200,316,http://192.168.1.120/content/cryptocurrency.php,cs2g8i2bgh2nc7i5eiq23jp9lv,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,bots
305369,2020-11-25 21:16:24,POST,/storage/store_sess_total_mousemv_db.php,HTTP/1.1,200,316,http://192.168.1.120/content/cryptocurrency.php,cs2g8i2bgh2nc7i5eiq23jp9lv,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,bots
305370,2020-11-25 21:16:24,POST,/storage/store_sess_total_mousemv_db.php,HTTP/1.1,200,316,http://192.168.1.120/content/cryptocurrency.php,cs2g8i2bgh2nc7i5eiq23jp9lv,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,bots


In [None]:
# Work on an independent deep copy so weblog_p1p2 itself stays untouched.
weblog_p1p2_distinct = weblog_p1p2.copy(deep=True)
weblog_p1p2_distinct

Unnamed: 0,datetime,method,url,protocol,status,byte_size,referrer,Session_ID,user_agent,category
0,2019-10-24 07:45:52,GET,/,HTTP/1.1,200,2712,-,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans
1,2019-10-24 07:45:52,GET,/css/main.css,HTTP/1.1,200,764,https://160.40.52.164/,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans
2,2019-10-24 07:45:52,GET,/js/initialise_vars.js,HTTP/1.1,200,770,https://160.40.52.164/,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans
3,2019-10-24 07:45:52,GET,/js/cookies_functions.js,HTTP/1.1,200,1011,https://160.40.52.164/,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans
4,2019-10-24 07:45:52,GET,/js/mousemove_onclick.js,HTTP/1.1,200,2167,https://160.40.52.164/,97hf7ciplt2k54f5j6109nekn0,Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69....,humans
...,...,...,...,...,...,...,...,...,...,...
305367,2020-11-25 21:16:22,POST,/storage/store_sess_total_mousemv_db.php,HTTP/1.1,200,316,http://192.168.1.120/content/cryptocurrency.php,cs2g8i2bgh2nc7i5eiq23jp9lv,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,bots
305368,2020-11-25 21:16:23,POST,/storage/store_sess_total_mousemv_db.php,HTTP/1.1,200,316,http://192.168.1.120/content/cryptocurrency.php,cs2g8i2bgh2nc7i5eiq23jp9lv,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,bots
305369,2020-11-25 21:16:24,POST,/storage/store_sess_total_mousemv_db.php,HTTP/1.1,200,316,http://192.168.1.120/content/cryptocurrency.php,cs2g8i2bgh2nc7i5eiq23jp9lv,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,bots
305370,2020-11-25 21:16:24,POST,/storage/store_sess_total_mousemv_db.php,HTTP/1.1,200,316,http://192.168.1.120/content/cryptocurrency.php,cs2g8i2bgh2nc7i5eiq23jp9lv,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,bots


In [None]:
# Collapse the web log to one row per Session_ID:
#   status_mode   - most frequent status in the session (smallest value on ties)
#   user_agent    - first non-null user_agent in the session
#   avg_byte_size - mean byte_size over the session's rows
# NOTE(review): .iloc[1:] drops the first grouped row purely by position. groupby
# sorts its keys, so this is the smallest Session_ID - presumably a placeholder
# value; confirm, because a real session is dropped if no such placeholder exists.
weblog_p1p2_distinct2 = (
    weblog_p1p2_distinct
    .groupby('Session_ID')
    .agg(
        status_mode=('status', lambda statuses: statuses.mode()[0]),
        user_agent=('user_agent', 'first'),
        avg_byte_size=('byte_size', 'mean'),
    )
    .reset_index()
    .iloc[1:]
)

weblog_p1p2_distinct2

Unnamed: 0,Session_ID,status_mode,user_agent,avg_byte_size
1,01o7p78e2bnu1814jn5k4uqke4,200,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,843.406250
2,03jt2p4bdru20sjb9me2gco6j4,200,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,443.734671
3,0654rvnjhnr0pvsi3qa3e16avo,200,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,651.146341
4,06ivkemfgn93qhl5j0vu96rnl4,200,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:7...,599.692308
5,071tbv7fsev5d64kb0f9jieor6,200,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,605.326683
...,...,...,...,...
809,vqdvioip730lq32umqa85ikehl,200,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,628.471642
810,vqrt3maidth9lr4df2egocd88g,200,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,947.918919
811,vtcjrbtjq57mnai4banl61pd25,200,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,594.541779
812,vu3fio88psda005g91fbjona0v,200,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,552.520833


In [None]:
# Reload the combined mouse-movement table from its CSV export on Drive.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. Chunked inference is what raises the "mixed types"
# DtypeWarning and can leave a single column holding values of different Python
# types depending on which chunk a row fell into.
# NOTE(review): the saved output of this cell echoes the read_csv line the way a
# Python warning does, which looks like that DtypeWarning - confirm on re-run.
# Reading in one pass uses more peak memory than the chunked default.
df_mouse_p1p2 = pd.read_csv(
    '/content/drive/MyDrive/DSA4263/df_mouse_p1p2.csv',
    low_memory=False,
)

df_mouse_p1p2

  df_mouse_p1p2 = pd.read_csv('/content/drive/MyDrive/DSA4263/df_mouse_p1p2.csv')


Unnamed: 0,session_id,mousemove_times,mousemove_type,mousemove_behaviour,label,mousemove_client_height_width
0,0ht0u328t4mkgi01sp7mm07e01,447717248,m,"(0,5)",moderate_bot,
1,0ht0u328t4mkgi01sp7mm07e01,447717265,m,"(0,6)",moderate_bot,
2,0ht0u328t4mkgi01sp7mm07e01,447717281,m,"(0,7)",moderate_bot,
3,0ht0u328t4mkgi01sp7mm07e01,447717298,m,"(0,8)",moderate_bot,
4,0ht0u328t4mkgi01sp7mm07e01,447717315,m,"(0,9)",moderate_bot,
...,...,...,...,...,...,...
5250702,rh5fn2njcglhb2avkv30id017u,1595505964115,m,"(1082,94)",human,"(639, 1354)"
5250703,rh5fn2njcglhb2avkv30id017u,1595505964132,m,"(1140,54)",human,"(639, 1354)"
5250704,rh5fn2njcglhb2avkv30id017u,1595505964149,m,"(1182,26)",human,"(639, 1354)"
5250705,rh5fn2njcglhb2avkv30id017u,1595505964166,m,"(1214,8)",human,"(639, 1354)"


In [None]:
#df_mouse_p1p2

In [None]:
# Left-join every weblog column onto the mouse rows, matching session_id to Session_ID.
# Both frames hold many rows per session, so each mouse row is repeated once per
# weblog row of the same session; the result can be far larger than either input.
mouse_weblog_merge = pd.merge(
    df_mouse_p1p2,
    weblog_p1p2,
    how='left',
    left_on='session_id',
    right_on='Session_ID',
)
mouse_weblog_merge

In [None]:
# Export is disabled; uncomment to write the merged frame to Drive.
# mouse_weblog_merge.to_csv("/content/drive/MyDrive/DSA4263/mouse_weblog_merge.csv", index=False)