In [130]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import hashlib

In [131]:
# Read the four interim feature tables in one pass; each CSV path maps
# positionally onto the variable of the same name on the left-hand side.
(
    weblog_features,
    mouseclick_frequency,
    mouseclick_entropy,
    mousemovement_features,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/interim/web_log_features.csv",
        "../data/interim/mouseclick_frequency.csv",
        "../data/interim/mouseclick_entropy.csv",
        "../data/interim/mousemovement_features.csv",
    )
)

In [132]:
# Give the web-log table the same key column name as the other tables so that
# all of them can be merged on `session_id`.
weblog_features = weblog_features.rename(columns={'Session_ID': "session_id"})

# Remove the stray 'Unnamed: 0' column (presumably an index written out with
# the CSV -- TODO confirm upstream). errors='ignore' keeps this cell
# re-runnable: the variable is overwritten, so a second run would otherwise
# raise KeyError because the column is already gone.
mousemovement_features = mousemovement_features.drop(columns='Unnamed: 0', errors='ignore')

In [133]:
# Preview the web-log features (key column already renamed to `session_id`).
weblog_features

Unnamed: 0,session_id,time_afternoon,time_evening,time_morning,time_night,time_wee_hours
0,01o7p78e2bnu1814jn5k4uqke4,1,0,0,0,0
1,03jt2p4bdru20sjb9me2gco6j4,0,0,1,0,0
2,0654rvnjhnr0pvsi3qa3e16avo,1,0,0,0,0
3,06ivkemfgn93qhl5j0vu96rnl4,0,0,1,0,0
4,071tbv7fsev5d64kb0f9jieor6,0,1,0,0,0
...,...,...,...,...,...,...
808,vqdvioip730lq32umqa85ikehl,0,0,0,0,1
809,vqrt3maidth9lr4df2egocd88g,1,0,0,0,0
810,vtcjrbtjq57mnai4banl61pd25,1,1,0,0,0
811,vu3fio88psda005g91fbjona0v,0,0,1,0,0


In [134]:
# Preview the mouse-click frequency features.
mouseclick_frequency

Unnamed: 0,session_id,clicks_count,durations,click_frequency
0,03jt2p4bdru20sjb9me2gco6j4,10,1606000878926,6.226647e-12
1,071tbv7fsev5d64kb0f9jieor6,19,677103,2.806072e-05
2,094i85crhkpkhqpi3rl4athrn4,17,1606000279969,1.058530e-11
3,0ht0u328t4mkgi01sp7mm07e01,10,141246,7.079847e-05
4,0i5kvpslrq3vb6u8ff2kuejv0v,16,460995,3.470753e-05
...,...,...,...,...
444,vmgkfqtair7fq7kqqr6kpqm9st,19,1606000127425,1.183063e-11
445,vofhhjsbkpj6kc9it95e76nu1a,376,1606020467164,2.341191e-10
446,vopb1c4o3o2dpsov8jinbbou5h,14,544627,2.570567e-05
447,vtcjrbtjq57mnai4banl61pd25,19,632030,3.006186e-05


In [135]:
# Preview the mouse-click entropy features.
mouseclick_entropy

Unnamed: 0,session_id,entropy
0,03jt2p4bdru20sjb9me2gco6j4,3.121928
1,071tbv7fsev5d64kb0f9jieor6,3.221097
2,094i85crhkpkhqpi3rl4athrn4,2.895424
3,0ht0u328t4mkgi01sp7mm07e01,2.446439
4,0i5kvpslrq3vb6u8ff2kuejv0v,3.452820
...,...,...
444,vmgkfqtair7fq7kqqr6kpqm9st,2.694781
445,vofhhjsbkpj6kc9it95e76nu1a,4.626272
446,vopb1c4o3o2dpsov8jinbbou5h,3.521641
447,vtcjrbtjq57mnai4banl61pd25,3.110578


In [136]:
# Preview the mouse-movement features; this table also carries the `label` column.
mousemovement_features

Unnamed: 0,session_id,speed_cv,hover_frequency,avg_hover_duration,acceleration_mean,acceleration_std,acceleration_skewness,acceleration_pos_neg_ratio,direction_change_frequency,idle_active_ratio,label
0,03jt2p4bdru20sjb9me2gco6j4,1.292750,3.265045,10044.058824,-97.987654,4076.554706,0.137302,0.835740,0.550618,4.248230e+07,human
1,071tbv7fsev5d64kb0f9jieor6,0.509696,0.965173,1951.275000,-3.079521,1211.355072,-0.234733,1.013543,0.235069,2.274826e+00,advanced_bot
2,094i85crhkpkhqpi3rl4athrn4,0.517826,0.218432,966.923077,-94.009643,2813.920800,-0.017841,0.936106,0.246734,7.828571e+06,advanced_bot
3,0ht0u328t4mkgi01sp7mm07e01,0.232604,3.927866,866.327044,-2.978588,616.098440,0.029983,0.984024,33.136932,1.028202e+00,moderate_bot
4,0i5kvpslrq3vb6u8ff2kuejv0v,0.508165,0.895245,2000.883117,-0.205002,1173.942929,-0.054982,1.024497,0.245092,2.184613e+00,advanced_bot
...,...,...,...,...,...,...,...,...,...,...,...
444,vmgkfqtair7fq7kqqr6kpqm9st,0.305750,3.814194,411.760714,-22.634376,608.311077,0.822027,0.927207,13.736547,1.239705e+07,moderate_bot
445,vofhhjsbkpj6kc9it95e76nu1a,1.410842,2.760047,2101.642398,-227.235192,3651.905852,0.074954,0.812197,0.914448,3.544257e+06,human
446,vopb1c4o3o2dpsov8jinbbou5h,1.315861,0.447232,946.047170,-169.103916,4466.405098,0.044590,0.905222,0.022857,2.783891e-01,human
447,vtcjrbtjq57mnai4banl61pd25,0.498910,0.329584,2523.695652,1.611909,1139.847492,-0.085814,1.009751,0.236819,1.687842e+00,advanced_bot


In [137]:
# Build one table keyed on `session_id`:
#   * the three mouse-derived tables are outer-joined, so a session present
#     in any of them is kept;
#   * the web-log features are then attached with a left join, so sessions
#     that appear only in the web log are not added.
combined_df = (
    mouseclick_entropy
    .merge(mouseclick_frequency, on='session_id', how='outer')
    .merge(mousemovement_features, on='session_id', how='outer')
    .merge(weblog_features, on='session_id', how='left')
)
# combined = weblog_features.merge(mouseclick_frequency, on = 'session_id', how = 'outer').merge(mouseclick_entropy, on = 'session_id', how = 'outer') 

In [138]:
# Collapse the three-way `label` into a binary target (`human` vs `bot`).
#
# An explicit mapping replaces the previous catch-all `else 'human'`: the
# outer merges that build combined_df can leave `label` as NaN for sessions
# missing from the mouse-movement table, and such rows (or any unexpected
# label value) must not be silently counted as human.
label_to_binary = {
    'human': 'human',
    'moderate_bot': 'bot',
    'advanced_bot': 'bot',
}

unknown_labels = ~combined_df['label'].isin(list(label_to_binary))
if unknown_labels.any():
    raise ValueError(
        f"{int(unknown_labels.sum())} row(s) have a missing or unexpected label: "
        f"{combined_df.loc[unknown_labels, 'label'].unique().tolist()}"
    )

combined_df['new_label'] = combined_df['label'].map(label_to_binary)

In [139]:
# Preview the merged table, now including the binary `new_label` column.
combined_df

Unnamed: 0,session_id,entropy,clicks_count,durations,click_frequency,speed_cv,hover_frequency,avg_hover_duration,acceleration_mean,acceleration_std,...,acceleration_pos_neg_ratio,direction_change_frequency,idle_active_ratio,label,time_afternoon,time_evening,time_morning,time_night,time_wee_hours,new_label
0,03jt2p4bdru20sjb9me2gco6j4,3.121928,10,1606000878926,6.226647e-12,1.292750,3.265045,10044.058824,-97.987654,4076.554706,...,0.835740,0.550618,4.248230e+07,human,0,0,1,0,0,human
1,071tbv7fsev5d64kb0f9jieor6,3.221097,19,677103,2.806072e-05,0.509696,0.965173,1951.275000,-3.079521,1211.355072,...,1.013543,0.235069,2.274826e+00,advanced_bot,0,1,0,0,0,bot
2,094i85crhkpkhqpi3rl4athrn4,2.895424,17,1606000279969,1.058530e-11,0.517826,0.218432,966.923077,-94.009643,2813.920800,...,0.936106,0.246734,7.828571e+06,advanced_bot,0,0,0,1,0,bot
3,0ht0u328t4mkgi01sp7mm07e01,2.446439,10,141246,7.079847e-05,0.232604,3.927866,866.327044,-2.978588,616.098440,...,0.984024,33.136932,1.028202e+00,moderate_bot,1,0,0,0,0,bot
4,0i5kvpslrq3vb6u8ff2kuejv0v,3.452820,16,460995,3.470753e-05,0.508165,0.895245,2000.883117,-0.205002,1173.942929,...,1.024497,0.245092,2.184613e+00,advanced_bot,0,1,0,0,0,bot
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,vmgkfqtair7fq7kqqr6kpqm9st,2.694781,19,1606000127425,1.183063e-11,0.305750,3.814194,411.760714,-22.634376,608.311077,...,0.927207,13.736547,1.239705e+07,moderate_bot,0,0,1,0,0,bot
445,vofhhjsbkpj6kc9it95e76nu1a,4.626272,376,1606020467164,2.341191e-10,1.410842,2.760047,2101.642398,-227.235192,3651.905852,...,0.812197,0.914448,3.544257e+06,human,1,1,0,0,0,human
446,vopb1c4o3o2dpsov8jinbbou5h,3.521641,14,544627,2.570567e-05,1.315861,0.447232,946.047170,-169.103916,4466.405098,...,0.905222,0.022857,2.783891e-01,human,0,0,1,0,0,human
447,vtcjrbtjq57mnai4banl61pd25,3.110578,19,632030,3.006186e-05,0.498910,0.329584,2523.695652,1.611909,1139.847492,...,1.009751,0.236819,1.687842e+00,advanced_bot,1,1,0,0,0,bot


In [140]:
# Persist the merged table (features plus both `label` and `new_label`) for
# later use; index=False keeps the row index out of the file.
combined_df.to_csv('../data/processed/data.csv', index=False)

Get target column

In [141]:
# weblog_p1 = pd.read_csv('../data/interim/web_log_phase1.csv')
# weblog_p2 = pd.read_csv('../data/interim/web_log_phase2.csv')

In [142]:
# weblog_all = pd.concat([weblog_p1, weblog_p2])

In [143]:
# weblog_all = weblog_all.rename(columns = {"Session_ID": "session_id"})

In [144]:
# target = weblog_all[['session_id', 'category']].drop_duplicates()

In [145]:
# combined_df = combined.merge(target, on = 'session_id', how = 'left')

In [146]:
# combined_df.fillna(0, inplace = True)

In [147]:
# combined_df

Try creating a hashing function to hash URLs, then build a lookup table that maps each URL to its hash, since KNN only works with numerical values.

In [148]:

# def hash_url(url):
#     # Use SHA-256 to generate a hash of the URL
#     hash_object = hashlib.sha256(url.encode('utf-8'))
#     # Convert the hash to a hexadecimal string
#     hex_hash = hash_object.hexdigest()
#     # Take the first 'length' characters of the hex string and convert to integer
#     short_hash = int(hex_hash[:10], 16)
#     return short_hash

# url_lookup = combined_df[['referrer']]
# url_lookup['hashed_url'] = url_lookup['referrer'].apply(hash_url)

In [149]:
# url_lookup

In [150]:
# combined_df['hashed_url'] = combined_df['referrer'].apply(hash_url)

In [151]:
# Re-display the merged table before modelling. The cells between the save
# step and this one are commented out, so the frame has not changed.
combined_df

Unnamed: 0,session_id,entropy,clicks_count,durations,click_frequency,speed_cv,hover_frequency,avg_hover_duration,acceleration_mean,acceleration_std,...,acceleration_pos_neg_ratio,direction_change_frequency,idle_active_ratio,label,time_afternoon,time_evening,time_morning,time_night,time_wee_hours,new_label
0,03jt2p4bdru20sjb9me2gco6j4,3.121928,10,1606000878926,6.226647e-12,1.292750,3.265045,10044.058824,-97.987654,4076.554706,...,0.835740,0.550618,4.248230e+07,human,0,0,1,0,0,human
1,071tbv7fsev5d64kb0f9jieor6,3.221097,19,677103,2.806072e-05,0.509696,0.965173,1951.275000,-3.079521,1211.355072,...,1.013543,0.235069,2.274826e+00,advanced_bot,0,1,0,0,0,bot
2,094i85crhkpkhqpi3rl4athrn4,2.895424,17,1606000279969,1.058530e-11,0.517826,0.218432,966.923077,-94.009643,2813.920800,...,0.936106,0.246734,7.828571e+06,advanced_bot,0,0,0,1,0,bot
3,0ht0u328t4mkgi01sp7mm07e01,2.446439,10,141246,7.079847e-05,0.232604,3.927866,866.327044,-2.978588,616.098440,...,0.984024,33.136932,1.028202e+00,moderate_bot,1,0,0,0,0,bot
4,0i5kvpslrq3vb6u8ff2kuejv0v,3.452820,16,460995,3.470753e-05,0.508165,0.895245,2000.883117,-0.205002,1173.942929,...,1.024497,0.245092,2.184613e+00,advanced_bot,0,1,0,0,0,bot
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,vmgkfqtair7fq7kqqr6kpqm9st,2.694781,19,1606000127425,1.183063e-11,0.305750,3.814194,411.760714,-22.634376,608.311077,...,0.927207,13.736547,1.239705e+07,moderate_bot,0,0,1,0,0,bot
445,vofhhjsbkpj6kc9it95e76nu1a,4.626272,376,1606020467164,2.341191e-10,1.410842,2.760047,2101.642398,-227.235192,3651.905852,...,0.812197,0.914448,3.544257e+06,human,1,1,0,0,0,human
446,vopb1c4o3o2dpsov8jinbbou5h,3.521641,14,544627,2.570567e-05,1.315861,0.447232,946.047170,-169.103916,4466.405098,...,0.905222,0.022857,2.783891e-01,human,0,0,1,0,0,human
447,vtcjrbtjq57mnai4banl61pd25,3.110578,19,632030,3.006186e-05,0.498910,0.329584,2523.695652,1.611909,1139.847492,...,1.009751,0.236819,1.687842e+00,advanced_bot,1,1,0,0,0,bot


In [152]:
# Feature matrix: every column except the session identifier, the two label
# columns, and 'url' / 'referrer' (excluded whenever they are present).
X = combined_df.loc[
    :,
    ~combined_df.columns.isin(['label', 'new_label', 'session_id', 'url', 'referrer']),
]

# Target: the binary label, kept as a single-column DataFrame.
y = combined_df[['new_label']]

In [153]:
# Split dataset into training and testing sets (80% train, 20% test).
# NOTE(review): the classes are imbalanced in this data; consider passing
# stratify=y so both splits keep the same human/bot ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features (important for KNN, which is distance-based).
# The scaler is fit on the training split only and then applied to the test
# split. Both names are rebound to the scaled NumPy arrays.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize KNN classifier with k=3
knn = KNeighborsClassifier(n_neighbors=3)

# Train the model. y_train may be a single-column DataFrame; flattening it to
# 1-D avoids scikit-learn's DataConversionWarning ("A column-vector y was
# passed when a 1d array was expected") without changing the fitted model.
knn.fit(X_train, np.ravel(y_train))

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate accuracy on the single hold-out split.
# NOTE(review): a perfect score deserves a leakage check -- e.g. `durations`
# mixes epoch-like values (~1.6e12) with small ones in the previews above.
# TODO confirm the upstream feature computation.
accuracy = accuracy_score(y_test, y_pred)
print(f'KNN Accuracy: {accuracy:.2f}')

KNN Accuracy: 1.00


  return self._fit(X, y)


In [154]:
# Class balance of the binary target: humans first, then bots.
print(int((combined_df['new_label'] == 'human').sum()))
print(int((combined_df['new_label'] == 'bot').sum()))

109
340


In [155]:
# Class balance of the original three-way label, one count per line in the
# order: human, moderate_bot, advanced_bot.
print(
    *(
        int((combined_df['label'] == label_name).sum())
        for label_name in ('human', 'moderate_bot', 'advanced_bot')
    ),
    sep='\n',
)

109
170
170


In [156]:
# Scaling turned X_test into a plain array, so wrap it in a DataFrame again
# using the feature names from X. The result has a fresh 0..n-1 index.
X_test_df = pd.DataFrame(X_test, columns=X.columns)

# Side-by-side view of the scaled test features, the KNN prediction and the
# true label. assign() returns a new frame, so X_test_df itself is untouched.
knn_combined_df = X_test_df.assign(
    **{
        'KNN Prediction': y_pred,
        'True Label': y_test.to_numpy().ravel(),
    }
)

# Display first few rows
knn_combined_df.head()

Unnamed: 0,entropy,clicks_count,durations,click_frequency,speed_cv,hover_frequency,avg_hover_duration,acceleration_mean,acceleration_std,acceleration_skewness,acceleration_pos_neg_ratio,direction_change_frequency,idle_active_ratio,time_afternoon,time_evening,time_morning,time_night,time_wee_hours,KNN Prediction,True Label
0,-0.655538,-0.156923,-1.429112,1.397659,-0.761518,0.924815,-0.005883,0.902351,-1.024863,0.12338,1.224996,0.744609,-0.857226,1.351581,-0.456896,-0.744387,-0.420772,-0.118846,bot,bot
1,0.698889,-0.266259,-1.429112,0.872302,1.452376,-0.934238,-0.367982,0.069175,1.637779,-0.184886,-0.657948,-0.656709,-0.857226,-0.739874,-0.456896,-0.744387,2.376583,-0.118846,human,human
2,-1.38536,-0.320927,0.700444,-0.46504,-0.688507,1.051944,-0.49403,0.702442,-1.060693,2.558336,0.417644,0.460473,1.634258,-0.739874,-0.456896,1.343387,-0.420772,-0.118846,bot,bot
3,-0.54161,-0.238925,-1.429111,0.338231,-0.303115,-0.522901,0.048134,0.786227,-0.666761,-1.411023,0.582453,-0.638967,-0.857226,1.351581,-0.456896,-0.744387,-0.420772,-0.118846,bot,bot
4,-1.716048,-0.320927,0.700444,-0.46504,-1.063119,1.10762,-0.495586,0.820646,-1.173348,1.876954,0.236468,-0.109728,1.608131,-0.739874,-0.456896,1.343387,-0.420772,-0.118846,bot,bot
