In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [46]:
# Headline counts for a request log
def analyze_requests(df):
    """Print the size of a request log and how its rows split up.

    Three lines are written to stdout, in this order: the number of rows,
    the number of distinct ``Session_ID`` values, and the per-value row
    counts of the ``category`` column.

    Args:
        df: DataFrame with ``Session_ID`` and ``category`` columns.

    Returns:
        None.
    """
    # Each value is computed immediately before it is printed, so a missing
    # column only interrupts the report from that line onward.
    row_total = len(df)
    print("Total Requests:", row_total)

    distinct_sessions = df['Session_ID'].nunique()
    print("Unique Sessions:", distinct_sessions)

    per_category = df['category'].value_counts()
    print("Categories:", per_category)

# Compare humans vs bots
def _mean_request_gap(requests):
    """Return the mean time between consecutive requests of the same session.

    ``requests`` must already be ordered by time within each ``Session_ID``.
    The first request of every session (and any gap touching a NaT timestamp)
    has no predecessor to diff against and is dropped before averaging.
    Yields NaT when there are no gaps at all.
    """
    gaps = requests.groupby('Session_ID')['datetime'].diff().dropna()
    return gaps.mean()


def _referrer_presence(referrers):
    """Return the fraction of rows that carry a real referrer.

    A referrer counts as absent when it is missing (NaN/None), blank after
    stripping whitespace, or the ``'-'`` placeholder. The explicit ``notna``
    mask matters: a bare ``!= '-'`` comparison is True for NaN and would
    count missing values as present.
    """
    stripped = referrers.astype(str).str.strip()
    present = referrers.notna() & ~stripped.isin(['', '-'])
    return present.mean()


def compare_humans_bots(df):
    """Print side-by-side traffic statistics for human and bot requests.

    Rows are split on ``category == 'humans'`` and ``category == 'bots'``;
    for each group the report prints, in order:

    * average number of requests per session,
    * average time between consecutive requests within a session,
    * percentage share of each ``method`` value (rounded to 2 decimals),
    * fraction of requests with a referrer that is not missing, blank or ``'-'``.

    Args:
        df: DataFrame with ``Session_ID``, ``datetime``, ``category``,
            ``method`` and ``referrer`` columns. ``datetime`` may hold strings
            or timestamps; values that cannot be parsed become NaT and are
            ignored by the duration statistic.

    Returns:
        None. Results are written to stdout. The caller's ``df`` is left
        untouched.
    """
    # Work on a derived frame: `assign` returns a copy, so parsing the
    # timestamps does not overwrite the caller's `datetime` column.
    ordered = df.assign(
        datetime=pd.to_datetime(df['datetime'], errors='coerce')
    ).sort_values(by=['Session_ID', 'datetime'])

    human_df = ordered[ordered['category'] == 'humans']
    bot_df = ordered[ordered['category'] == 'bots']

    print("\nAverage Requests per Session:")
    print("Humans:", human_df.groupby('Session_ID').size().mean())
    print("Bots:", bot_df.groupby('Session_ID').size().mean())

    print("\nAverage Duration Between Requests:")
    print("Humans:", _mean_request_gap(human_df))
    print("Bots:", _mean_request_gap(bot_df))

    print("\nRequest Type Distribution:")
    print("Humans:", (human_df['method'].value_counts(normalize=True) * 100).round(2))
    print("Bots:", (bot_df['method'].value_counts(normalize=True) * 100).round(2))

    print("\nReferrer Presence:")
    print("Humans:", _referrer_presence(human_df['referrer']))
    print("Bots:", _referrer_presence(bot_df['referrer']))

In [47]:
# Phase 1: read the interim web log (parsing `datetime` on load), then print
# the overall summary followed by the humans-vs-bots comparison.
file_path_1 = "data/interim/web_log_phase1.csv"
df1 = pd.read_csv(
    filepath_or_buffer=file_path_1,
    parse_dates=["datetime"],
)
analyze_requests(df1)
compare_humans_bots(df1)

Total Requests: 96234
Unique Sessions: 446
Categories: category
humans    57389
bots      38845
Name: count, dtype: int64

Average Requests per Session:
Humans: 217.38257575757575
Bots: 211.1141304347826

Average Duration Between Requests:
Humans: 0 days 00:00:10.945873085
Bots: 0 days 00:00:06.712009518

Request Type Distribution:
Humans: method
POST       85.82
GET        13.44
OPTIONS     0.73
CONNECT     0.01
HEAD        0.00
Name: proportion, dtype: float64
Bots: method
POST       80.89
GET        17.99
OPTIONS     1.11
Name: proportion, dtype: float64

Referrer Presence:
Humans: 0.9859729216400356
Bots: 0.9832668297078131


In [48]:
# Phase 2: read the interim web log (parsing `datetime` on load), then print
# the overall summary followed by the humans-vs-bots comparison.
file_path_2 = "data/interim/web_log_phase2.csv"
df2 = pd.read_csv(
    filepath_or_buffer=file_path_2,
    parse_dates=["datetime"],
)
analyze_requests(df2)
compare_humans_bots(df2)

Total Requests: 209138
Unique Sessions: 369
Categories: category
humans    156786
bots       52352
Name: count, dtype: int64

Average Requests per Session:
Humans: 1224.890625
Bots: 216.3305785123967

Average Duration Between Requests:
Humans: 0 days 00:02:53.917769919
Bots: 0 days 00:00:01.772270197

Request Type Distribution:
Humans: method
POST       95.80
GET         4.06
OPTIONS     0.08
CONNECT     0.06
HEAD        0.00
Name: proportion, dtype: float64
Bots: method
POST       89.29
GET        10.35
OPTIONS     0.36
Name: proportion, dtype: float64

Referrer Presence:
Humans: 0.9678670289439109
Bots: 0.9917481662591687
