# Feature Engineering of Weblog data
1. Count of log entries per day of the week on which the session occurred
2. Count of log entries per time of day ('awake': 6am-11pm, 'sleep': 12am-5am)
3. Most frequent site/URL
4. Most frequent referrer
5. Percentage of each type of method used
6. Percentage of each type of status
7. Average byte size

In [5]:
import pandas as pd
import zipfile
import datetime as dt

In [None]:
# Where the zipped weblog export lives and where its contents are unpacked.
# `extract_folder` keeps its trailing slash because the CSV path is later
# built from it by plain string concatenation.
extract_folder = r'../data/interim/weblog/'
zip_path = r'../data/interim/weblog_p1p2.zip'

# Extract every member of the archive into the target directory.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_folder)

In [23]:
# Load the extracted weblog CSV, parsing the "datetime" column as timestamps
# so the `.dt` accessor can be used on it.
weblog_csv = extract_folder + 'weblog_p1p2.csv'
weblog_df = pd.read_csv(weblog_csv, parse_dates=["datetime"])

In [24]:
# Derive calendar features from each row's timestamp.
timestamps = weblog_df["datetime"]
weblog_df["day_of_week"] = timestamps.dt.day_name()
weblog_df["hour"] = timestamps.dt.hour

# Hours 6-23 (inclusive on both ends) are labelled "awake"; every other
# value, including a missing hour, is labelled "sleep".
is_awake = weblog_df["hour"].between(6, 23)
weblog_df["time_category"] = is_awake.map({True: "awake", False: "sleep"})

In [33]:
# Per-session frequency tables: one row per Session_ID, one column per
# distinct value, each cell holding the number of log rows with that value.
# NOTE(review): these are raw counts (no `normalize=` argument), whereas the
# feature list at the top of the notebook describes method/status as
# percentages -- confirm which is intended.
session_ids = weblog_df["Session_ID"]

# Counts of each `method` and each `status` value per session
method_pivot = pd.crosstab(index=session_ids, columns=weblog_df["method"]).reset_index()
status_pivot = pd.crosstab(index=session_ids, columns=weblog_df["status"]).reset_index()

# Counts of each `day_of_week` and each `time_category` value per session
days_pivot = pd.crosstab(index=session_ids, columns=weblog_df["day_of_week"]).reset_index()
time_pivot = pd.crosstab(index=session_ids, columns=weblog_df["time_category"]).reset_index()

# Side-by-side view of the four tables, aligned on their positional index
# (each table contributes its own Session_ID column).
test = pd.concat(
    objs=[method_pivot, status_pivot, days_pivot, time_pivot],
    axis="columns",
)

In [31]:
def _most_frequent(values: pd.Series):
    """Return the most frequent non-null value in *values*, or NaN if none.

    ``Series.mode()`` drops nulls by default, so a group whose values are all
    missing yields an empty result; indexing that with ``[0]`` would raise.
    When several values tie, the first entry of ``mode()`` is returned, which
    matches the previous ``x.mode()[0]`` behaviour.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else float("nan")


# Aggregate to one row per (Session_ID, category) pair.
# NOTE: a session that appears under more than one category produces several
# rows here, and each of them receives the same per-session pivot counts in
# the merges below (the pivots are keyed by Session_ID only).
session_grouped = (
    weblog_df.groupby(['Session_ID', 'category'])
    .agg(
        url=('url', _most_frequent),  # Most frequent URL per group
        referrer=('referrer', _most_frequent),  # Most frequent referrer per group
        byte_size=('byte_size', 'mean'),  # Average byte size per group
    )
    .reset_index()
)

# Attach the per-session count tables. Left joins keep every aggregated row;
# the final fillna(0) replaces every remaining missing value with 0, both
# counts for sessions absent from a pivot table and any missing
# url/referrer/byte_size aggregate.
final_df = (
    session_grouped
    .merge(method_pivot, on="Session_ID", how="left")
    .merge(status_pivot, on="Session_ID", how="left")
    .merge(days_pivot, on="Session_ID", how="left")
    .merge(time_pivot, on="Session_ID", how="left")
    .fillna(0)
)