# Cleaning Elaine's Chrome Search Data

In [49]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime, timedelta
import json
import matplotlib.pyplot as plt

In [50]:
# Import Chrome history data and turn into dataframe
file_path = "raw_data/EZ_data/history_chrome.json"

with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)  # Load JSON data

history_list = data["Browser History"]  # Extract history data
df = pd.DataFrame(history_list)
df.head(1)

Unnamed: 0,favicon_url,page_transition_qualifier,title,url,time_usec,client_id
0,https://www.google.com/favicon.ico,CLIENT_REDIRECT,Google Takeout,https://takeout.google.com/,1740333261934643,GDY74/DpgCDxMCldA15OWg==


In [51]:
# Convert to Eastern Time (EST/EDT)
df["datetime"] = pd.to_datetime(df["time_usec"], unit="us", utc=True)
df["datetime"] = df["datetime"].dt.tz_convert("US/Eastern")
df["date"] = df["datetime"].dt.date
df["time"] = df["datetime"].dt.strftime("%H:%M:%S")
df.head(3)

Unnamed: 0,favicon_url,page_transition_qualifier,title,url,time_usec,client_id,datetime,date,time
0,https://www.google.com/favicon.ico,CLIENT_REDIRECT,Google Takeout,https://takeout.google.com/,1740333261934643,GDY74/DpgCDxMCldA15OWg==,2025-02-23 12:54:21.934643-05:00,2025-02-23,12:54:21
1,https://www.google.com/favicon.ico,CLIENT_REDIRECT,Google Takeout,https://takeout.google.com/,1740333259460484,GDY74/DpgCDxMCldA15OWg==,2025-02-23 12:54:19.460484-05:00,2025-02-23,12:54:19
2,,CLIENT_REDIRECT,cs6501-4501-analyzing-online-behavior,https://henrykautz.com/Teaching/cs6501-4501-an...,1740333252069420,GDY74/DpgCDxMCldA15OWg==,2025-02-23 12:54:12.069420-05:00,2025-02-23,12:54:12


In [52]:
# Select relevant columns and rename
df = df[["title", "datetime", "date", "time", "url"]]
df = df.rename(columns={"title": "chrome_title", "datetime": "date_time", "url": "chrome_url"})

# Sort by date and time
df = df.sort_values(by=["date_time"]).reset_index(drop=True)
df.head(2)

Unnamed: 0,chrome_title,date_time,date,time,chrome_url
0,demo-music-app/js/script.js at 90a0bf5dd9aa815...,2024-02-24 13:21:35.415002-05:00,2024-02-24,13:21:35,https://github.com/elaine-zhang12/demo-music-a...
1,demo-music-app/css/styles.css at 90a0bf5dd9aa8...,2024-02-24 13:21:37.008110-05:00,2024-02-24,13:21:37,https://github.com/elaine-zhang12/demo-music-a...


In [53]:
# Tally up keywords for school related searches
keywords = ["uva", "canvas", "zzb2rf", "drive"]

tally_data = []
last_time_per_date = {}

for _, row in df.iterrows():
    row_tally = {"date": row["date"], "total_school_keyword_count": 0}
    
    # Count occurrences of each keyword
    for keyword in keywords:
        count = sum(row.astype(str).str.contains(keyword, case=False, na=False))
        row_tally[keyword] = count
        row_tally["total_school_keyword_count"] += count

    if row["date"] not in last_time_per_date:
        last_time_per_date[row["date"]] = row["time"]
    else:
        if row["time"] > last_time_per_date[row["date"]]:
            last_time_per_date[row["date"]] = row["time"]
    
    tally_data.append(row_tally)

tally_df = pd.DataFrame(tally_data)
summary_df = tally_df.groupby("date", as_index=False).sum()
summary_df["last_school_keyword_visit_time"] = summary_df["date"].map(last_time_per_date)
summary_df.head(2)

Unnamed: 0,date,total_school_keyword_count,uva,canvas,zzb2rf,drive,last_school_keyword_visit_time
0,2024-02-24,284,275,9,0,0,23:46:30
1,2024-02-25,88,62,16,0,10,23:25:44


In [54]:
# Double check to see if any dates are missing, if so, add them to my df
summary_df["date"] = pd.to_datetime(summary_df["date"])

full_date_range = pd.date_range(start=summary_df["date"].min(), end=summary_df["date"].max(), freq="D")
missing_dates = full_date_range[~full_date_range.isin(summary_df["date"])]

missing_rows = []
for missing_date in missing_dates:
    missing_row = {"date": missing_date, "total_school_keyword_count": 0, "uva": 0, "canvas": 0, "zzb2rf": 0, "drive": 0}
    missing_rows.append(missing_row)
missing_df = pd.DataFrame(missing_rows)

summary_df = pd.concat([summary_df, missing_df], ignore_index=True)
summary_df = summary_df.sort_values("date").reset_index(drop=True)

summary_df.head(2)

Unnamed: 0,date,total_school_keyword_count,uva,canvas,zzb2rf,drive,last_school_keyword_visit_time
0,2024-02-24,284,275,9,0,0,23:46:30
1,2024-02-25,88,62,16,0,10,23:25:44


In [55]:
# Construct a visit summary
def get_first_and_last_visits(df):
    # Find first and last visited websites per date
    first_visits = df.loc[df.groupby("date")["time"].idxmin()]
    last_visits = df.loc[df.groupby("date")["time"].idxmax()]

    # Select relevant columns
    first_visits = first_visits[["date", "time", "chrome_title"]].rename(columns={
        "time": "first_visited_time",
        "chrome_title": "first_visited_chrome_title"
    })
    
    last_visits = last_visits[["date", "time", "chrome_title"]].rename(columns={
        "time": "last_visited_time",
        "chrome_title": "last_visited_chrome_title"
    })

    # Merge both DataFrames on visit_date to get one row per date
    visit_summary = pd.merge(first_visits, last_visits, on="date")
    return visit_summary

df = get_first_and_last_visits(df)
df.head(2)

Unnamed: 0,date,first_visited_time,first_visited_chrome_title,last_visited_time,last_visited_chrome_title
0,2024-02-24,13:21:35,demo-music-app/js/script.js at 90a0bf5dd9aa815...,23:46:30,Devpost
1,2024-02-25,00:10:59,Emily Chang's (ec5ug) software portfolio | Dev...,23:25:44,Carbon Conscious | Devpost


In [56]:
df["date"] = pd.to_datetime(df["date"])
df = pd.merge(df, summary_df, on="date", how="right")
df.head(2)

Unnamed: 0,date,first_visited_time,first_visited_chrome_title,last_visited_time,last_visited_chrome_title,total_school_keyword_count,uva,canvas,zzb2rf,drive,last_school_keyword_visit_time
0,2024-02-24,13:21:35,demo-music-app/js/script.js at 90a0bf5dd9aa815...,23:46:30,Devpost,284,275,9,0,0,23:46:30
1,2024-02-25,00:10:59,Emily Chang's (ec5ug) software portfolio | Dev...,23:25:44,Carbon Conscious | Devpost,88,62,16,0,10,23:25:44


In [57]:
df.to_csv("cleaned_data/elaine_chrome.csv")