# Cleaning Cheryl's Chrome Search Data

In [72]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import pytz

In [73]:
# Import Chrome history data from personal account and turn into dataframe
personal_df = pd.read_json("raw_data/CB_data/history_personal.json")
personal_df.head(2)

Unnamed: 0,id,isLocal,referringVisitId,transition,visitId,visitTime,title,lastVisitTime,typedCount,url,visitCount
0,16741,True,0,link,46198,1739468000000.0,Export Chrome History - Chrome Web Store,1739468000000.0,0,https://chromewebstore.google.com/detail/expor...,2
1,16741,True,46196,link,46197,1739468000000.0,Export Chrome History - Chrome Web Store,1739468000000.0,0,https://chromewebstore.google.com/detail/expor...,2


In [74]:
# Convert the visit time to a datetime
personal_df["visitTime"] = personal_df["visitTime"] / 1000
personal_df["visitTime"] = pd.to_datetime(personal_df["visitTime"], unit="s", utc=True).dt.tz_convert(pytz.timezone('US/Eastern')).dt.tz_localize(None)
personal_df.head(2)

Unnamed: 0,id,isLocal,referringVisitId,transition,visitId,visitTime,title,lastVisitTime,typedCount,url,visitCount
0,16741,True,0,link,46198,2025-02-13 12:35:22.919519901,Export Chrome History - Chrome Web Store,1739468000000.0,0,https://chromewebstore.google.com/detail/expor...,2
1,16741,True,46196,link,46197,2025-02-13 12:35:22.265860081,Export Chrome History - Chrome Web Store,1739468000000.0,0,https://chromewebstore.google.com/detail/expor...,2


In [75]:
# Import Chrome history data from UVA account and turn into dataframe
uva_df = pd.read_csv("raw_data/CB_data/visits_uva.csv")
uva_df.head(2)

Unnamed: 0,id,url,visit_time,from_visit,transition,segment_id,visit_duration,incremented_omnibox_typed_score,opener_visit,originator_cache_guid,originator_visit_id,originator_from_visit,originator_opener_visit,is_known_to_sync,consider_for_ntp_most_visited,external_referrer_url,visited_link_id,app_id,visit_time_formatted
0,424475,117331,13376199127105962,0,838860805,0,1848975,0,0,,0,0,0,0,1,,0,,2024-11-16 02:52:07
1,424476,117332,13376199128951155,424475,268435456,0,0,0,0,,0,0,0,0,1,,31787,,2024-11-16 02:52:08


In [76]:
# Get urls from chrome search history data and merge into dataframe
urls_uva = pd.read_csv("raw_data/CB_data/urls_uva.csv")
uva_df = pd.merge(uva_df, urls_uva, left_on="url", right_on="id", how="inner")
uva_df.head(2)

Unnamed: 0,id_x,url_x,visit_time,from_visit,transition,segment_id,visit_duration,incremented_omnibox_typed_score,opener_visit,originator_cache_guid,...,visited_link_id,app_id,visit_time_formatted,id_y,url_y,title,visit_count,typed_count,last_visit_time,hidden
0,424475,117331,13376199127105962,0,838860805,0,1848975,0,0,,...,0,,2024-11-16 02:52:07,117331,https://www.google.com/search?q=lululemon+like...,lululemon like new - Google Search,1,0,13376199127105962,0
1,424476,117332,13376199128951155,424475,268435456,0,0,0,0,,...,31787,,2024-11-16 02:52:08,117332,https://www.google.com/aclk?sa=l&ai=DChcSEwilr...,lululemon Like New | Gently used gear for resale,1,0,13376199128951155,0


In [77]:
# Select relevant columns, rename, and combine
personal_df = personal_df[["title", "visitTime"]]
uva_df = uva_df[["title", "visit_time_formatted"]]
personal_df = personal_df.rename(columns={"title": "chrome_title", "visitTime": "date_time"})
uva_df = uva_df.rename(columns={"title": "chrome_title", "visit_time_formatted": "date_time"})
df = pd.concat([personal_df, uva_df])

# # Split the date and time
df["date_time"] = pd.to_datetime(df["date_time"])

# Extract the date and time into separate columns
df["date"] = df["date_time"].dt.date
df["time"] = df["date_time"].dt.strftime("%H:%M:%S")

df.head(2)

Unnamed: 0,chrome_title,date_time,date,time
0,Export Chrome History - Chrome Web Store,2025-02-13 12:35:22.919519901,2025-02-13,12:35:22
1,Export Chrome History - Chrome Web Store,2025-02-13 12:35:22.265860081,2025-02-13,12:35:22


In [78]:
# Sort by date and time
df = df.sort_values(by=["date_time"]).reset_index(drop=True)
df.head(2)

Unnamed: 0,chrome_title,date_time,date,time
0,lululemon like new - Google Search,2024-11-16 02:52:07,2024-11-16,02:52:07
1,lululemon Like New | Gently used gear for resale,2024-11-16 02:52:08,2024-11-16,02:52:08


In [79]:
# Tally up keywords for school related searches
keywords = ["uva", "canvas", "eqk9vb", "drive"]

tally_data = []
last_time_per_date = {}

for _, row in df.iterrows():
    row_tally = {"date": row["date"], "total_school_keyword_count": 0}
    
    # Count occurrences of each keyword
    for keyword in keywords:
        count = sum(row.astype(str).str.contains(keyword, case=False, na=False))
        row_tally[keyword] = count
        row_tally["total_school_keyword_count"] += count

    if row["date"] not in last_time_per_date:
        last_time_per_date[row["date"]] = row["time"]
    else:
        if row["time"] > last_time_per_date[row["date"]]:
            last_time_per_date[row["date"]] = row["time"]
    
    tally_data.append(row_tally)

tally_df = pd.DataFrame(tally_data)
summary_df = tally_df.groupby("date", as_index=False).sum()
summary_df["last_school_keyword_visit_time"] = summary_df["date"].map(last_time_per_date)
summary_df.head(2)

Unnamed: 0,date,total_school_keyword_count,uva,canvas,eqk9vb,drive,last_school_keyword_visit_time
0,2024-11-16,0,0,0,0,0,06:24:21
1,2024-11-17,71,57,0,6,8,23:18:40


In [80]:
# Double check to see if any dates are missing, if so, add them to my df
summary_df["date"] = pd.to_datetime(summary_df["date"])

full_date_range = pd.date_range(start=summary_df["date"].min(), end=summary_df["date"].max(), freq="D")
missing_dates = full_date_range[~full_date_range.isin(summary_df["date"])]

missing_rows = []
for missing_date in missing_dates:
    missing_row = {"date": missing_date, "total_school_keyword_count": 0, "uva": 0, "canvas": 0, "eqk9vb": 0, "drive": 0}
    missing_rows.append(missing_row)
missing_df = pd.DataFrame(missing_rows)

summary_df = pd.concat([summary_df, missing_df], ignore_index=True)
summary_df = summary_df.sort_values("date").reset_index(drop=True)

summary_df.head(2)

Unnamed: 0,date,total_school_keyword_count,uva,canvas,eqk9vb,drive,last_school_keyword_visit_time
0,2024-11-16,0,0,0,0,0,06:24:21
1,2024-11-17,71,57,0,6,8,23:18:40


In [81]:
# Construct a visit summary
def get_first_and_last_visits(df):
    # Find first and last visited websites per date
    first_visits = df.loc[df.groupby("date")["time"].idxmin()]
    last_visits = df.loc[df.groupby("date")["time"].idxmax()]

    # Select relevant columns
    first_visits = first_visits[["date", "time", "chrome_title"]].rename(columns={
        "time": "first_visited_time",
        "chrome_title": "first_visited_chrome_title"
    })
    
    last_visits = last_visits[["date", "time", "chrome_title"]].rename(columns={
        "time": "last_visited_time",
        "chrome_title": "last_visited_chrome_title"
    })

    # Merge both DataFrames on visit_date to get one row per date
    visit_summary = pd.merge(first_visits, last_visits, on="date")
    return visit_summary

df = get_first_and_last_visits(df)
df.head(2)

Unnamed: 0,date,first_visited_time,first_visited_chrome_title,last_visited_time,last_visited_chrome_title
0,2024-11-16,02:52:07,lululemon like new - Google Search,06:24:21,research summary - Google Docs
1,2024-11-17,19:00:13,gmail - Google Search,23:18:40,mse loss func - Google Search


In [82]:
df["date"] = pd.to_datetime(df["date"])
df = pd.merge(df, summary_df, on="date", how="right")
df.head(2)

Unnamed: 0,date,first_visited_time,first_visited_chrome_title,last_visited_time,last_visited_chrome_title,total_school_keyword_count,uva,canvas,eqk9vb,drive,last_school_keyword_visit_time
0,2024-11-16,02:52:07,lululemon like new - Google Search,06:24:21,research summary - Google Docs,0,0,0,0,0,06:24:21
1,2024-11-17,19:00:13,gmail - Google Search,23:18:40,mse loss func - Google Search,71,57,0,6,8,23:18:40


In [83]:
# Export data for later
df.to_csv("cleaned_data/cheryl_chrome.csv")