# Cheryl YouTube History Analysis

In [2]:
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

# Read the watch-history export (change the path if the file lives elsewhere).
with open("CB_data/watch-history.html", mode="r", encoding="utf-8") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")

# Each history record is assumed to sit in a <div class="content-cell">;
# inspect the HTML and adjust the tag/class if the export is laid out differently.
entries = soup.find_all("div", class_="content-cell")

data = []
for cell in entries:
    # The first link in the cell supplies the title and URL. Cells without a
    # link, or with empty link text, are skipped so they add no extra records.
    link = cell.find("a")
    if link is None:
        continue
    title = link.get_text(strip=True)
    if not title:
        continue

    # Last line of the cell's text, which is where the timestamp is expected.
    # When the text contains no newline this is the entire cell text.
    last_line = cell.get_text().strip().rsplit("\n", 1)[-1].strip()

    # Try the strict timestamp format (change it if the export differs).
    date_time = None
    if last_line:
        try:
            date_time = datetime.strptime(last_line, "%B %d, %Y, %I:%M:%S %p UTC")
        except ValueError:
            # Not in that format: keep the raw text. In the rows displayed
            # below this fallback holds the whole cell text.
            date_time = last_line

    data.append({"title": title, "url": link.get("href"), "date_time": date_time})

# One row per kept cell: title, url, date_time
df = pd.DataFrame(data)

# Preview the first rows
df.head()

Unnamed: 0,title,url,date_time
0,HALF HORSE HALF MAN | OFFICIAL VIDEO,https://www.youtube.com/watch?v=6v_R180kIGs,Watched HALF HORSE HALF MAN | OFFICIAL VIDEOOC...
1,here,https://myaccount.google.com/activitycontrols,Products: YouTubeWhy is this here? This activi...
2,Bluehost's ✨NEW✨ AI Website Builder,https://www.youtube.com/watch?v=u3UEiKiBlgE,Watched Bluehost's ✨NEW✨ AI Website BuilderFeb...
3,here,https://myaccount.google.com/activitycontrols,Products: YouTubeDetails: From Google AdsWhy i...
4,"[Solo-Leveling]SymphonicSuite-Lv.1 → Lv.2"" Mus...",https://www.youtube.com/watch?v=lmajFEi1Hdk,Watched [Solo-Leveling]SymphonicSuite-Lv.1 → L...


In [3]:
import re
from dateutil import parser

def parse_date_time(text):
    """Extract and parse the first timestamp embedded in ``text``.

    The timestamp must look like ``Feb 12, 2025, 11:53:21 PM`` (three-letter
    month, day, year, 12-hour time with AM/PM), optionally followed by an
    upper-case time-zone abbreviation. Any surrounding text is ignored.

    Parameters
    ----------
    text : str or datetime or None
        Raw cell text to search. A value that is already a ``datetime`` is
        returned unchanged, so the function is safe to apply to a column that
        was parsed earlier or to re-run on its own output.

    Returns
    -------
    datetime or None
        The parsed timestamp, or ``None`` when ``text`` is not a string
        (e.g. ``None`` or NaN) or contains no timestamp in the expected form.
    """
    # Already parsed: pass through instead of failing inside re.search
    if isinstance(text, datetime):
        return text
    # None, NaN and other non-string values have nothing to search
    if not isinstance(text, str):
        return None

    pattern = r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},\s+\d{4},\s+\d{1,2}:\d{2}:\d{2}\s+[AP]M(?:\s+[A-Z]+)?'
    match = re.search(pattern, text)
    if match is None:
        return None
    # Hand only the matched fragment to dateutil for the actual parsing
    return parser.parse(match.group(0))

# Swap each raw cell text for the timestamp parse_date_time finds in it;
# rows whose text holds no timestamp end up missing (NaT in the output below).
df = df.assign(date_time=df["date_time"].apply(parse_date_time))
df.head()




Unnamed: 0,title,url,date_time
0,HALF HORSE HALF MAN | OFFICIAL VIDEO,https://www.youtube.com/watch?v=6v_R180kIGs,2025-02-12 23:53:21
1,here,https://myaccount.google.com/activitycontrols,NaT
2,Bluehost's ✨NEW✨ AI Website Builder,https://www.youtube.com/watch?v=u3UEiKiBlgE,2025-02-12 23:53:09
3,here,https://myaccount.google.com/activitycontrols,NaT
4,"[Solo-Leveling]SymphonicSuite-Lv.1 → Lv.2"" Mus...",https://www.youtube.com/watch?v=lmajFEi1Hdk,2025-02-10 20:54:30


In [49]:
# Keep only the rows whose timestamp could be parsed
df = df.dropna(axis="index", how="any", subset=["date_time"])
df.head()

Unnamed: 0,title,url,date_time
0,HALF HORSE HALF MAN | OFFICIAL VIDEO,https://www.youtube.com/watch?v=6v_R180kIGs,2025-02-12 23:53:21
2,Bluehost's ✨NEW✨ AI Website Builder,https://www.youtube.com/watch?v=u3UEiKiBlgE,2025-02-12 23:53:09
4,"[Solo-Leveling]SymphonicSuite-Lv.1 → Lv.2"" Mus...",https://www.youtube.com/watch?v=lmajFEi1Hdk,2025-02-10 20:54:30
6,【MV】『SHADOWBORN』 Hiroyuki SAWANO feat. Benjami...,https://www.youtube.com/watch?v=qUFRPDHs1Q8,2025-02-10 20:51:10
9,DCD-The-2530,https://www.youtube.com/watch?v=dO1MRlcmj60,2025-02-10 20:43:41


In [50]:
# Split the timestamp into a calendar date and an HH:MM:SS string, drop the
# combined column, and label every row of this frame as a "watch" event.
df = (
    df
    .assign(
        date=df["date_time"].dt.date,
        time=df["date_time"].dt.strftime("%H:%M:%S"),
    )
    .drop(columns=["date_time"])
    .assign(type="watch")
)
df.head()

Unnamed: 0,title,url,date,time,type
0,HALF HORSE HALF MAN | OFFICIAL VIDEO,https://www.youtube.com/watch?v=6v_R180kIGs,2025-02-12,23:53:21,watch
2,Bluehost's ✨NEW✨ AI Website Builder,https://www.youtube.com/watch?v=u3UEiKiBlgE,2025-02-12,23:53:09,watch
4,"[Solo-Leveling]SymphonicSuite-Lv.1 → Lv.2"" Mus...",https://www.youtube.com/watch?v=lmajFEi1Hdk,2025-02-10,20:54:30,watch
6,【MV】『SHADOWBORN』 Hiroyuki SAWANO feat. Benjami...,https://www.youtube.com/watch?v=qUFRPDHs1Q8,2025-02-10,20:51:10,watch
9,DCD-The-2530,https://www.youtube.com/watch?v=dO1MRlcmj60,2025-02-10,20:43:41,watch


In [51]:
# Read the search-history export (change the path if the file lives elsewhere).
with open("CB_data/search-history.html", mode="r", encoding="utf-8") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")

# Each history record is assumed to sit in a <div class="content-cell">;
# inspect the HTML and adjust the tag/class if the export is laid out differently.
entries = soup.find_all("div", class_="content-cell")

data = []
for cell in entries:
    # The first link in the cell supplies the title (the link text) and URL.
    # Cells without a link, or with empty link text, are skipped so they add
    # no extra records.
    link = cell.find("a")
    if link is None:
        continue
    title = link.get_text(strip=True)
    if not title:
        continue

    # Last line of the cell's text, which is where the timestamp is expected.
    # When the text contains no newline this is the entire cell text.
    last_line = cell.get_text().strip().rsplit("\n", 1)[-1].strip()

    # Try the strict timestamp format (change it if the export differs).
    date_time = None
    if last_line:
        try:
            date_time = datetime.strptime(last_line, "%B %d, %Y, %I:%M:%S %p UTC")
        except ValueError:
            # Not in that format: keep the raw text. In the rows displayed
            # below this fallback holds the whole cell text, including the
            # leading "Searched for" / "Watched" wording.
            date_time = last_line

    data.append({"title": title, "url": link.get("href"), "date_time": date_time})

# One row per kept cell: title, url, date_time
df_search = pd.DataFrame(data)

# Preview the first rows
df_search.head()

Unnamed: 0,title,url,date_time
0,half horse half man song,https://www.youtube.com/results?search_query=h...,"Searched for half horse half man songFeb 12, 2..."
1,here,https://myaccount.google.com/activitycontrols,Products: YouTubeWhy is this here? This activi...
2,Gymshark - Lift Seamless USP,https://www.youtube.com/watch?v=yJuD5y55kZ0,Watched Gymshark - Lift Seamless USPWatched at...
3,here,https://myaccount.google.com/activitycontrols,Products: YouTubeDetails: From Google AdsWhy i...
4,lululemon | FAWI Seasonal 2024 | 15s | YOGAwCO...,https://www.youtube.com/watch?v=kpb97e0nYmY,Watched lululemon | FAWI Seasonal 2024 | 15s |...


In [52]:
# Tag the rows that record a search. Each cell's raw text opens with its
# activity wording ("Searched for ..." vs "Watched ...", as shown in the raw
# date_time column above), so the match is anchored to the start of the text.
# An unanchored search for the word would also tag watched videos whose title
# merely contains "searched". Rows that do not match keep a missing type.
is_search = df_search["date_time"].str.contains(r"^\s*searched\b", case=False, na=False)
df_search.loc[is_search, "type"] = "search"

# Only now replace the raw text with the parsed timestamp: the tagging above
# needs the original wording.
df_search["date_time"] = df_search["date_time"].apply(parse_date_time)
df_search.head(10)



Unnamed: 0,title,url,date_time,type
0,half horse half man song,https://www.youtube.com/results?search_query=h...,2025-02-12 23:53:16,search
1,here,https://myaccount.google.com/activitycontrols,NaT,
2,Gymshark - Lift Seamless USP,https://www.youtube.com/watch?v=yJuD5y55kZ0,2025-02-10 20:43:09,
3,here,https://myaccount.google.com/activitycontrols,NaT,
4,lululemon | FAWI Seasonal 2024 | 15s | YOGAwCO...,https://www.youtube.com/watch?v=kpb97e0nYmY,2025-02-10 20:38:02,
5,here,https://myaccount.google.com/activitycontrols,NaT,
6,(2/18) Now Taxes is Free in the App - TurboTa...,https://www.youtube.com/watch?v=Ua9ffbSO-20,2025-02-08 00:10:47,
7,here,https://myaccount.google.com/activitycontrols,NaT,
8,boston junior toi,https://www.youtube.com/results?search_query=b...,2025-02-08 00:10:27,search
9,here,https://myaccount.google.com/activitycontrols,NaT,


In [53]:
# Split the timestamp into a calendar date and an HH:MM:SS string, keep only
# rows that have both a parsed timestamp and a type, then drop the combined
# timestamp column.
df_search = (
    df_search
    .assign(
        date=df_search["date_time"].dt.date,
        time=df_search["date_time"].dt.strftime("%H:%M:%S"),
    )
    .dropna(subset=["date_time", "type"])
    .drop(columns=["date_time"])
)
df_search.head()

Unnamed: 0,title,url,type,date,time
0,half horse half man song,https://www.youtube.com/results?search_query=h...,search,2025-02-12,23:53:16
8,boston junior toi,https://www.youtube.com/results?search_query=b...,search,2025-02-08,00:10:27
12,pikmin songs,https://www.youtube.com/results?search_query=p...,search,2025-02-06,01:48:52
16,us figure skating championships 2025 jon marav...,https://www.youtube.com/results?search_query=u...,search,2025-01-22,21:00:21
20,us figure skating championships 2025,https://www.youtube.com/results?search_query=u...,search,2025-01-22,20:55:16


In [54]:
# Stack watch and search events into one frame with a fresh 0..n-1 index,
# then order it newest-first by date and, within a date, by time.
combined_df = (
    pd.concat([df, df_search], ignore_index=True)
    .sort_values(by=["date", "time"], ascending=False)
)
combined_df.tail()

Unnamed: 0,title,url,date,time,type
352,guess the anime opening in 5 seconds - 50 open...,https://www.youtube.com/watch?v=sYuxazpqXaI,2023-05-24,21:00:40,watch
353,How to Get Unlimited Rare Candies in Pokemon P...,https://www.youtube.com/watch?v=JbntyXu48bQ,2023-05-20,14:39:41,watch
354,Foundry Reference Project | Overview,https://www.youtube.com/watch?v=fAX5VXctpCA,2023-03-07,22:52:53,watch
355,I Open 100x Pokémon GO Booster Packs,https://www.youtube.com/watch?v=oayAEsJpPaQ,2022-07-06,21:56:16,watch
409,pokemon go packs,https://www.youtube.com/results?search_query=p...,2022-07-06,21:55:23,search
