# Cheryl Youtube History Analysis   

In [6]:
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

# Load the HTML file (adjust the file name/path as needed)
with open("CB_data/watch-history.html", "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file, "html.parser")

# Inspect your HTML file to identify the right tags/classes.
# Here, we assume each history entry is within a div with class 'content-cell'
entries = soup.find_all("div", class_="content-cell")

data = []
for entry in entries:
    # Extract the title (YouTube video link text)
    title_tag = entry.find("a")
    title = title_tag.get_text(strip=True) if title_tag else None

    # Extract the URL (optional)
    url = title_tag["href"] if title_tag and "href" in title_tag.attrs else None

    # Extract the date/time
    time_str = entry.text.strip().split("\n")[-1]  # Extract last line, usually the date
    time_str = time_str.strip() if time_str else None

    # Try to parse the date (modify format if needed)
    date_time = None
    if time_str:
        try:
            date_time = datetime.strptime(time_str, "%B %d, %Y, %I:%M:%S %p UTC")
        except ValueError:
            date_time = time_str  # Keep original string if parsing fails

    # Only append if there's a valid title (to avoid extra records)
    if title:
        data.append({"title": title, "url": url, "date_time": date_time})

# Convert to DataFrame
df = pd.DataFrame(data)

# Display the first few rows
df.head()

Unnamed: 0,title,url,date_time
0,HALF HORSE HALF MAN | OFFICIAL VIDEO,https://www.youtube.com/watch?v=6v_R180kIGs,Watched HALF HORSE HALF MAN | OFFICIAL VIDEOOC...
1,here,https://myaccount.google.com/activitycontrols,Products: YouTubeWhy is this here? This activi...
2,Bluehost's ✨NEW✨ AI Website Builder,https://www.youtube.com/watch?v=u3UEiKiBlgE,Watched Bluehost's ✨NEW✨ AI Website BuilderFeb...
3,here,https://myaccount.google.com/activitycontrols,Products: YouTubeDetails: From Google AdsWhy i...
4,"[Solo-Leveling]SymphonicSuite-Lv.1 → Lv.2"" Mus...",https://www.youtube.com/watch?v=lmajFEi1Hdk,Watched [Solo-Leveling]SymphonicSuite-Lv.1 → L...


In [7]:
import re
from dateutil import parser

def parse_date_time(text):
    pattern = r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},\s+\d{4},\s+\d{1,2}:\d{2}:\d{2}\s+[AP]M(?:\s+[A-Z]+)?'
    match = re.search(pattern, text)
    if match:
        date_str = match.group(0)
        # print("Extracted date string:", date_str)
        dt = parser.parse(date_str)
        return dt
    else:
        # print("No match found in:", text)
        return None

df["date_time"] = df["date_time"].apply(parse_date_time)
df.head()




Unnamed: 0,title,url,date_time
0,HALF HORSE HALF MAN | OFFICIAL VIDEO,https://www.youtube.com/watch?v=6v_R180kIGs,2025-02-12 23:53:21
1,here,https://myaccount.google.com/activitycontrols,NaT
2,Bluehost's ✨NEW✨ AI Website Builder,https://www.youtube.com/watch?v=u3UEiKiBlgE,2025-02-12 23:53:09
3,here,https://myaccount.google.com/activitycontrols,NaT
4,"[Solo-Leveling]SymphonicSuite-Lv.1 → Lv.2"" Mus...",https://www.youtube.com/watch?v=lmajFEi1Hdk,2025-02-10 20:54:30


In [8]:
# remove rows without a date_time
df = df.dropna(subset=["date_time"])
df.head()

Unnamed: 0,title,url,date_time
0,HALF HORSE HALF MAN | OFFICIAL VIDEO,https://www.youtube.com/watch?v=6v_R180kIGs,2025-02-12 23:53:21
2,Bluehost's ✨NEW✨ AI Website Builder,https://www.youtube.com/watch?v=u3UEiKiBlgE,2025-02-12 23:53:09
4,"[Solo-Leveling]SymphonicSuite-Lv.1 → Lv.2"" Mus...",https://www.youtube.com/watch?v=lmajFEi1Hdk,2025-02-10 20:54:30
6,【MV】『SHADOWBORN』 Hiroyuki SAWANO feat. Benjami...,https://www.youtube.com/watch?v=qUFRPDHs1Q8,2025-02-10 20:51:10
9,DCD-The-2530,https://www.youtube.com/watch?v=dO1MRlcmj60,2025-02-10 20:43:41


In [9]:
# separate date and time and reformat
df["date"] = df["date_time"].dt.date
df["time"] = df["date_time"].dt.strftime("%H:%M:%S")
df.drop("date_time", axis=1, inplace=True)
df.head()

Unnamed: 0,title,url,date,time
0,HALF HORSE HALF MAN | OFFICIAL VIDEO,https://www.youtube.com/watch?v=6v_R180kIGs,2025-02-12,23:53:21
2,Bluehost's ✨NEW✨ AI Website Builder,https://www.youtube.com/watch?v=u3UEiKiBlgE,2025-02-12,23:53:09
4,"[Solo-Leveling]SymphonicSuite-Lv.1 → Lv.2"" Mus...",https://www.youtube.com/watch?v=lmajFEi1Hdk,2025-02-10,20:54:30
6,【MV】『SHADOWBORN』 Hiroyuki SAWANO feat. Benjami...,https://www.youtube.com/watch?v=qUFRPDHs1Q8,2025-02-10,20:51:10
9,DCD-The-2530,https://www.youtube.com/watch?v=dO1MRlcmj60,2025-02-10,20:43:41
