In [10]:
from bs4 import BeautifulSoup

def extract_first_aria_hidden_text(el):
    """Return the stripped display text for *el*.

    The text of the first ``<span aria-hidden="true">`` found under *el* is
    preferred. When no such span is found, the stripped text of *el* itself
    is returned instead. A falsy *el* (for example ``None`` from a failed
    lookup) yields ``""``.
    """
    if el:
        visible = el.find("span", attrs={"aria-hidden": "true"})
        source = visible if visible else el
        return source.get_text(strip=True)
    return ""

def get_text_or_none(el):
    """Return the stripped text of *el*, or ``""`` when *el* is falsy.

    Note that, despite the name, a missing element produces an empty
    string rather than ``None``.
    """
    if not el:
        return ""
    return el.get_text(strip=True)

def get_role_details(role_li, company_name, logo_url, location_hint=None):
    """Build the record dict for a single role from its list-item element.

    Args:
        role_li: Element containing one role's markup.
        company_name: Stored as-is under ``"company"``.
        logo_url: Stored as-is under ``"logo_url"``.
        location_hint: Used for ``"location"`` when the role has fewer than
            two light-grey spans of its own.

    Returns:
        A dict with keys ``company``, ``logo_url``, ``title``, ``dates``,
        ``location``, ``description`` and ``skills``; fields that cannot be
        found are empty strings.
    """
    # Light-grey caption spans: the first is read as the date line, the
    # second (when present) as the location line.
    light_spans = role_li.find_all("span", class_="t-14 t-normal t-black--light")
    heading = role_li.find(
        "div",
        class_="display-flex align-items-center mr1 hoverable-link-text t-bold",
    )
    body = role_li.find(
        "div", class_="display-flex align-items-center t-14 t-normal t-black"
    )
    skills_label = role_li.find("span", string=lambda t: t and "Skills:" in t)

    if len(light_spans) > 1:
        location = extract_first_aria_hidden_text(light_spans[1])
    else:
        location = location_hint or ""

    if skills_label:
        # The label span only matches the "Skills:" text; the full list is
        # taken from its parent with the label removed.
        raw_skills = skills_label.parent.get_text(strip=True)
        skills = raw_skills.replace("Skills:", "").strip()
    else:
        skills = ""

    return {
        "company": company_name,
        "logo_url": logo_url,
        "title": extract_first_aria_hidden_text(heading),
        "dates": extract_first_aria_hidden_text(
            light_spans[0] if light_spans else None
        ),
        "location": location,
        "description": extract_first_aria_hidden_text(body),
        "skills": skills,
    }

# Parse the saved profile page (expected in the working directory).
with open("linkedin_experience.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

exp_section = soup.find("section", class_="artdeco-card pb3")
# find() returns None when nothing matches; guard so a page without the
# expected section does not fail with an AttributeError on None.
exp_ul = exp_section.find("ul") if exp_section else None
all_experiences = []

if exp_ul is None:
    print("No experience list found in linkedin_experience.html")
    top_level_items = []
else:
    top_level_items = exp_ul.find_all(
        "li", class_="pvs-list__paged-list-item", recursive=False
    )

for exp_li in top_level_items:
    # Company name and logo url come from the logo <img> inside the company link.
    company_link_tag = exp_li.find("a", class_="optional-action-target-wrapper pvs-entity__image-container--outline-offset display-flex")
    company_img = company_link_tag.find("img") if company_link_tag else None
    # NOTE(review): the alt text seen in the captured output ends with
    # " logo" (e.g. "Voya Financial logo"), so "company" carries that suffix.
    company_name = company_img.get("alt") if company_img else None
    logo_url = company_img.get("src") if company_img else None

    # Company location (for stacked roles fallback): first light-grey span
    # anywhere inside this item.
    location_hint = None
    company_location_span = exp_li.find("span", class_="t-14 t-normal t-black--light")
    if company_location_span:
        location_hint = extract_first_aria_hidden_text(company_location_span)

    # Case 1: Stacked/grouped experience block (e.g. multiple roles at a company)
    nested_container = exp_li.find("div", class_="pvs-list__container")
    if nested_container:
        inner_ul = nested_container.find("ul")
        if inner_ul:
            for role_li in inner_ul.find_all("li", class_="pvs-list__paged-list-item", recursive=False):
                all_experiences.append(
                    get_role_details(role_li, company_name, logo_url, location_hint)
                )
            continue  # done with this company

    # Case 2: Single role (also reached when the container has no inner list)
    all_experiences.append(
        get_role_details(exp_li, company_name, logo_url)
    )

# Print results, numbering experiences from 1.
for number, exp in enumerate(all_experiences, start=1):
    print(f"====== EXPERIENCE #{number} ======")
    print("Company:", exp['company'])
    print("Logo URL:", exp['logo_url'])
    print("Title:", exp['title'])
    print("Dates:", exp['dates'])
    print("Location:", exp['location'])
    print("Description:", exp['description'])
    print("Skills:", exp['skills'])
    print()


Company: Voya Financial logo
Logo URL: https://media.licdn.com/dms/image/v2/C4D0BAQEopHI6Hjie3g/company-logo_100_100/company-logo_100_100/0/1631364007955?e=1754524800&v=beta&t=0MRt4o14F8Gilqmakj6IKXHfRWMTsI1WL2_MLmmbClU
Title: Data Science Intern
Dates: May 2024 - Aug 2024 · 4 mos
Location: Los Angeles, California, United States
Description: • Developed a Data Quality Checker app using Python, SQL, and Snowflake to ensure 24/7 business validations on financial data, identifying outliers for review.• Conducted in-depth data analysis on portfolio data using Power BI and Python, investigating anomalies.• Completed three certification courses on data modeling for Snowflake data analysis and Datavault 2.0.
Skills: 

Company: Herman Ostrow School of Dentistry of USC logo
Logo URL: https://media.licdn.com/dms/image/v2/C4E0BAQGXUbwBxTEaGg/company-logo_100_100/company-logo_100_100/0/1631347499239?e=1754524800&v=beta&t=-RNSz0s7AyQpIbQPfXEJkXq1GYhk1boffGCcej-pp10
Title: Research Assistant
Dates: 

In [11]:
from bs4 import BeautifulSoup
import re

def extract_first_aria_hidden_text(el):
    """Return the stripped display text for *el*.

    Uses the first ``<span aria-hidden="true">`` found under *el* when there
    is one; otherwise uses *el* itself. Returns ``""`` for a falsy *el*
    (for example ``None`` from a failed lookup).
    """
    if not el:
        return ""
    target = el.find("span", attrs={"aria-hidden": "true"}) or el
    return target.get_text(strip=True)

# "<start> - <end>" at the beginning of the string. Each side is an optional
# month name followed by a four-digit year; the end may also be "Present".
# Hyphen, en dash and em dash are all accepted as the separator.
_DATE_RANGE_RE = re.compile(
    r"((?:[A-Za-z]{3,}\s+)?\d{4})\s*[-–—]\s*((?:[A-Za-z]{3,}\s+)?\d{4}|Present)"
)


def parse_date_range(datestr):
    """Split a date line into its start and end parts.

    Handles lines such as ``"Jan 2020 - May 2021 · 1 yr 5 mos"`` and
    year-only ranges such as ``"2019 - 2021 · 2 yrs"``; anything after the
    end date is ignored.

    Args:
        datestr: Raw date text; the range must start at the first character.

    Returns:
        ``(from_date, to_date)`` as strings, e.g. ``("Jan 2020", "May 2021")``
        or ``("May 2024", "Present")``. ``("", "")`` when no range is found.
    """
    m = _DATE_RANGE_RE.match(datestr)
    if m is None:
        return "", ""  # fallback: not a recognisable date range
    return m.group(1), m.group(2)

def get_text_or_none(el):
    """Return the stripped text of *el*, or ``""`` when *el* is falsy.

    Note that, despite the name, a missing element produces an empty
    string rather than ``None``.
    """
    if not el:
        return ""
    return el.get_text(strip=True)

def get_role_details(role_li, company_name, company_url, logo_url, location_hint=None):
    """Build the record dict for a single role from its list-item element.

    Args:
        role_li: Element containing one role's markup.
        company_name: Stored as-is under ``"company"``.
        company_url: Stored as-is under ``"company_url"``.
        logo_url: Stored as-is under ``"logo_url"``.
        location_hint: Used for ``"location"`` when the role has fewer than
            two light-grey spans of its own.

    Returns:
        A dict with keys ``company``, ``company_url``, ``logo_url``,
        ``title``, ``from_date``, ``to_date``, ``dates``, ``location``,
        ``description`` and ``skills``; fields that cannot be found are
        empty strings.
    """
    record = {
        "company": company_name,
        "company_url": company_url,
        "logo_url": logo_url,
    }

    heading = role_li.find(
        "div",
        class_="display-flex align-items-center mr1 hoverable-link-text t-bold",
    )
    record["title"] = extract_first_aria_hidden_text(heading)

    # Light-grey caption spans: the first is read as the date line, the
    # second (when present) as the location line.
    light_spans = role_li.find_all("span", class_="t-14 t-normal t-black--light")
    raw_dates = extract_first_aria_hidden_text(
        light_spans[0] if light_spans else None
    )
    record["from_date"], record["to_date"] = parse_date_range(raw_dates)
    record["dates"] = raw_dates

    if len(light_spans) > 1:
        record["location"] = extract_first_aria_hidden_text(light_spans[1])
    else:
        record["location"] = location_hint or ""

    body = role_li.find(
        "div", class_="display-flex align-items-center t-14 t-normal t-black"
    )
    record["description"] = extract_first_aria_hidden_text(body)

    skills_label = role_li.find("span", string=lambda t: t and "Skills:" in t)
    if skills_label:
        # The label span only matches the "Skills:" text; the full list is
        # taken from its parent with the label removed.
        raw_skills = skills_label.parent.get_text(strip=True)
        record["skills"] = raw_skills.replace("Skills:", "").strip()
    else:
        record["skills"] = ""

    return record

# Parse the saved profile page (expected in the working directory).
with open("linkedin_experience.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

exp_section = soup.find("section", class_="artdeco-card pb3")
# find() returns None when nothing matches; guard so a page without the
# expected section does not fail with an AttributeError on None.
exp_ul = exp_section.find("ul") if exp_section else None
all_experiences = []

if exp_ul is None:
    print("No experience list found in linkedin_experience.html")
    top_level_items = []
else:
    top_level_items = exp_ul.find_all(
        "li", class_="pvs-list__paged-list-item", recursive=False
    )

for exp_li in top_level_items:
    # Company name, logo url and company url all come from the company link
    # and the logo <img> inside it.
    company_link_tag = exp_li.find("a", class_="optional-action-target-wrapper pvs-entity__image-container--outline-offset display-flex")
    company_img = company_link_tag.find("img") if company_link_tag else None
    # NOTE(review): the alt text seen in the captured output ends with
    # " logo" (e.g. "Voya Financial logo"), so "company" carries that suffix.
    company_name = company_img.get("alt") if company_img else None
    logo_url = company_img.get("src") if company_img else None
    company_url = company_link_tag.get("href", None) if company_link_tag else None

    # Company location (for stacked roles fallback): first light-grey span
    # anywhere inside this item.
    location_hint = None
    company_location_span = exp_li.find("span", class_="t-14 t-normal t-black--light")
    if company_location_span:
        location_hint = extract_first_aria_hidden_text(company_location_span)

    # Case 1: Stacked/grouped experience block (e.g. multiple roles at a company)
    nested_container = exp_li.find("div", class_="pvs-list__container")
    if nested_container:
        inner_ul = nested_container.find("ul")
        if inner_ul:
            for role_li in inner_ul.find_all("li", class_="pvs-list__paged-list-item", recursive=False):
                all_experiences.append(
                    get_role_details(role_li, company_name, company_url, logo_url, location_hint)
                )
            continue  # done with this company

    # Case 2: Single role (also reached when the container has no inner list)
    all_experiences.append(
        get_role_details(exp_li, company_name, company_url, logo_url)
    )

# Print results, numbering experiences from 1.
for number, exp in enumerate(all_experiences, start=1):
    print(f"====== EXPERIENCE #{number} ======")
    print("Company:", exp['company'])
    print("Company URL:", exp['company_url'])
    print("Logo URL:", exp['logo_url'])
    print("Title:", exp['title'])
    print("From Date:", exp['from_date'])
    print("To Date:", exp['to_date'])
    print("Dates (raw):", exp['dates'])
    print("Location:", exp['location'])
    print("Description:", exp['description'])
    print("Skills:", exp['skills'])
    print()


Company: Voya Financial logo
Company URL: https://www.linkedin.com/company/3077431/
Logo URL: https://media.licdn.com/dms/image/v2/C4D0BAQEopHI6Hjie3g/company-logo_100_100/company-logo_100_100/0/1631364007955?e=1754524800&v=beta&t=0MRt4o14F8Gilqmakj6IKXHfRWMTsI1WL2_MLmmbClU
Title: Data Science Intern
From Date: May 2024
To Date: Aug 2024
Dates (raw): May 2024 - Aug 2024 · 4 mos
Location: Los Angeles, California, United States
Description: • Developed a Data Quality Checker app using Python, SQL, and Snowflake to ensure 24/7 business validations on financial data, identifying outliers for review.• Conducted in-depth data analysis on portfolio data using Power BI and Python, investigating anomalies.• Completed three certification courses on data modeling for Snowflake data analysis and Datavault 2.0.
Skills: 

Company: Herman Ostrow School of Dentistry of USC logo
Company URL: https://www.linkedin.com/company/15173434/
Logo URL: https://media.licdn.com/dms/image/v2/C4E0BAQGXUbwBxTEaGg/co