<a href="https://colab.research.google.com/github/danielbehargithub/LinkedIn_Salary/blob/main/Profile_Data_Without_Auth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from bs4 import BeautifulSoup
from google.colab import files
from re import sub


In [3]:
def _visible_text(container, default="N/A"):
    """Return the text of the first ``<span aria-hidden="true">`` in *container*.

    Falls back to *default* when *container* is ``None`` or holds no such
    span, so callers never dereference the result of a failed ``find``.
    """
    if container is None:
        return default
    span = container.find("span", {"aria-hidden": "true"})
    return span.get_text(strip=True) if span is not None else default


def _title_text(node):
    """Return the bold heading of *node* (a job title or a company name)."""
    container = node.find(
        "div", class_="display-flex align-items-center mr1 hoverable-link-text t-bold"
    ) or node.find("div", class_="display-flex align-items-center mr1 t-bold")
    return _visible_text(container)


def _company_and_type(node):
    """Split the ``company · job type`` line of *node* into ``(company, job_type)``."""
    text = _visible_text(node.find("span", class_="t-14 t-normal"), default=None)
    if text is None:
        return "N/A", "N/A"
    if "·" in text:
        company, job_type = (part.strip() for part in text.split("·", 1))
        return company, job_type
    # Without a separator the whole line is reported as the job type.
    return "N/A", text


def _duration_and_location(node):
    """Return ``(duration, location)`` from the light-grey detail lines of *node*."""
    spans = node.find_all("span", class_="t-14 t-normal t-black--light")
    candidates = (_visible_text(span, default=None) for span in spans)
    info = [text for text in candidates if text is not None]
    duration = info[0] if len(info) > 0 else "N/A"
    location = info[1] if len(info) > 1 else "N/A"
    return duration, location


def _description_and_skills(node):
    """Return ``(description, skills)`` from the free-text blocks of *node*."""
    blocks = node.find_all("div", class_="display-flex align-items-center t-14 t-normal t-black")
    candidates = (_visible_text(block, default=None) for block in blocks)
    texts = [text for text in candidates if text is not None]

    if len(texts) == 1 and "Skills:" in texts[0]:
        # A lone block that is the skills line: there is no description.
        description, skills = "N/A", texts[0]
    else:
        description = texts[0] if len(texts) > 0 else "N/A"
        skills = texts[1] if len(texts) > 1 else "N/A"
    return description, skills.replace("Skills:", "").strip()


def process_experience_html(html_experience):
    """
    Process the HTML file for the Experience section.

    Args:
        html_experience: Markup of the "Experience" page; it is passed
            straight to ``BeautifulSoup``.

    Returns:
        A text report that starts with ``"Experience Section:"``. A record
        without nested roles is printed as one job; a record with nested
        roles is printed as a company header followed by one indented entry
        per role. Any field that cannot be found is reported as ``"N/A"``;
        if the Experience section itself is missing only the header is
        returned.
    """
    soup_experience = BeautifulSoup(html_experience, "html.parser")
    chunks = ["Experience Section:\n\n"]

    # Locate the experience section in the HTML using its aria-label
    experience_section = soup_experience.find("main", {"aria-label": "Experience"})
    if experience_section is None:
        return chunks[0]

    # Find all job records within the experience section
    records = experience_section.find_all("li", class_="pvs-list__paged-list-item artdeco-list__item pvs-list__item--line-separated pvs-list__item--one-column")

    for record in records:
        # Nested list items mean several roles held at the same company
        sub_records = record.find_all("li", class_="pvs-list__paged-list-item pvs-list__item--one-column")

        if not sub_records:
            # Single position at one company
            job_name = _title_text(record)
            company_name, job_type = _company_and_type(record)
            job_duration, job_location = _duration_and_location(record)
            additional_content, skills = _description_and_skills(record)
            chunks.append(
                f"Job Title     : {job_name}\n"
                f"Company       : {company_name}\n"
                f"Job Type      : {job_type}\n"
                f"Job Duration  : {job_duration}\n"
                f"Location      : {job_location}\n"
                f"Description   : {additional_content}\n"
                f"Skills        : {skills}\n"
                f"{'-' * 50}\n"
            )
            continue

        # Multiple roles in the same company: company header first
        title_container = record.find("div", class_="display-flex flex-row justify-space-between")
        if title_container is None:
            company_name = duration = "N/A"
        else:
            company_name = _title_text(title_container)
            duration = _visible_text(title_container.find("span", class_="t-14 t-normal"))
        chunks.append(
            f"Company       : {company_name}\n"
            f"Duration      : {duration}\n"
            f"{'-' * 50}\n"
        )

        for sub_record in sub_records:
            # Every field is recomputed per role so nothing leaks over from
            # the previous role (or is left undefined for the first one).
            job_name = _title_text(sub_record)
            _, job_type = _company_and_type(sub_record)
            job_duration, job_location = _duration_and_location(sub_record)
            additional_content, skills = _description_and_skills(sub_record)
            chunks.append(
                f"  Job Title    : {job_name}\n"
                f"  Job Type     : {job_type}\n"
                f"  Job Duration : {job_duration}\n"
                f"  Location     : {job_location}\n"
                f"  Description  : {additional_content}\n"
                f"  Skills       : {skills}\n"
                f"  {'-' * 50}\n"
            )

    return "".join(chunks)

In [11]:
def process_education_html(html_education):
    """
    Process the HTML file for the Education section.

    Args:
        html_education: Markup of the "Education" page; it is passed
            straight to ``BeautifulSoup``.

    Returns:
        A text report that starts with ``"Education Section:"`` followed by
        one entry per education record (institution, date range,
        description, skills, additional text). Any field that cannot be
        found is reported as ``"N/A"``; if the Education section itself is
        missing only the header is returned.
    """

    def span_text(container, separator="", default="N/A"):
        # Text of the first aria-hidden span in `container`; `default` when
        # either the container or the span is missing.
        if container is None:
            return default
        span = container.find("span", {"aria-hidden": "true"})
        if span is None:
            return default
        return span.get_text(separator, strip=True)

    soup_education = BeautifulSoup(html_education, "html.parser")
    chunks = ["Education Section:\n\n"]

    # Locate the education section in the HTML
    education_section = soup_education.find("main", {"aria-label": "Education"})
    if education_section is None:
        return chunks[0]

    # Find all education records within the section
    education_records = education_section.find_all("div", {"data-view-name": "profile-component-entity"})

    for record in education_records:
        institution_name = span_text(
            record.find("div", class_="display-flex align-items-center mr1 hoverable-link-text t-bold")
        )

        # The date caption is read directly, not through an aria-hidden span
        date_container = record.find("span", class_="pvs-entity__caption-wrapper")
        date_range = date_container.get_text(strip=True) if date_container is not None else "N/A"

        # Additional description (e.g., degree or field of study)
        description_text = span_text(record.find("span", class_="t-14 t-normal"))

        # Skills are a "·"-separated list, optionally prefixed with "Skills:"
        skills_text = span_text(
            record.find("div", class_="display-flex align-items-center t-14 t-normal t-black"),
            default="",
        )
        skills = [
            skill.strip()
            for skill in skills_text.replace("Skills:", "").split("·")
            if skill.strip()  # drop empty fragments so a blank line reads N/A
        ]

        # Longer free text keeps a space between its inner text nodes
        additional_text = span_text(
            record.find("div", class_="inline-show-more-text--is-collapsed"),
            separator=" ",
            default="",
        )

        chunks.append(
            f"Institution     : {institution_name}\n"
            f"Date Range      : {date_range}\n"
            f"Description     : {description_text}\n"
            f"Skills          : {', '.join(skills) if skills else 'N/A'}\n"
            f"Additional Text : {additional_text if additional_text else 'N/A'}\n"
        )
        # Separator for readability
        chunks.append("-" * 50 + "\n")

    return "".join(chunks)


In [16]:
def process_skills_html(html_skills):
    """
    Process the HTML file for the Skills section.

    Collects the text of the aria-hidden span inside every
    ``<a data-field="skill_page_skill_topic">`` link and returns a report
    listing the distinct skills in sorted order.
    """
    parsed_page = BeautifulSoup(html_skills, "html.parser")

    distinct = set()
    for anchor in parsed_page.find_all("a", {"data-field": "skill_page_skill_topic"}):
        label = anchor.find("span", {"aria-hidden": "true"})
        if label is None:
            # Link without a visible label: nothing to record
            continue
        distinct.add(label.get_text(strip=True))

    listing = ", ".join(sorted(distinct))
    return "Skills Section:\n\n" + f"Unique skills: {listing}\n" + "-" * 50 + "\n"

In [13]:
def upload_and_process(section_name, process_function):
    """
    Prompt for a file upload, process it, and return the resulting text.

    Args:
        section_name: Label used in the prompts (e.g. "Experience").
        process_function: Callable that receives the uploaded file content
            and returns the text for that section.

    Returns:
        Whatever ``process_function`` returns for the first uploaded file.
        The caller is responsible for combining the sections.

    Raises:
        ValueError: If the upload dialog returned no file.
    """
    print(f"Please upload the {section_name} HTML file:")
    uploaded_files = files.upload()
    if not uploaded_files:
        # Fail with a readable message instead of a bare StopIteration.
        raise ValueError(f"No {section_name} file was uploaded.")

    # Only the first uploaded file is used.
    file_name, content = next(iter(uploaded_files.items()))

    print(f"Processing {section_name} file: {file_name}\n")
    return process_function(content)


In [15]:
# Upload and process each section; the prompts appear in this order.
experience_output = upload_and_process("Experience", process_experience_html)
education_output = upload_and_process("Education", process_education_html)
skills_output = upload_and_process("Skills", process_skills_html)

# Assemble the report in the order Experience, Education, Skills
combined_output = "".join([experience_output, education_output, skills_output])

# Save the combined output to a single file. The encoding is explicit because
# the report contains non-ASCII characters (e.g. the "·" separator).
final_output_file = "_output.txt"
with open(final_output_file, "w", encoding="utf-8") as file:
    file.write(combined_output)

# "\O" was an invalid escape that printed a literal backslash; a newline was intended.
print("\nOutput:")
print(combined_output)

# Provide the combined output file for download
files.download(final_output_file)


Please upload the Experience HTML file:


Saving experience.txt to experience (4).txt
Processing Experience file: experience (4).txt

Please upload the Education HTML file:


Saving education.txt to education (2).txt
Processing Education file: education (2).txt

Please upload the Skills HTML file:


Saving skills.txt to skills (2).txt
Processing Skills file: skills (2).txt

\Output:
Experience Section:

Company       : Israel Defense Forces
Duration      : 4 yrs
--------------------------------------------------
  Job Title    : Logistics Officer
  Job Type     : N/A
  Job Duration : Apr 2017 - Nov 2019 · 2 yrs 8 mos
  Location     : N/A
  Description  : N/A
  Skills       : N/A
  --------------------------------------------------
  Job Title    : Logistics Assistant
  Job Type     : N/A
  Job Duration : Dec 2015 - Apr 2017 · 1 yr 5 mos
  Location     : N/A
  Description  : N/A
  Skills       : N/A
  --------------------------------------------------
Education Section:

Institution     : Technion - Israel Institute of Technology
Date Range      : Oct 2020
Description     : N/A
Skills          : Statistical Data Analysis, Apache Spark, Java, Data Structures, PyTorch, Deep Learning, Django
Additional Text : N/A
--------------------------------------------------
Skills Section:


Ski

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Please enter your LinkedIn credentials

Then click on Runtime -> Run all

You will receive your LinkedIn data

In [None]:
# Enter the profile URL, email and password used by the scraping cells below.
# Keep the trailing "/" on user_url: section paths such as "details/education/"
# are appended to it to build the page URLs.
# NOTE(review): these are placeholders; avoid saving or sharing the notebook
# with real credentials filled in.
user_url = "https://www.linkedin.com/in/daniel-behar-168647280/"
email = "my_email@gmail.com"
password = "my_pass"


In [None]:
!pip install playwright
!playwright install


In [None]:
import nest_asyncio
from playwright.async_api import async_playwright
import asyncio

async def _fetch_section_html(page, url, section_name, max_retries):
    """Load *url* in *page* and return its HTML, or ``None`` if every attempt fails.

    A load counts as successful only when the capitalized section name
    appears in the page title.
    """
    for attempt in range(1, max_retries + 1):
        try:
            print(f"Navigating to section: {url} (Attempt {attempt})")
            await page.goto(url)
            await asyncio.sleep(3)

            # Use the page title to verify that the correct page is loaded
            page_title = await page.title()
            print(f"Section name: {section_name}")

            if section_name.capitalize() in page_title:
                return await page.content()
            print(f"Page {url} did not load correctly. Retrying...")
        except Exception as e:
            # Deliberately broad: any navigation failure is retried.
            print(f"Failed to scrape {url}: {e}")

        if attempt < max_retries:
            await asyncio.sleep(5)  # Wait a bit before retrying
    return None


async def scrape_user_profile(user_url, email, password):
    """Log in to LinkedIn and download the detail pages of one profile.

    Args:
        user_url: Base URL of the profile; a trailing "/" is optional.
        email: Login email typed into the username field.
        password: Login password.

    Returns:
        A dict mapping section name ("education", "skills", "experience")
        to the HTML of that page. A section that could not be loaded after
        all retries is left out of the dict.

    If a verification-code field is shown after login, the code is requested
    interactively with ``input()``.
    """
    # Relevant sections to scrape for the user
    profile_sections = [
        "details/education/",
        "details/skills/",
        "details/experience/"
    ]

    # Number of attempts to download each section's HTML
    MAX_RETRIES = 3

    # Normalize so the URL is well-formed with or without a trailing slash
    base_url = user_url.rstrip("/") + "/"

    scraped_data = {}  # Dictionary to store HTML content of each section

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)  # Running in headless mode
        try:
            page = await browser.new_page()

            print("Navigating to LinkedIn login page...")
            await page.goto("https://www.linkedin.com/login")
            await asyncio.sleep(3)

            # Perform login
            print("Logging into LinkedIn...")
            print("*" * 50)

            await page.fill('input[id="username"]', email)
            await page.fill('input[id="password"]', password)
            await page.click('button[type="submit"]')
            await asyncio.sleep(5)

            # Check for two-step verification
            if await page.is_visible('input[name="pin"]'):
                print("Two-step verification detected. Please enter the code sent to your email.")
                verification_code = input("Enter the verification code: ")
                await page.fill('input[name="pin"]', verification_code)
                await page.click('button[type="submit"]')
                await asyncio.sleep(3)

            for section in profile_sections:
                url = f"{base_url}{section}"
                section_name = section.strip("/").split("/")[-1]  # Extract section name

                html = await _fetch_section_html(page, url, section_name, MAX_RETRIES)
                if html is None:
                    print(f"Failed to scrape {url} after {MAX_RETRIES} attempts. Skipping...")
                    continue

                scraped_data[section_name] = html  # Store HTML content in dictionary
                print(f"HTML content for {section_name} saved in memory.")
                print("*" * 50)
        finally:
            # Close the browser explicitly on both the success and error paths
            print("Closing the browser...")
            await browser.close()

    return scraped_data  # Return all scraped HTML as a dictionary



In [None]:
scraped_html_data = await scrape_user_profile(user_url, email, password)

from bs4 import BeautifulSoup

# Example HTML from the dictionary
html_skills = scraped_html_data.get("skills")
html_education = scraped_html_data.get("education")
html_experience = scraped_html_data.get("experience")

# Parse HTML
soup_skill = BeautifulSoup(html_skills, "html.parser")
soup_education = BeautifulSoup(html_education, "html.parser")
soup_experience = BeautifulSoup(html_experience, "html.parser")


Navigating to LinkedIn login page...
Logging into LinkedIn...
**************************************************
Navigating to section: https://www.linkedin.com/in/daniel-behar-168647280/details/education/ (Attempt 1)
Section name: education
HTML content for education saved in memory.
**************************************************
Navigating to section: https://www.linkedin.com/in/daniel-behar-168647280/details/skills/ (Attempt 1)
Section name: skills
HTML content for skills saved in memory.
**************************************************
Navigating to section: https://www.linkedin.com/in/daniel-behar-168647280/details/experience/ (Attempt 1)
Section name: experience
HTML content for experience saved in memory.
**************************************************
Closing the browser...


In [None]:
from re import sub

# Initialize the output variable to collect results
output = "Experience Section:\n\n"


def _span_text(container, default="N/A"):
    """Return the text of the first aria-hidden span in *container*, else *default*."""
    if container is None:
        return default
    span = container.find("span", {"aria-hidden": "true"})
    return span.get_text(strip=True) if span is not None else default


def _entry_heading(node):
    """Return the bold heading of *node* (a job title or a company name)."""
    container = node.find(
        "div", class_="display-flex align-items-center mr1 hoverable-link-text t-bold"
    ) or node.find("div", class_="display-flex align-items-center mr1 t-bold")
    return _span_text(container)


def _entry_fields(node):
    """Return (company, job_type, duration, location, description, skills) for one position."""
    # "company · job type" line; without a separator it is all job type
    place_and_type = _span_text(node.find("span", class_="t-14 t-normal"), default=None)
    if place_and_type is None:
        company, job_type = "N/A", "N/A"
    elif "·" in place_and_type:
        company, job_type = [part.strip() for part in place_and_type.split("·", 1)]
    else:
        company, job_type = "N/A", place_and_type

    # Duration and location come from the light-grey detail lines, in that order
    info_spans = node.find_all("span", class_="t-14 t-normal t-black--light")
    info = [text for text in (_span_text(span, default=None) for span in info_spans) if text is not None]
    duration = info[0] if len(info) > 0 else "N/A"
    location = info[1] if len(info) > 1 else "N/A"

    # Description and skills come from the free-text blocks
    blocks = node.find_all("div", class_="display-flex align-items-center t-14 t-normal t-black")
    texts = [text for text in (_span_text(block, default=None) for block in blocks) if text is not None]
    if len(texts) == 1 and "Skills:" in texts[0]:
        # A lone block that is the skills line: there is no description
        description, skills = "N/A", texts[0]
    else:
        description = texts[0] if len(texts) > 0 else "N/A"
        skills = texts[1] if len(texts) > 1 else "N/A"
    skills = skills.replace("Skills:", "").strip()

    return company, job_type, duration, location, description, skills


# Locate the experience section in the HTML using its aria-label
experience_section = soup_experience.find("main", {"aria-label": "Experience"})

# Find all job records within the experience section (none if it is missing)
records = (
    experience_section.find_all("li", class_="pvs-list__paged-list-item artdeco-list__item pvs-list__item--line-separated pvs-list__item--one-column")
    if experience_section is not None
    else []
)

# Process each job record
for record in records:
    # Check if the record contains sub-records (multiple roles in one company)
    sub_records = record.find_all("li", class_="pvs-list__paged-list-item pvs-list__item--one-column")

    # Handle a single position in one company
    if not sub_records:
        job_name = _entry_heading(record)
        company_name, job_type, job_duration, job_location, additional_content, skills = _entry_fields(record)

        # Append results to the output in a clean format
        output += (
            f"Job Title     : {job_name}\n"
            f"Company       : {company_name}\n"
            f"Job Type      : {job_type}\n"
            f"Job Duration  : {job_duration}\n"
            f"Location      : {job_location}\n"
            f"Description   : {additional_content}\n"
            f"Skills        : {skills}\n"
            f"{'-' * 50}\n"
        )
        continue

    # Handle multiple roles in the same company
    title_container = record.find("div", class_="display-flex flex-row justify-space-between")
    if title_container is None:
        company_name = duration = "N/A"
    else:
        company_name = _entry_heading(title_container)
        duration = _span_text(title_container.find("span", class_="t-14 t-normal"))

    # Append company details to the output
    output += (
        f"Company       : {company_name}\n"
        f"Duration      : {duration}\n"
        f"{'-' * 50}\n"
    )

    # Process each sub-record (job role); every field is recomputed per role
    # so nothing carries over from the previous role or is left undefined
    for sub_record in sub_records:
        job_name = _entry_heading(sub_record)
        _, job_type, job_duration, job_location, additional_content, skills = _entry_fields(sub_record)

        # Append job role details to the output in a clean format
        output += (
            f"  Job Title    : {job_name}\n"
            f"  Job Type     : {job_type}\n"
            f"  Job Duration : {job_duration}\n"
            f"  Location     : {job_location}\n"
            f"  Description  : {additional_content}\n"
            f"  Skills       : {skills}\n"
            f"  {'-' * 50}\n"
        )


In [None]:
# Append the education section header to the output
output += "\nEducation Section:\n\n"

# Locate the education section in the HTML
education_section = soup_education.find("main", {"aria-label": "Education"})

# Find all education records within the section (none if it is missing)
education_records = (
    education_section.find_all("div", {"data-view-name": "profile-component-entity"})
    if education_section is not None
    else []
)

# Process each education record
for record in education_records:
    # Extract institution name; both the container and its inner span may be absent
    institution_container = record.find("div", class_="display-flex align-items-center mr1 hoverable-link-text t-bold")
    institution_span = (
        institution_container.find("span", {"aria-hidden": "true"})
        if institution_container is not None
        else None
    )
    institution_name = institution_span.get_text(strip=True) if institution_span is not None else "N/A"
    output += f"Institution     : {institution_name}\n"

    # Extract date range
    date_container = record.find("span", class_="pvs-entity__caption-wrapper")
    date_range = date_container.get_text(strip=True) if date_container is not None else "N/A"
    output += f"Date Range      : {date_range}\n"

    # Extract additional description (e.g., degree or field of study)
    description_container = record.find("span", class_="t-14 t-normal")
    description_span = (
        description_container.find("span", {"aria-hidden": "true"})
        if description_container is not None
        else None
    )
    description_text = description_span.get_text(strip=True) if description_span is not None else "N/A"
    output += f"Description     : {description_text}\n"

    # Extract skills (if available): a "·"-separated list, optionally prefixed with "Skills:"
    skills_container = record.find("div", class_="display-flex align-items-center t-14 t-normal t-black")
    skills = []
    if skills_container is not None:
        skills_span = skills_container.find("span", {"aria-hidden": "true"})
        if skills_span is not None:
            skills_text = skills_span.get_text(strip=True).replace("Skills:", "")
            # Drop empty fragments so a blank skills line reads N/A
            skills = [skill.strip() for skill in skills_text.split("·") if skill.strip()]
    output += f"Skills          : {', '.join(skills) if skills else 'N/A'}\n"

    # Extract additional text (e.g., detailed explanations)
    additional_text_container = record.find("div", class_="inline-show-more-text--is-collapsed")
    additional_text = ""
    if additional_text_container is not None:
        additional_span = additional_text_container.find("span", {"aria-hidden": "true"})
        if additional_span is not None:
            additional_text = additional_span.get_text(" ", strip=True)
    output += f"Additional Text : {additional_text if additional_text else 'N/A'}\n"

    # Append a separator for readability
    output += "-" * 50 + "\n"



In [None]:
# Collect the aria-hidden label span of every skill link
# (<a data-field="skill_page_skill_topic">) on the skills page.
skill_links = soup_skill.find_all("a", {"data-field": "skill_page_skill_topic"})
label_spans = (link.find("span", {"aria-hidden": "true"}) for link in skill_links)

# Keep the text of the links that actually have a label
skills = [span.get_text(strip=True) for span in label_spans if span is not None]

# De-duplicate and sort for a stable listing
unique_skills = sorted(set(skills))

output += (
    "\nSkills Section:\n"
    + f"Unique skills: {', '.join(unique_skills)}\n"
    + "-" * 50 + "\n"
)



In [None]:
# Save the output to a text file
file_name = "profile_data.txt"

# Write the output content to the file. The encoding is explicit because the
# report contains non-ASCII characters (e.g. the "·" separator), which a
# platform-default encoding may not be able to represent.
with open(file_name, "w", encoding="utf-8") as file:
    file.write(output)

# Provide a link to download the file
from google.colab import files
files.download(file_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>