<a href="https://colab.research.google.com/github/danielbehargithub/LinkedIn_Salary/blob/main/Profile_Data_Without_Auth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from bs4 import BeautifulSoup
from google.colab import files
from re import sub


In [3]:
# Attribute filter for the span that carries the visible (non screen-reader) copy of a text.
_ARIA_HIDDEN = {"aria-hidden": "true"}

# Class strings of the bold title <div>; the first variant is tried before the second.
_TITLE_DIV_CLASSES = (
    "display-flex align-items-center mr1 hoverable-link-text t-bold",
    "display-flex align-items-center mr1 t-bold",
)

# Horizontal rule printed after every record.
_RULE = "-" * 50


def _find_title_div(scope):
    """Return the bold title <div> inside *scope*, or None if *scope* is None or has none."""
    if scope is None:
        return None
    for css_class in _TITLE_DIV_CLASSES:
        match = scope.find("div", class_=css_class)
        if match is not None:
            return match
    return None


def _visible_text(container, default="N/A"):
    """Return the stripped text of the first aria-hidden span in *container*, else *default*."""
    if container is None:
        return default
    span = container.find("span", _ARIA_HIDDEN)
    return span.get_text(strip=True) if span is not None else default


def _visible_texts(containers):
    """Return the aria-hidden span text of every container that has such a span."""
    spans = (container.find("span", _ARIA_HIDDEN) for container in containers)
    return [span.get_text(strip=True) for span in spans if span is not None]


def _split_description_and_skills(entry):
    """Return (description, skills) taken from the detail rows of *entry*."""
    rows = _visible_texts(
        entry.find_all("div", class_="display-flex align-items-center t-14 t-normal t-black")
    )
    description, skills = "N/A", "N/A"
    if len(rows) == 1 and "Skills:" in rows[0]:
        # A lone row that is the skills row must not be reported as the description.
        skills = rows[0]
    else:
        if rows:
            description = rows[0]
        if len(rows) > 1:
            skills = rows[1]
    return description, skills.replace("Skills:", "").strip()


def _parse_role(entry):
    """Extract the fields shared by a standalone position and a role nested under a company."""
    place_and_type = _visible_text(entry.find("span", class_="t-14 t-normal"))
    if "·" in place_and_type:
        company, job_type = [part.strip() for part in place_and_type.split("·", 1)]
    else:
        company, job_type = "N/A", place_and_type

    info = _visible_texts(entry.find_all("span", class_="t-14 t-normal t-black--light"))
    description, skills = _split_description_and_skills(entry)

    return {
        "title": _visible_text(_find_title_div(entry)),
        "company": company,
        "job_type": job_type,
        "duration": info[0] if info else "N/A",
        "location": info[1] if len(info) > 1 else "N/A",
        "description": description,
        "skills": skills,
    }


def process_experience_html(html_experience):
    """
    Process the HTML file for the Experience section.

    Parameters:
        html_experience: HTML markup (str or bytes) passed to BeautifulSoup.

    Returns:
        A text report that starts with "Experience Section:" followed by one
        entry per job record. A record with nested role items is printed as a
        company header followed by one indented entry per role. Any field that
        cannot be located is reported as "N/A". If the markup has no
        <main aria-label="Experience"> element, only the header is returned.
    """
    soup_experience = BeautifulSoup(html_experience, "html.parser")
    parts = ["Experience Section:\n\n"]

    # Locate the experience section in the HTML using its aria-label
    experience_section = soup_experience.find("main", {"aria-label": "Experience"})
    if experience_section is None:
        return "".join(parts)

    # Find all job records within the experience section
    records = experience_section.find_all(
        "li",
        class_="pvs-list__paged-list-item artdeco-list__item pvs-list__item--line-separated pvs-list__item--one-column",
    )

    for record in records:
        # Nested list items mean several roles were held at the same company
        sub_records = record.find_all("li", class_="pvs-list__paged-list-item pvs-list__item--one-column")

        if not sub_records:
            # Single position in one company
            role = _parse_role(record)
            parts.append(
                f"Job Title     : {role['title']}\n"
                f"Company       : {role['company']}\n"
                f"Job Type      : {role['job_type']}\n"
                f"Job Duration  : {role['duration']}\n"
                f"Location      : {role['location']}\n"
                f"Description   : {role['description']}\n"
                f"Skills        : {role['skills']}\n"
                f"{_RULE}\n"
            )
            continue

        # Multiple roles in the same company: the record header holds the
        # company name and the overall duration.
        title_container = record.find("div", class_="display-flex flex-row justify-space-between")
        company_name = _visible_text(_find_title_div(title_container))
        duration_container = (
            title_container.find("span", class_="t-14 t-normal")
            if title_container is not None
            else None
        )
        duration = _visible_text(duration_container)

        parts.append(
            f"Company       : {company_name}\n"
            f"Duration      : {duration}\n"
            f"{_RULE}\n"
        )

        # Every role is parsed from scratch so that no value leaks from the
        # previous role into the next one.
        for sub_record in sub_records:
            role = _parse_role(sub_record)
            parts.append(
                f"  Job Title    : {role['title']}\n"
                f"  Job Type     : {role['job_type']}\n"
                f"  Job Duration : {role['duration']}\n"
                f"  Location     : {role['location']}\n"
                f"  Description  : {role['description']}\n"
                f"  Skills       : {role['skills']}\n"
                f"  {_RULE}\n"
            )

    return "".join(parts)

In [11]:
def process_education_html(html_education):
    """
    Process the HTML file for the Education section.

    Parameters:
        html_education: HTML markup (str or bytes) passed to BeautifulSoup.

    Returns:
        A text report that starts with "Education Section:" followed by one
        entry per education record (institution, date range, description,
        skills, additional text). Any field that cannot be located is reported
        as "N/A". If the markup has no <main aria-label="Education"> element,
        only the header is returned.
    """
    soup_education = BeautifulSoup(html_education, "html.parser")
    parts = ["Education Section:\n\n"]

    # Locate the education section in the HTML
    education_section = soup_education.find("main", {"aria-label": "Education"})
    if education_section is None:
        return "".join(parts)

    aria_hidden = {"aria-hidden": "true"}

    def visible_text(container, separator=""):
        """Return the aria-hidden span text of *container*, or "" if it is unavailable."""
        if container is None:
            return ""
        span = container.find("span", aria_hidden)
        return span.get_text(separator, strip=True) if span is not None else ""

    # Find all education records within the section
    education_records = education_section.find_all("div", {"data-view-name": "profile-component-entity"})

    for record in education_records:
        # Institution name: try the hoverable title first, then the plain bold
        # title, mirroring the two variants the experience parser looks for.
        institution_container = (
            record.find("div", class_="display-flex align-items-center mr1 hoverable-link-text t-bold")
            or record.find("div", class_="display-flex align-items-center mr1 t-bold")
        )
        institution_name = visible_text(institution_container) or "N/A"

        # Date range
        date_container = record.find("span", class_="pvs-entity__caption-wrapper")
        date_range = date_container.get_text(strip=True) if date_container else "N/A"

        # Additional description (e.g., degree or field of study)
        description_text = visible_text(record.find("span", class_="t-14 t-normal")) or "N/A"

        # Skills (if available): strip the "Skills:" label and split on the
        # "·" separator, dropping empty fragments.
        skills_text = visible_text(
            record.find("div", class_="display-flex align-items-center t-14 t-normal t-black")
        )
        skills_text = skills_text.replace("Skills:", "").strip()
        skills = [skill.strip() for skill in skills_text.split("·") if skill.strip()]

        # Additional text (e.g., detailed explanations); nested text nodes are
        # joined with a space.
        additional_text = visible_text(
            record.find("div", class_="inline-show-more-text--is-collapsed"), " "
        )

        parts.append(
            f"Institution     : {institution_name}\n"
            f"Date Range      : {date_range}\n"
            f"Description     : {description_text}\n"
            f"Skills          : {', '.join(skills) if skills else 'N/A'}\n"
            f"Additional Text : {additional_text if additional_text else 'N/A'}\n"
        )
        # Separator for readability
        parts.append("-" * 50 + "\n")

    return "".join(parts)


In [16]:
def process_skills_html(html_skills):
    """
    Process the HTML file for the Skills section.

    Collects the aria-hidden span text of every <a> tag whose data-field is
    "skill_page_skill_topic", removes duplicates, sorts the names and returns
    them as a single comma-separated line under a "Skills Section:" header.
    """
    document = BeautifulSoup(html_skills, "html.parser")

    # Each skill is rendered as an anchor tagged with this data-field
    anchors = document.find_all("a", {"data-field": "skill_page_skill_topic"})

    # The visible skill name sits in the aria-hidden span; anchors without one are skipped
    name_spans = (anchor.find("span", {"aria-hidden": "true"}) for anchor in anchors)
    distinct_names = {span.get_text(strip=True) for span in name_spans if span}

    joined_names = ", ".join(sorted(distinct_names))
    lines = [
        "Skills Section:\n\n",
        f"Unique skills: {joined_names}\n",
        "-" * 50 + "\n",
    ]
    return "".join(lines)

In [13]:
def upload_and_process(section_name, process_function):
    """
    Prompt for an HTML file upload, process it, and return the resulting text.

    Parameters:
        section_name: Label used in the prompt and progress messages.
        process_function: Callable that receives the uploaded file content and
            returns the formatted text for that section.

    Returns:
        Whatever process_function returns for the first uploaded file. Any
        further files in the same upload are ignored.

    Raises:
        ValueError: If the upload finished without any file.
    """
    print(f"Please upload the {section_name} HTML file:")
    uploaded_files = files.upload()

    # Fail with a clear message instead of a bare StopIteration from next()
    if not uploaded_files:
        raise ValueError(f"No file was uploaded for the {section_name} section.")

    file_name, content = next(iter(uploaded_files.items()))

    print(f"Processing {section_name} file: {file_name}\n")
    return process_function(content)


In [None]:
# Notebook shell command: install the Playwright Python package.
!pip install playwright
# Notebook shell command: run Playwright's own installer (presumably fetches the browser builds it launches — TODO confirm).
!playwright install

In [23]:
from playwright.async_api import async_playwright
import asyncio

async def scrape_job_posting(job_url, output_file="job_details.txt"):
    """
    Scrape job posting details from a LinkedIn job URL and save to a file.

    Parameters:
        job_url: URL of the job posting to open.
        output_file: Path of the text file to write the details to.

    Returns:
        The output file path when the details were extracted and written, or
        None when extraction or writing failed (the error is printed).
        Failures while opening the page itself are not caught and propagate.
    """
    async with async_playwright() as p:
        # Launch browser in headless mode
        browser = await p.chromium.launch(headless=True)

        # Everything after the launch runs under this try so the browser is
        # closed even when context creation or navigation fails.
        try:
            context = await browser.new_context(user_agent="Mozilla/5.0")
            page = await context.new_page()

            # Navigate to the provided job URL
            print(f"Navigating to {job_url}...")
            await page.goto(job_url)
            await asyncio.sleep(5)  # Wait for the page to fully load

            # Debugging aid: dump `await page.content()` to a file or call
            # `await page.screenshot(path=..., full_page=True)` here.

            # Extract job details using selectors (adjust selectors as needed)
            try:
                job_title = await page.inner_text('h1.top-card-layout__title')
                company = await page.inner_text('a.topcard__org-name-link')
                location = await page.inner_text('span.topcard__flavor--bullet')
                description = await page.inner_text('div.show-more-less-html__markup')

                # Prepare job details for saving
                job_details = (
                    f"Job Title    : {job_title}\n"
                    f"Company      : {company}\n"
                    f"Location     : {location}\n"
                    f"Description  : {description}\n"
                )

                # Save job details to a text file
                with open(output_file, "w", encoding="utf-8") as f:
                    f.write(job_details)

                print(f"Job details saved to {output_file}")
                return output_file

            except Exception as e:
                # Best-effort scrape: report the problem and signal failure with None
                print(f"An error occurred while scraping: {e}")
                return None

        finally:
            # Close the browser
            await browser.close()



In [None]:
# Upload and parse each profile section; every call returns that section's text
experience_output = upload_and_process("Experience", process_experience_html)
education_output = upload_and_process("Education", process_education_html)
skills_output = upload_and_process("Skills", process_skills_html)

# Combine the sections in a fixed order: experience, education, skills
combined_output = "".join([experience_output, education_output, skills_output])

# Save the combined output to a single file. The encoding is explicit because
# the parsed text can contain non-ASCII characters such as "·".
final_output_file = "_profile_data.txt"
with open(final_output_file, "w", encoding="utf-8") as file:
    file.write(combined_output)

# Provide the combined output file for download
files.download(final_output_file)



In [None]:
#### moving to job data

# Request the URL from the user
job_url = input("Please enter the LinkedIn job URL: ").strip()

# Run the scraping function
output_file = await scrape_job_posting(job_url)

# Allow user to download the file
if output_file:
    from google.colab import files
    files.download(output_file)
