<a href="https://colab.research.google.com/github/danielbehargithub/LinkedIn_Salary/blob/main/Profile_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Click the badge above to open this notebook in Colab, then click Runtime -> Run all.

When prompted, enter your LinkedIn profile URL, email, password, and the job URL.

The notebook will then download two text files: your LinkedIn profile data and the job posting details.

For the full guide, see the README file.

Example:

user url: https://www.linkedin.com/in/profile_id/

email: my_email@gmail.com

password: my_password

job url: https://www.linkedin.com/jobs/view/job_number/

In [None]:
from getpass import getpass


def _prompt(message):
    """Show `message`, read one line from the user and trim surrounding whitespace."""
    answer = input(message)
    return answer.strip()


# Collect the profile URL and the login details first, then the job posting URL.
user_url = _prompt("Please enter full LinkedIn profile url: ")
email = _prompt("Please enter LinkedIn email: ")
# The password is read through getpass and, unlike the other fields, is not stripped.
password = getpass("Please enter your LinkedIn password: ")
print("Password received securely.")
job_url = _prompt("Please enter the LinkedIn job URL: ")


In [None]:
!pip install playwright
!playwright install


In [3]:
import nest_asyncio
from playwright.async_api import async_playwright
import asyncio

async def scrape_user_profile(user_url, email, password):
    """Log in to LinkedIn and collect the raw HTML of a profile's detail pages.

    Args:
        user_url: Full profile URL, e.g. "https://www.linkedin.com/in/profile_id/".
            The trailing slash is optional.
        email: LinkedIn login e-mail.
        password: LinkedIn password.

    Returns:
        dict mapping a section name ("education", "skills", "experience") to
        the HTML of that detail page. A section that could not be loaded after
        MAX_RETRIES attempts is left out of the dict.
    """
    # Relevant sections to scrape for the user
    profile_sections = [
        "details/education/",
        "details/skills/",
        "details/experience/"
    ]

    MAX_RETRIES = 3

    # Make sure exactly one "/" separates the profile URL from the section
    # path; without this ".../in/profile_id" + "details/..." is a broken URL.
    base_url = user_url.strip().rstrip("/") + "/"

    scraped_data = {}  # Section name -> HTML content of that section page

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)  # Running in headless mode
        try:
            page = await browser.new_page()

            print("Navigating to LinkedIn login page...")
            await page.goto("https://www.linkedin.com/login")
            await asyncio.sleep(3)

            # Perform login
            print("Logging into LinkedIn...")
            print("*" * 50)

            await page.fill('input[id="username"]', email)
            await page.fill('input[id="password"]', password)
            await page.click('button[type="submit"]')
            await asyncio.sleep(5)

            # Check for two-step verification
            if await page.is_visible('input[name="pin"]'):
                print("Two-step verification detected. Please enter the code sent to your email.")
                # input() is synchronous: the coroutine waits here until the user answers.
                verification_code = input("Enter the verification code: ")
                await page.fill('input[name="pin"]', verification_code)
                await page.click('button[type="submit"]')
                await asyncio.sleep(3)

            for section in profile_sections:
                url = f"{base_url}{section}"
                section_name = section.strip("/").split("/")[-1]  # e.g. "education"

                for attempt in range(1, MAX_RETRIES + 1):
                    try:
                        print(f"Navigating to section: {url} (Attempt {attempt})")
                        await page.goto(url)
                        await asyncio.sleep(3)

                        page_title = await page.title()
                        print(f"Section name: {section_name}")

                        # The capitalised section name is expected in the page
                        # title; anything else is treated as a failed load.
                        if section_name.capitalize() in page_title:
                            scraped_data[section_name] = await page.content()
                            print(f"HTML content for {section_name} saved in memory.")
                            print("*" * 50)
                            break  # Success: move on to the next section

                        print(f"Page {url} did not load correctly. Retrying...")
                    except Exception as e:
                        print(f"Failed to scrape {url}: {e}")

                    await asyncio.sleep(5)  # Wait a bit before retrying
                else:
                    # Reached only when no attempt hit the `break` above.
                    print(f"Failed to scrape {url} after {MAX_RETRIES} attempts. Skipping...")
        finally:
            # Close the browser on both the success and the error path.
            print("Closing the browser...")
            await browser.close()

    return scraped_data  # Return all scraped HTML as a dictionary



In [4]:
scraped_html_data = await scrape_user_profile(user_url, email, password)

from bs4 import BeautifulSoup

# Example HTML from the dictionary
html_skills = scraped_html_data.get("skills")
html_education = scraped_html_data.get("education")
html_experience = scraped_html_data.get("experience")

# Parse HTML
soup_skill = BeautifulSoup(html_skills, "html.parser")
soup_education = BeautifulSoup(html_education, "html.parser")
soup_experience = BeautifulSoup(html_experience, "html.parser")


Navigating to LinkedIn login page...
Logging into LinkedIn...
**************************************************
Navigating to section: https://www.linkedin.com/in/daniel-behar-168647280/details/education/ (Attempt 1)
Section name: education
HTML content for education saved in memory.
**************************************************
Navigating to section: https://www.linkedin.com/in/daniel-behar-168647280/details/skills/ (Attempt 1)
Section name: skills
HTML content for skills saved in memory.
**************************************************
Navigating to section: https://www.linkedin.com/in/daniel-behar-168647280/details/experience/ (Attempt 1)
Section name: experience
HTML content for experience saved in memory.
**************************************************
Closing the browser...


In [5]:
from re import sub

# Placeholder written whenever a field cannot be found in the HTML.
_EXP_NA = "N/A"

# Class strings of the bold title line, in lookup order (linked variant first).
_EXP_TITLE_CLASSES = (
    "display-flex align-items-center mr1 hoverable-link-text t-bold",
    "display-flex align-items-center mr1 t-bold",
)
# Class string of the rows that hold the free-text description and the skills line.
_EXP_DETAIL_CLASS = "display-flex align-items-center t-14 t-normal t-black"


def _exp_hidden_text(node):
    """Return the stripped text of the first aria-hidden span in `node`, or None."""
    if node is None:
        return None
    span = node.find("span", {"aria-hidden": "true"})
    return span.get_text(strip=True) if span else None


def _exp_title(node):
    """Return the bold title line of `node` (job title or company name), or "N/A"."""
    if node is None:
        return _EXP_NA
    container = (
        node.find("div", class_=_EXP_TITLE_CLASSES[0])
        or node.find("div", class_=_EXP_TITLE_CLASSES[1])
    )
    return _exp_hidden_text(container) or _EXP_NA


def _exp_company_and_type(node):
    """Split the "company · job type" line of `node` into (company, job_type)."""
    text = _exp_hidden_text(node.find("span", class_="t-14 t-normal"))
    if text is None:
        return _EXP_NA, _EXP_NA
    if "·" in text:
        company, job_type = (part.strip() for part in text.split("·", 1))
        return company, job_type
    # Without a separator the whole line is reported as the job type.
    # NOTE(review): for a single-role entry this line may be the company name - confirm.
    return _EXP_NA, text


def _exp_duration_and_location(node):
    """Return (duration, location) from the light-grey info lines of `node`."""
    spans = node.find_all("span", class_="t-14 t-normal t-black--light")
    info = [text for text in map(_exp_hidden_text, spans) if text is not None]
    duration = info[0] if len(info) > 0 else _EXP_NA
    location = info[1] if len(info) > 1 else _EXP_NA
    return duration, location


def _exp_description_and_skills(node):
    """Return (description, skills) from the detail rows of `node`.

    With two or more rows the first is the description and the second the
    skills line. A single row is classified by whether it contains "Skills:",
    so both values are always defined for every role.
    """
    rows = node.find_all("div", class_=_EXP_DETAIL_CLASS)
    texts = [text for text in map(_exp_hidden_text, rows) if text is not None]
    if len(texts) == 1 and "Skills:" in texts[0]:
        return _EXP_NA, texts[0].replace("Skills:", "").strip()
    description = texts[0] if len(texts) > 0 else _EXP_NA
    skills = texts[1].replace("Skills:", "").strip() if len(texts) > 1 else _EXP_NA
    return description, skills


def _exp_format_single_role(record):
    """Format a record that holds one position at one company."""
    job_name = _exp_title(record)
    company_name, job_type = _exp_company_and_type(record)
    job_duration, job_location = _exp_duration_and_location(record)
    additional_content, skills = _exp_description_and_skills(record)
    return (
        f"Job Title     : {job_name}\n"
        f"Company       : {company_name}\n"
        f"Job Type      : {job_type}\n"
        f"Job Duration  : {job_duration}\n"
        f"Location      : {job_location}\n"
        f"Description   : {additional_content}\n"
        f"Skills        : {skills}\n"
        f"{'-' * 50}\n"
    )


def _exp_format_company_header(record):
    """Format the company name and total duration of a multi-role record."""
    title_container = record.find("div", class_="display-flex flex-row justify-space-between")
    company_name = _exp_title(title_container)
    duration_container = (
        title_container.find("span", class_="t-14 t-normal") if title_container else None
    )
    duration = _exp_hidden_text(duration_container) or _EXP_NA
    return (
        f"Company       : {company_name}\n"
        f"Duration      : {duration}\n"
        f"{'-' * 50}\n"
    )


def _exp_format_grouped_role(sub_record):
    """Format one role listed under a multi-role company record."""
    job_name = _exp_title(sub_record)
    # The company part is not printed here; the company header already carries it.
    _, job_type = _exp_company_and_type(sub_record)
    job_duration, job_location = _exp_duration_and_location(sub_record)
    additional_content, skills = _exp_description_and_skills(sub_record)
    return (
        f"  Job Title    : {job_name}\n"
        f"  Job Type     : {job_type}\n"
        f"  Job Duration : {job_duration}\n"
        f"  Location     : {job_location}\n"
        f"  Description  : {additional_content}\n"
        f"  Skills       : {skills}\n"
        f"  {'-' * 50}\n"
    )


# Initialize the output variable to collect results
output = "Experience Section:\n\n"

# Locate the experience section in the HTML using its aria-label
experience_section = soup_experience.find("main", {"aria-label": "Experience"})

# Find all job records within the experience section
if experience_section is None:
    print("Warning: experience section not found in the scraped HTML.")
    records = []
else:
    records = experience_section.find_all("li", class_="pvs-list__paged-list-item artdeco-list__item pvs-list__item--line-separated pvs-list__item--one-column")

# Process each job record
for record in records:
    # Nested list items mean several roles at the same company
    sub_records = record.find_all("li", class_="pvs-list__paged-list-item pvs-list__item--one-column")

    if len(sub_records) == 0:
        output += _exp_format_single_role(record)
        continue

    output += _exp_format_company_header(record)
    for sub_record in sub_records:
        output += _exp_format_grouped_role(sub_record)


In [6]:
def _edu_hidden_text(container, separator=""):
    """Return the text of the first aria-hidden span in `container`.

    Returns "" when the container or the span is missing, so callers can fall
    back to "N/A" with a plain truthiness check.
    """
    if container is None:
        return ""
    span = container.find("span", {"aria-hidden": "true"})
    return span.get_text(separator, strip=True) if span else ""


# Append the education section header to the output.
# Note: re-running this cell appends the section again; re-run the experience
# cell first, since that is where `output` is reset.
output += "\nEducation Section:\n\n"

# Locate the education section in the HTML
education_section = soup_education.find("main", {"aria-label": "Education"})

# Find all education records within the section
if education_section is None:
    print("Warning: education section not found in the scraped HTML.")
    education_records = []
else:
    education_records = education_section.find_all("div", {"data-view-name": "profile-component-entity"})

# Process each education record
for record in education_records:
    # Extract institution name; the title line is looked up in its linked
    # variant first, then in the plain variant the experience cell also handles.
    institution_container = record.find("div", class_="display-flex align-items-center mr1 hoverable-link-text t-bold") or \
                            record.find("div", class_="display-flex align-items-center mr1 t-bold")
    institution_name = _edu_hidden_text(institution_container) or "N/A"
    output += f"Institution     : {institution_name}\n"

    # Extract date range (full text of the caption wrapper)
    date_container = record.find("span", class_="pvs-entity__caption-wrapper")
    date_range = date_container.get_text(strip=True) if date_container else "N/A"
    output += f"Date Range      : {date_range}\n"

    # Extract additional description (e.g., degree or field of study)
    description_container = record.find("span", class_="t-14 t-normal")
    description_text = _edu_hidden_text(description_container) or "N/A"
    output += f"Description     : {description_text}\n"

    # Extract skills (if available): drop the "Skills:" label and split on "·"
    skills_container = record.find("div", class_="display-flex align-items-center t-14 t-normal t-black")
    skills_text = _edu_hidden_text(skills_container).replace("Skills:", "").strip()
    skills = [skill.strip() for skill in skills_text.split("·") if skill.strip()]
    output += f"Skills          : {', '.join(skills) if skills else 'N/A'}\n"

    # Extract additional text (e.g., detailed explanations)
    additional_text_container = record.find("div", class_="inline-show-more-text--is-collapsed")
    additional_text = _edu_hidden_text(additional_text_container, " ")
    output += f"Additional Text : {additional_text if additional_text else 'N/A'}\n"

    # Append a separator for readability
    output += "-" * 50 + "\n"



In [7]:
# Collect the <a> elements that carry data-field="skill_page_skill_topic".
skill_links = soup_skill.find_all("a", {"data-field": "skill_page_skill_topic"})

# Take the text of each link's aria-hidden span; links without one are skipped.
skill_spans = (link.find("span", {"aria-hidden": "true"}) for link in skill_links)
skills = [span.get_text(strip=True) for span in skill_spans if span]

# Remove duplicates and sort before reporting.
unique_skills = sorted(set(skills))

skills_report = [
    "\nSkills Section:\n",
    f"Unique skills: {', '.join(unique_skills)}\n",
    "-" * 50 + "\n",
]
output += "".join(skills_report)



In [8]:
# Name of the report file that is offered for download.
file_name = "profile_data.txt"

# Write the collected report. The encoding is set explicitly because the text
# contains non-ASCII characters and the platform default may not be UTF-8;
# this also matches how the job details file is written.
with open(file_name, "w", encoding="utf-8") as file:
    file.write(output)

# Provide a link to download the file
from google.colab import files
files.download(file_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
from playwright.async_api import async_playwright
import asyncio

async def scrape_job_posting(job_url):
    """Scrape a LinkedIn job posting and save its details to a text file.

    Args:
        job_url: URL of the job posting, e.g.
            "https://www.linkedin.com/jobs/view/job_number/".

    Returns:
        The name of the written file ("job_details.txt"), or None when the
        page could not be loaded or an expected element was not found.
    """
    output_file = "job_details.txt"

    async with async_playwright() as p:
        # Launch browser in headless mode
        browser = await p.chromium.launch(headless=True)

        # Navigation happens inside the try block so that a failed page load is
        # reported like any other scraping error and the browser is still closed.
        try:
            context = await browser.new_context(user_agent="Mozilla/5.0")
            page = await context.new_page()

            # Navigate to the provided job URL
            print(f"Navigating to {job_url}...")
            await page.goto(job_url)
            await asyncio.sleep(5)  # Wait for the page to fully load

            # Extract job details using selectors (adjust selectors as needed)
            job_title = await page.inner_text('h1.top-card-layout__title')
            company = await page.inner_text('a.topcard__org-name-link')
            location = await page.inner_text('span.topcard__flavor--bullet')
            description = await page.inner_text('div.show-more-less-html__markup')

            # Prepare job details for saving
            job_details = (
                f"Job Title    : {job_title}\n"
                f"Company      : {company}\n"
                f"Location     : {location}\n"
                f"Description  : {description}\n"
            )

            # Save job details to a text file
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(job_details)

            print(f"Job details saved to {output_file}")
            return output_file

        except Exception as e:
            print(f"An error occurred while scraping: {e}")
            return None

        finally:
            # Close the browser
            await browser.close()


In [10]:
# Scrape the job posting entered earlier. scrape_job_posting returns the name
# of the file it wrote, or None when scraping failed.
output_file = await scrape_job_posting(job_url)

# Offer the file for download only when scraping produced one.
if output_file:
    from google.colab import files
    files.download(output_file)


Navigating to https://www.linkedin.com/jobs/view/4114601385/...
Job details saved to job_details.txt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>