In [1]:
import requests
from bs4 import BeautifulSoup
import random
import pandas as pd
import re
from IPython.display import display, HTML

In [2]:
# Title of job in search
title = "Data Scientist"

# Location
location = "Los Angeles County"

In [3]:
# link for list of job postings on the left on linkedin
list_url = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data%2Bscientist&location=los%2Bangeles%2Bcounty&geoId=&trk=public_jobs_jobs-search-bar_search-submit&start=25"

response = requests.get(list_url)

list_data = response.text
list_soup = BeautifulSoup(list_data, "html.parser")
page_jobs = list_soup.findAll("li")

  page_jobs = list_soup.findAll("li")


In [4]:
job_id_list = []

for job in page_jobs:
    # find job_ids
    base_card_div = job.find("div", {"class": "base-card"})
    job_id = base_card_div.get("data-entity-urn").split(":")[3]
    job_id_list.append(job_id)
    

In [5]:
def clean_description(description):
    if description:
        description = description.text
        # Split by line breaks to preserve bullet points
        clean_desc = ' '.join(description.splitlines())  
        clean_desc = clean_desc.lower()

        return clean_desc


In [29]:
job_list = []
for job_id in job_id_list:
    job_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
    job_response = requests.get(job_url)
    job_soup = BeautifulSoup(job_response.text, "html.parser")

    # Dictionary for each job posting
    job_post = {}
    
    # Clean description used to parse for fields
    raw_description = job_soup.find("div", {"class": "description__text description__text--rich"})
    cleaned_desc = clean_description(raw_description)

    # company name
    try:
        job_post["company_name"] = job_soup.find("a",{"class": "topcard__org-name-link topcard__flavor--black-link"}).text.strip()
    except:
        job_post["company_name"] = None
    
    # company location
    try: 
        job_post["location"] = job_soup.find("span", {"class": "topcard__flavor topcard__flavor--bullet"}).text.strip()
    except:
        job_post["location"] = None
    
    # Role Title
    try:
        job_post["title"] = job_soup.find("h2", {"class":"top-card-layout__title font-sans text-lg papabear:text-xl font-bold leading-open text-color-text mb-0 topcard__title"}).text.strip()
    except:
        job_post["title"] = None

    # Seniority Level
    try:
        level_header = job_soup.find("h3", {"class": "description__job-criteria-subheader"})
        sen_level = level_header.find_next_sibling("span").text.strip()
        job_post["seniority_level"] = sen_level
    except:
        job_post["seniority_level"] = None
        
    # Employment Type (Fulltime, Part-time, internship)
    try:
        level_header = job_soup.find("h3", {"class": "description__job-criteria-subheader"})
        emp_header = level_header.find_next("h3", {"class": "description__job-criteria-subheader"})
        emp_type = emp_header.find_next_sibling("span").text.strip()
        job_post["employment_type"] = emp_type
        
    except:
        job_post["job_type"] = "Job type not specified"
    
    # Years of Experience
    try:
        pattern = r'(\d+)(?:\+|\-?\d*)\s*years?'
        matches = re.findall(pattern, cleaned_desc)
        if matches:
            numbers = [match for match in matches]
            if min(numbers) == '18':
                job_post["yoe"] = None
            else:
                job_post["yoe"] = min(numbers)
        else:
            job_post["yoe"] = None
    except:
        job_post["yoe"] = None

    # Degree Required
    try:
        pattern = r"(?i)\b(bachelor[’']?s|master[’']?s|ph\.?d|doctorate|b\.?s\.?|m\.?s\.)\b"
        edu = re.findall(pattern, cleaned_desc)

        degree_map = {"bs": "bachelor's", "ms": "master's", "b.s": "bachelor's", "m.s": "master's"}
        formatted_matches = [degree_map.get(match, match) for match in edu]

        if len(formatted_matches) > 1:
            result = " or ".join(formatted_matches)
        else:
            result = formatted_matches[0] if formatted_matches else "No degree found"
        
        job_post["education"] = result
    except:
        job_post["education"] = None

    # Salary
    try:
        job_post["salary"] = job_soup.find("div", {"class": "salary compensation__salary"}).text.strip()
    
    except AttributeError:
        pattern = r"\$[\d,]+(?:\.\d{2})?\s?-\s?\$[\d,]+(?:\.\d{2})?"
        salary_range = re.findall(pattern, cleaned_desc)
        salary_string = " ".join(salary_range) if salary_range else "No salary found"
        job_post["salary"] = salary_string

    # Programming Languages (Python, R, SQL, Java, Scala)
    languages = ['python', 'r', 'sql', 'java', 'scala', 'c++', 'julia', 'matlab', 'dax', 'vba', 'mdx', 'javascript', 'typescript', 'bash', 'shell']

    try:
        pattern = r'\b(?:' + '|'.join(re.escape(lang) for lang in languages) + r')\b'
        matches = re.findall(pattern, cleaned_desc)
        matches = list(set(matches))
        if len(matches) == 0:
            job_post["programming_languages"] = None
        else:
            job_post["programming_languages"] = matches

    
    except:
        job_post["programming_languages"] = None


    # ML Skills (TensorFlow, PyTorch, Scikit-learn, Keras)
    ml_ai_skills = ['tensorflow', 'pytorch', 'scikit-learn', 'keras', 'xgboost', 'lightgbm', 'catboost', 'fastai', 'mlflow', 'onnx', 'apache mxnet', 
                'hugging face transformers', 'nltk', 'spacy', 'gensim', 'faiss', 'sentence-transformers', 'h2o.ai', 'deepchem', 
                'supervised learning', 'unsupervised learning', 'reinforcement learning', 'graph neural networks', 'computer vision', 
                'nlp', 'bayesian networks', 'logistic regression', 'linear regression', 'decision trees', 'random forest', 'gradient boosting', 
                'support vector machines', 'k-means clustering', 'hierarchical clustering', 'gaussian mixture models', 'hidden markov models', 
                'principal component analysis', 'singular value decomposition', 'time-series forecasting', 'anomaly detection', 'jax', 'genai', 'generativeai']

    try:
        
        pattern = r'\b(?:' + '|'.join(re.escape(skill) for skill in ml_ai_skills) + r')\b'
        matches = re.findall(pattern, cleaned_desc)
        matches = list(set(matches))
        if len(matches) == 0:
            job_post["ml_skills"] = None
        else:
            job_post["ml_skills"] = matches

    
    except:
        job_post["ml_skills"] = None

    # Data Processing & Databases (ETL, Pandas, Spark, Hadoop, Snowflake)
    data_processing_databases = ['etl', 'pandas', 'spark', 'hadoop', 'snowflake', 'airflow', 'dbt', 'kafka', 'redshift', 'bigquery', 'presto', 
                             'trino', 'hive', 'pig', 'databricks', 'delta lake', 'iceberg', 'hudi', 'cassandra', 'mongodb', 'couchdb', 
                             'neo4j', 'arangodb', 'firebase', 'postgresql', 'mysql', 'mariadb', 'sqlite', 'oracle', 'sql server', 
                             'teradata', 'vertica', 'clickhouse', 'greenplum', 'impala', 'exasol', 'druid', 'tidb', 'scyllaDB', 'rockset', 
                             'elasticsearch', 'splunk', 'opensearch', 'flink', 'beam', 'storm', 'dask', 'modin', 'ray[data]', 
                             'polars', 'duckdb', 'voltdb', 'timestream', 'timescaledb', 'influxdb', 'prometheus', 'graphdb', 
                             'yugabyte', 'foundationdb', 'faunadb', 'cosmos db', 'dynamodb']

    try:
        
        pattern = r'\b(?:' + '|'.join(re.escape(skill) for skill in data_processing_databases) + r')\b'
        matches = re.findall(pattern, cleaned_desc)
        matches = list(set(matches))
        if len(matches) == 0:
            job_post["data_processing_db"] = None
        else:
            job_post["data_processing_db"] = matches

    
    except:
        job_post["data_processing_db"] = None


    # Cloud Platforms (AWS, GCP, Azure, OCI)
    cloud_devops_platforms = ['aws', 'gcp', 'azure', 'oci', 'databricks', 'kubernetes', 'docker', 'terraform', 'cloudformation', 'ansible', 
                          'lambda', 'cloud run', 'sagemaker', 'vertex ai', 'azure ml', 'redshift spectrum', 'biglake', 'synapse', 
                          'athena', 'lake formation', 'snowpark', 'gitlab ci/cd', 'github actions', 'jenkins', 'circleci', 'argo workflows', 
                          'kubeflow', 'mlflow', 'airflow', 'dagster', 'prefect', 'kedro', 'bentoml', 'seldon', 'ray[serve]']

    try:
        
        pattern = r'\b(?:' + '|'.join(re.escape(skill) for skill in cloud_devops_platforms) + r')\b'
        matches = re.findall(pattern, cleaned_desc)
        matches = list(set(matches))
        if len(matches) == 0:
            job_post["cloud"] = None
        else:
            job_post["cloud"] = matches

    
    except:
        job_post["cloud"] = None

    # Visualization Tools (Tableau, Power BI, D3.js)
    visualization_tools = ['tableau', 'power bi', 'd3.js', 'looker', 'superset', 'metabase', 'plotly', 'matplotlib', 'seaborn', 'ggplot2', 
                       'altair', 'vega', 'dash', 'streamlit', 'shiny', 'holoviews', 'bokeh', 'oac']

    try:
        
        pattern = r'\b(?:' + '|'.join(re.escape(skill) for skill in visualization_tools) + r')\b'
        matches = re.findall(pattern, cleaned_desc)
        matches = list(set(matches))
        if len(matches) == 0:
            job_post["visualization"] = None
        else:
            job_post["visualization"] = matches

    
    except:
        job_post["data_processing_db"] = None
    # Big Data Tools (Airflow, Kafka, Kubernetes, Golang)
    big_data_tools = ['airflow', 'kafka', 'kubernetes', 'flink', 'storm', 'beam', 'pulsar', 'druid', 'clickhouse', 'redpanda', 'spark streaming', 'terraform']

    try:
        
        pattern = r'\b(?:' + '|'.join(re.escape(skill) for skill in big_data_tools) + r')\b'
        matches = re.findall(pattern, cleaned_desc)
        matches = list(set(matches))
        if len(matches) == 0:
            job_post["big_data"] = None
        else:
            job_post["big_data"] = matches

    
    except:
        job_post["big_data"] = None

    # product skills
    product_skills = ['a/b testing', 'a/b', 'experimentation', 'causal inference', 'growth analytics', 'product analytics', 
                  'feature engineering', 'model deployment', 'mlops', 'explainability', 'ai ethics', 'privacy-preserving ml']

    try:
        
        pattern = r'\b(?:' + '|'.join(re.escape(skill) for skill in product_skills) + r')\b'
        matches = re.findall(pattern, cleaned_desc)
        matches = list(set(matches))
        if len(matches) == 0:
            job_post["product_skills"] = None
        else:
            job_post["product_skills"] = matches

    
    except:
        job_post["product_skills"] = None

    # Description
    try:
        job_post["description"] = cleaned_desc
        
    except:
        job_post["description"] = "No description found"
    

    job_list.append(job_post)

In [30]:
jobs_df = pd.DataFrame(job_list)
jobs_df


Unnamed: 0,company_name,location,title,seniority_level,employment_type,yoe,education,salary,programming_languages,ml_skills,data_processing_db,cloud,visualization,big_data,product_skills,description
0,favorited,"Santa Monica, CA","Data Scientist - T&S, Fraud and CX",Mid-Senior level,Full-time,3.0,No degree found,No salary found,"[python, sql, r]",,,,"[power bi, tableau]",,,apply to this position if you …are passiona...
1,Keeling Labs,"Los Angeles, CA",Machine Learning Engineer,Entry level,Full-time,3.0,phd,"$130,000.00/yr - $165,000.00/yr",[python],"[jax, reinforcement learning]",,"[aws, terraform]",,[terraform],,"our missionfounded in 2022, keeling labs wa..."
2,Los Angeles Dodgers,"Los Angeles, CA",Research Engineer,Associate,Full-time,3.0,bachelor's,"$120,000.00/yr - $130,000.00/yr","[python, sql, bash]",,[postgresql],[aws],,,,title: research engineerdepartment: basebal...
3,Passes,"Los Angeles, CA",Product Data Scientist,Entry level,Full-time,6.0,bachelor's,No salary found,"[python, r]",,"[redshift, etl, dbt, snowflake, spark]",[athena],,,"[a/b testing, a/b, product analytics, experime...",about passespasses is a leading platform de...
4,PRIMUS Global Technologies Pvt Ltd,"Los Angeles, CA",Clinical Data Scientist – 56099,Mid-Senior level,Full-time,,masters or bachelors,No salary found,"[r, matlab]",,,,,,,we have an immediate long-term op...
5,The Walt Disney Company,"Santa Monica, CA","Hulu Data Science Graduate Intern, Summer 2025",Mid-Senior level,Internship,,master's,No salary found,"[r, sql, python]",,,,,,"[causal inference, experimentation]",about the role & program:join the experimen...
6,tvScientific,"Los Angeles, CA",Senior Data Scientist,Mid-Senior level,Full-time,6.0,No degree found,"$162,021.00/yr - $189,000.00/yr","[python, scala]",,"[beam, spark]","[athena, aws]",,[beam],,job title: senior data scientist (ads)locat...
7,Disney Entertainment,"Santa Monica, CA","Data Science Grad Intern, Summer 2025",Mid-Senior level,Internship,,master’s,No salary found,"[python, sql]",[unsupervised learning],"[bigquery, airflow, etl]",[airflow],,[airflow],[feature engineering],about the role and programsupporting disney...
8,Suno,"Los Angeles, CA","Senior Data Scientist, Product",Mid-Senior level,Full-time,5.0,No degree found,"$170,000.00/yr - $230,000.00/yr","[python, sql]",,,,,,[experimentation],"about sunoat suno, we are building a future..."
9,Snap Inc.,"Los Angeles, CA","Machine Learning Engineer, Generative AI",Entry level,Full-time,,bachelor’s or master's or phd,"$100,000.00/yr - $176,000.00/yr","[python, c++]","[genai, pytorch]",,,,,,snap inc is a technology company....
