In [1]:
import requests
import json
import pandas as pd
import time
from bs4 import BeautifulSoup
import numpy as np

In [2]:
import os

# Directory (relative to where the kernel starts) that holds the project files.
PROJECT_DIR = "SISTECH-MLOps-FinalProject-Group7"

# Only change directory when we are not already inside the project folder.
# A bare relative os.chdir() fails with FileNotFoundError when this cell is
# executed a second time, because the kernel is then already inside it.
if os.path.basename(os.getcwd()) != PROJECT_DIR:
    os.chdir(PROJECT_DIR)

# EdX Course

In [6]:
# Scrape the edX catalogue through the Algolia multi-query endpoint and save
# one row per course/program to scrape_result/edx_courses.csv.

url = "https://igsyv1z1xi-dsn.algolia.net/1/indexes/*/queries"
headers = {
    "x-algolia-agent": "Algolia for JavaScript (5.0.0); Browser",
    # The key can be supplied through the environment so it does not have to be
    # edited in the notebook; the original value is kept as the fallback.
    # NOTE(review): presumably a public, search-only key — confirm before sharing.
    "x-algolia-api-key": os.environ.get(
        "EDX_ALGOLIA_API_KEY", "6658746ce52e30dacfdd8ba5f8e8cf18"
    ),
    "x-algolia-application-id": "IGSYV1Z1XI",
    "content-type": "application/json"
}

payload = {
    "requests": [
        {
            "indexName": "product",
            "clickAnalytics": True,
            "facets": [
                "availability", "language", "learning_type", "level",
                "partner", "product", "program_type", "skills.skill", "subject"
            ],
            "hitsPerPage": 150,
            "page": 0,
            "filters": "",
            "query": ""
        }
    ]
}

REQUEST_TIMEOUT = 30  # seconds to wait for the API before giving up
CRAWL_DELAY = 10      # seconds to pause between consecutive page requests


def _html_to_text(value):
    """Return the visible text of an HTML fragment, or "Missing" when it is None.

    Guards against a key that is present with a null value, which would
    otherwise be handed straight to BeautifulSoup.
    """
    if value is None:
        return "Missing"
    return BeautifulSoup(value, "html.parser").get_text()


def _skill_names(course):
    """Return the skill names of a hit, or ["Missing"] when `skills` is not a list.

    Entries that are not dicts, or dicts without a "skill" key, are skipped
    instead of raising KeyError.
    """
    skills = course.get("skills", [])
    if not isinstance(skills, list):
        return ["Missing"]
    return [s["skill"] for s in skills if isinstance(s, dict) and "skill" in s]


def _build_record(course):
    """Flatten one search hit into the row layout written to the CSV."""
    ai_languages = course.get("ai_languages")
    if not isinstance(ai_languages, dict):
        # Covers both a missing key and an explicit null value.
        ai_languages = {}
    return {
        "title": course.get("title", "Missing"),
        "partner": course.get("partner", ["Missing"]),
        "primary_description": _html_to_text(course.get("primary_description")),
        "secondary_description": _html_to_text(course.get("secondary_description")),
        "tertiary_description": _html_to_text(course.get("tertiary_description")),
        "availability": course.get("availability", ["Missing"]),
        "subject": course.get("subject", ["Missing"]),
        "level": course.get("level", ["Missing"]),
        "language": course.get("language", ["Missing"]),
        "product": course.get("product", "Missing"),
        "program_type": course.get("program_type", ["Missing"]),
        "staff": course.get("staff", ["Missing"]),
        "translation_language": ai_languages.get("translation_languages", ["Missing"]),
        "transcription_language": ai_languages.get("transcription_languages", ["Missing"]),
        "recent_enrollment_count": course.get("recent_enrollment_count", "Missing"),
        "marketing_url": course.get("marketing_url", "Missing"),
        "weeks_to_complete": course.get("weeks_to_complete", "Missing"),
        "skill": _skill_names(course),
    }


all_courses = []
product_type = ["course", "program"]

for ptype in product_type:
    # Reset the pagination state for every product type. Previously it was only
    # reset on the "last page reached" exit, so an early exit on an empty page
    # made the next product type start from a stale page number and page count.
    page = 0
    total_pages = None

    while True:
        print(f"[INFO] Fetching page {page} for product {ptype}...")
        payload["requests"][0]["page"] = page
        payload["requests"][0]["filters"] = f"product:{ptype}"
        response = requests.post(url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT)
        # Fail with the HTTP status instead of a confusing JSON/KeyError later on.
        response.raise_for_status()
        result = response.json()["results"][0]

        hits = result["hits"]
        if total_pages is None:
            total_pages = result.get("nbPages", 1)
            print(f"[INFO] Total pages available for product {ptype}: {total_pages}")

        if not hits:
            break

        all_courses.extend(_build_record(course) for course in hits)

        page += 1
        if page >= total_pages:
            break

        print(f"[INFO] Sleeping {CRAWL_DELAY} seconds to respect crawl delay...")
        time.sleep(CRAWL_DELAY)


df = pd.DataFrame(all_courses)
# Make sure the target folder exists; to_csv does not create directories.
os.makedirs("scrape_result", exist_ok=True)
df.to_csv("scrape_result/edx_courses.csv", index=False)
print("[DONE] Saved to edx_courses.csv")

[INFO] Fetching page 0 for product course...
[INFO] Total pages available for product course: 7
[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 1 for product course...
[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 2 for product course...
[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 3 for product course...
[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 4 for product course...
[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 5 for product course...
[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 6 for product course...
[INFO] Fetching page 0 for product program...
[INFO] Total pages available for product program: 5


  "primary_description": BeautifulSoup(course.get("primary_description", "Missing"), "html.parser").get_text(),


[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 1 for product program...
[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 2 for product program...


  "tertiary_description": BeautifulSoup(course.get("tertiary_description", "Missing"), "html.parser").get_text(),


[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 3 for product program...
[INFO] Sleeping 10 seconds to respect crawl delay...
[INFO] Fetching page 4 for product program...
[DONE] Saved to edx_courses.csv


# LinkedIn Jobs

In [None]:
# Collect job IDs from LinkedIn's guest job-search endpoint, first pass
# (result offsets 0..490 for every value of the f_E filter).
location = "Indonesia"
exp_levels = range(1, 7)  # values sent as the f_E query parameter
id_list = []

for exp_level in exp_levels:
    for start in range(0, 500, 10):
        list_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?location={location}&f_E={exp_level}&start={start}"
        try:
            response = requests.get(list_url, timeout=30)
        except requests.RequestException as exc:
            # A single network failure should not abort the whole crawl.
            print(f"[WARN] Request failed for f_E={exp_level}, start={start}: {exc}")
            time.sleep(2)
            continue

        if response.status_code != 200:
            # Report throttled/failed pages instead of silently parsing them as empty.
            print(f"[WARN] HTTP {response.status_code} for f_E={exp_level}, start={start}")

        list_soup = BeautifulSoup(response.text, "html.parser")
        page_jobs = list_soup.find_all("li")

        for job in page_jobs:
            base_card_div = job.find(class_="base-card")
            if base_card_div is None:
                # Not every <li> is guaranteed to wrap a job card.
                continue
            urn_parts = (base_card_div.get("data-entity-urn") or "").split(":")
            if len(urn_parts) > 3:
                # The job ID is the fourth colon-separated field of the URN.
                id_list.append(urn_parts[3])
        time.sleep(2)

In [17]:
# Try again so we can get 1000 jobs for each level.
# The first pass stopped at offset 490, so this pass continues at 500 and ends
# at 990. The earlier 510..1000 range skipped offset 500 and overshot the
# intended 1000 results per level.
for exp_level in exp_levels:
    for start in range(500, 1000, 10):
        list_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?location={location}&f_E={exp_level}&start={start}"
        try:
            response = requests.get(list_url, timeout=30)
        except requests.RequestException as exc:
            # A single network failure should not abort the whole crawl.
            print(f"[WARN] Request failed for f_E={exp_level}, start={start}: {exc}")
            time.sleep(2)
            continue

        if response.status_code != 200:
            # Report throttled/failed pages instead of silently parsing them as empty.
            print(f"[WARN] HTTP {response.status_code} for f_E={exp_level}, start={start}")

        list_soup = BeautifulSoup(response.text, "html.parser")
        page_jobs = list_soup.find_all("li")

        for job in page_jobs:
            base_card_div = job.find(class_="base-card")
            if base_card_div is None:
                # Not every <li> is guaranteed to wrap a job card.
                continue
            urn_parts = (base_card_div.get("data-entity-urn") or "").split(":")
            if len(urn_parts) > 3:
                # The job ID is the fourth colon-separated field of the URN.
                id_list.append(urn_parts[3])
        time.sleep(2)

In [24]:
# Number of job IDs gathered by the collection cells above; IDs are appended
# without de-duplication, so repeated IDs are counted too.
len(id_list)

4510

In [None]:
# Fetch the detail page of each collected job ID and turn it into one record.

# Initialize an empty list to store job information.
# NOTE(review): this reset runs on every execution, so re-running the cell with
# a different slice discards the postings gathered by earlier chunks — confirm
# how the chunks are meant to be combined.
job_list = []

# The User-Agent is identical for every request, so it is built once here
# instead of on every loop iteration.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}

# Maps the criteria sub-header text found on a posting to the record key.
_CRITERIA_KEYS = {
    "Seniority level": "level",
    "Employment type": "employment_type",
    "Job function": "job_function",
    "Industries": "industries",
}


def _text_or_none(node):
    """Return the stripped text of a parsed element, or None when it was not found."""
    return node.text.strip() if node is not None else None


def _nth_list_items(lists, position):
    """Return the stripped <li> texts of lists[position], or None when no such list exists."""
    if position >= len(lists):
        return None
    return [li.text.strip() for li in lists[position].find_all("li")]


# chunk by chunk to avoid rate limit issues
for job_id in id_list[4432:4510]:
    # Construct the URL for each job using the job ID
    job_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"

    # Send a GET request to the job URL and parse the response. A network error
    # yields an empty document, so the job still gets a row (with empty fields)
    # and the run continues instead of losing the whole chunk.
    try:
        job_response = requests.get(job_url, headers=headers, timeout=30)
        page_html = job_response.text
    except requests.RequestException as exc:
        print(f"[WARN] Request for job {job_id} failed: {exc}")
        page_html = ""
    job_soup = BeautifulSoup(page_html, "html.parser")

    # Create a dictionary to store job details. Missing elements are handled
    # with explicit None checks rather than bare `except:` clauses, which also
    # swallowed KeyboardInterrupt and made the loop hard to stop.
    job_post = {}

    # Link to the posting, without its query string
    link_tag = job_soup.find("a", {"class": "topcard__link"})
    href = link_tag.get("href") if link_tag is not None else None
    job_post["job_link"] = href.strip().split("?")[0] if href is not None else None

    # Job title
    job_post["job_title"] = _text_or_none(
        job_soup.find("h2", {"class": "top-card-layout__title font-sans text-lg papabear:text-xl font-bold leading-open text-color-text mb-0 topcard__title"})
    )

    # Company name
    job_post["company_name"] = _text_or_none(
        job_soup.find("a", {"class": "topcard__org-name-link topcard__flavor--black-link"})
    )

    # Location
    job_post["location"] = _text_or_none(
        job_soup.find("span", {"class": "topcard__flavor topcard__flavor--bullet"})
    )

    # The first bullet list of the description is stored as responsibilities and
    # the second as requirements.
    # NOTE(review): this relies on the order of lists in the posting — confirm.
    description_lists = job_soup.select(".description__text.description__text--rich section ul")
    job_post["responsibilities"] = _nth_list_items(description_lists, 0)
    job_post["requirements"] = _nth_list_items(description_lists, 1)

    # Criteria headers and their values appear as two parallel lists; pair them
    # positionally instead of looking each header up again with list.index().
    criteria_titles = job_soup.find_all(class_="description__job-criteria-subheader")
    criteria_values = job_soup.find_all(class_="description__job-criteria-text")
    for criteria_title, criteria_value in zip(criteria_titles, criteria_values):
        key = _CRITERIA_KEYS.get(criteria_title.text.strip())
        if key:
            job_post[key] = criteria_value.text.strip()

    # Time posted
    job_post["time_posted"] = _text_or_none(
        job_soup.find("span", {"class": "posted-time-ago__text topcard__flavor--metadata"})
    )

    # Number of applicants
    job_post["num_applicants"] = _text_or_none(
        job_soup.find("span", {"class": "num-applicants__caption topcard__flavor--metadata topcard__flavor--bullet"})
    )

    # Append the job details to the job_list
    job_list.append(job_post)
    time.sleep(2)

In [61]:
# Turn the list of per-job dictionaries into a table and write it, without the
# row index, to a CSV file in the current working directory.
job_df = pd.DataFrame(data=job_list)
job_df.to_csv(path_or_buf="linkedin_jobs.csv", index=False)

In [62]:
# Preview the first rows of the scraped job postings table.
job_df.head()

Unnamed: 0,job_link,job_title,company_name,location,responsibilities,requirements,level,employment_type,job_function,industries,time_posted,num_applicants
0,https://id.linkedin.com/jobs/view/general-affa...,General Affair Internship,Kalbe Nutritionals (PT Sanghiang Perkasa),"West Karawang, West Java, Indonesia","[Membuat konten kreatif (foto, video, caption)...","[Mahasiswa aktif dari jurusan Komunikasi, Mana...",Internship,Internship,"Other, Information Technology, and Management",Food and Beverage Services,1 month ago,
1,https://id.linkedin.com/jobs/view/data-analyst...,Data Analyst Intern,PT Lion Super Indo,"Jakarta, Indonesia",[Understand the day-to-day issues that our bus...,[Student of Bachelor degree in Statistics or A...,Internship,Internship,Information Technology and Business Development,Retail,1 week ago,
2,https://id.linkedin.com/jobs/view/project-mana...,Project Management Internship,Kalbe Nutritionals (PT Sanghiang Perkasa),"West Karawang, West Java, Indonesia",[Assist in compiling & updating the project ti...,[Willing to be placed in Kalbe Morinaga Cikamp...,Internship,Internship,Project Management and Information Technology,Food and Beverage Services,1 week ago,
3,https://id.linkedin.com/jobs/view/improvement-...,Improvement Campaign & Communication Intern,Kalbe Nutritionals (PT Sanghiang Perkasa),"West Karawang, West Java, Indonesia",[Melaksanakan observasi lapangan dan pencatata...,[Mendukung pelaksanaan event TPM dan campaign ...,Internship,Internship,Marketing and Sales,Food and Beverage Services,1 week ago,
4,https://id.linkedin.com/jobs/view/management-t...,Management Trainee,PT Astra International Tbk,"Jakarta, Jakarta, Indonesia","[Gelar sarjana dari jurusan apa pun, Lulusan b...",,Internship,Full-time,Education and Training,Automation Machinery Manufacturing,2 months ago,


In [64]:
# Show each column's dtype and non-null count to see how many postings lack a field.
job_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4510 entries, 0 to 4509
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   job_link          4498 non-null   object
 1   job_title         4498 non-null   object
 2   company_name      4484 non-null   object
 3   location          4498 non-null   object
 4   responsibilities  4149 non-null   object
 5   requirements      3567 non-null   object
 6   level             4498 non-null   object
 7   employment_type   4498 non-null   object
 8   job_function      4491 non-null   object
 9   industries        4472 non-null   object
 10  time_posted       4417 non-null   object
 11  num_applicants    1358 non-null   object
dtypes: object(12)
memory usage: 422.9+ KB
