In [None]:
from bs4 import BeautifulSoup
from urllib.parse import quote_plus
import pandas as pd
import time
import random
import requests


In [None]:
def scrape_coursera_courses(user_query, max_courses=200):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    }

    encoded_query = quote_plus(user_query)
    courses = []
    page = 1
    per_page = 20

    while len(courses) < max_courses:
        start_index = (page - 1) * per_page
        url = f"https://www.coursera.org/search?query={encoded_query}&start={start_index}&sortBy=BEST_MATCH"

        try:
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')

            course_links = soup.find_all("a", class_="cds-119 cds-113 cds-115 cds-CommonCard-titleLink css-vflzcf cds-142")
            if not course_links:
                break

            for link_tag in course_links:
                if len(courses) >= max_courses:
                    break

                # Title & Link
                title_tag = link_tag.find("h3")
                title = title_tag.get_text(strip=True) if title_tag else None
                href = link_tag.get("href", "")
                full_link = f"https://www.coursera.org{href}" if href.startswith("/") else href

                # Card Container
                card = link_tag.find_parent("li") or link_tag.find_parent("div", class_="cds-ProductCard") or link_tag

                # Organization
                organization = None
                org_container = card.find("p", class_="cds-ProductCard-partnerNames")
                if org_container:
                    organization = org_container.get_text(strip=True)

                # Rating
                rating = None
                rating_tag = card.find("div", {"aria-label": "Rating"})
                if rating_tag:
                    rating = rating_tag.get("aria-valuenow")

                # Metadata
                metadata_text = []
                metadata_div = card.find("div", class_="cds-CommonCard-metadata")
                if metadata_div:
                    p_tags = metadata_div.find_all("p", class_="css-vac8rf")
                    metadata_text = [p.get_text(strip=True) for p in p_tags]
                metadata = ", ".join(metadata_text) if metadata_text else None

                # Skills
                skills = None
                skill_tag = card.find("div", class_="cds-CommonCard-bodyContent")
                if skill_tag:
                    p_tag = skill_tag.find("p", class_="css-vac8rf")
                    if p_tag:
                        skills = p_tag.get_text(strip=True).replace("Skills you'll gain:", "")

                courses.append({
                    "title": title,
                    "link": full_link,
                    "organization": organization,
                    "rating": rating,
                    "metadata": metadata,
                    "skills": skills
                })

            page += 1
            time.sleep(random.uniform(1, 2))

        except Exception as e:
            print(f"Error on page {page}: {e}")
            break

    print(f"Scraping completed: {len(courses)} courses found.")
    return pd.DataFrame(courses)



In [5]:
job_list = [
    "Data Analyst", "Backend Developer", "QA Tester",
    "Social Media Strategist", "Public Relations", "Tax Consultant",
    "Copywriter", "Financial Analyst", "Auditor",
    "HR Generalist", "Career Counselor", "User Researcher"
]

skill_list = [
    "Python", "Artificial Intelligence", "Excel",
    "Machine Learning", "SQL", "Project Management",
    "Power BI", "Marketing",'java', 'javascript','math',
    'cybersecurity', 'cloud computing', 'acounting', 'ios', 'android', 
    'web development', 'graphic design', 'hr', 'statistics', 'networking',
    'network', 'oop', 'social media', 'seo', 'presentation skills', 'communication skills'
]

# Simpan semua hasil scraping ke dalam satu list
all_dfs = []

# Scrape berdasarkan job title
for job in job_list:
    print(f"Scraping for job: {job}")
    df_job = scrape_coursera_courses(job, max_courses=150)
    df_job['Source'] = job  # menandai asal pencarian
    all_dfs.append(df_job)
    time.sleep(random.uniform(1, 2))  # anti-banned

# Scrape berdasarkan skill
for skill in skill_list:
    print(f"Scraping for skill: {skill}")
    df_skill = scrape_coursera_courses(skill, max_courses=150)
    df_skill['Source'] = skill
    all_dfs.append(df_skill)
    time.sleep(random.uniform(1, 2))

# Gabungkan semua dataframe dan hapus duplikat berdasarkan URL
df_all_course = pd.concat(all_dfs, ignore_index=True)

df_all_course.drop_duplicates(subset=["title", "link"], inplace=True)


Scraping for job: Data Analyst
Scraping completed: 150 courses found.
Scraping for job: Backend Developer
Scraping completed: 150 courses found.
Scraping for job: QA Tester
Scraping completed: 150 courses found.
Scraping for job: Social Media Strategist
Scraping completed: 150 courses found.
Scraping for job: Public Relations
Scraping completed: 150 courses found.
Scraping for job: Tax Consultant
Scraping completed: 150 courses found.
Scraping for job: Copywriter
Scraping completed: 150 courses found.
Scraping for job: Financial Analyst
Scraping completed: 150 courses found.
Scraping for job: Auditor
Scraping completed: 150 courses found.
Scraping for job: HR Generalist
Scraping completed: 150 courses found.
Scraping for job: Career Counselor
Scraping completed: 150 courses found.
Scraping for job: User Researcher
Scraping completed: 150 courses found.
Scraping for skill: Python
Scraping completed: 150 courses found.
Scraping for skill: Artificial Intelligence
Scraping completed: 150 c

In [6]:
print("Total unique courses scraped:", len(df_all_course))

Total unique courses scraped: 445


In [7]:
df_all_course

Unnamed: 0,title,link,organization,rating,metadata,skills,Source
0,IBM Data Analyst,https://www.coursera.org/professional-certific...,IBM,4.6,Beginner · Professional Certificate · 3 - 6 Mo...,"Skills you'll gain:Data Storytelling, Dashboar...",Data Analyst
1,Meta Data Analyst,https://www.coursera.org/professional-certific...,Meta,4.7,Beginner · Professional Certificate · 3 - 6 Mo...,"Skills you'll gain:Data Storytelling, Business...",Data Analyst
2,Google Data Analytics,https://www.coursera.org/professional-certific...,Google,4.8,Beginner · Professional Certificate · 3 - 6 Mo...,"Skills you'll gain:Data Storytelling, Rmarkdow...",Data Analyst
3,Excel Basics for Data Analysis,https://www.coursera.org/learn/excel-basics-da...,IBM,4.8,Beginner · Course · 1 - 3 Months,"Skills you'll gain:Excel Formulas, Microsoft E...",Data Analyst
4,Microsoft Power BI Data Analyst,https://www.coursera.org/professional-certific...,Microsoft,4.6,Beginner · Professional Certificate · 3 - 6 Mo...,"Skills you'll gain:Power BI, Microsoft Excel, ...",Data Analyst
...,...,...,...,...,...,...,...
5704,Developing Interpersonal Skills,https://www.coursera.org/learn/interpersonal-s...,IBM,4.7,Beginner · Course · 1 - 4 Weeks,"Skills you'll gain:Active Listening, Interpers...",communication skills
5706,Storytelling and influencing: Communicate with...,https://www.coursera.org/learn/communicate-wit...,Macquarie University,4.8,Mixed · Course · 1 - 3 Months,"Skills you'll gain:Overcoming Objections, Infl...",communication skills
5708,Communication Skills for Engineers,https://www.coursera.org/specializations/leade...,Rice University,4.7,Beginner · Specialization · 3 - 6 Months,"Skills you'll gain:Proposal Writing, Oral Expr...",communication skills
5710,Improve Communication with Genial.ly,https://www.coursera.org/projects/improve-comm...,Coursera Project Network,4.7,Beginner · Guided Project · Less Than 2 Hours,"Skills you'll gain:Train The Trainer, Virtual ...",communication skills


In [None]:
# Pisahkan berdasarkan tanda pemisah ' · '
metadata_split = df_all_course['metadata'].str.split(' · ', expand=True)

# Buat kolom baru dari hasil pemisahan
df_all_course['difficulty'] = metadata_split[0]
df_all_course['duration'] = metadata_split[2] 
df_all_course.drop(columns=['metadata'], inplace=True)

# Hapus "Skills you'll gain:" dari kolom 'skills'
df_all_course['skills'] = df_all_course['skills'].str.replace("Skills you'll gain:", "", regex=False)

# (Opsional) Hilangkan spasi berlebih di awal/akhir
df_all_course['skills'] = df_all_course['skills'].str.strip()

In [None]:
df_all_course.drop(columns=['Source'], inplace=True)
df_all_course

In [3]:
import pandas as pd
data = pd.read_csv('coursera_courses.csv')
data.drop(columns=['Source'], inplace=True)
data

Unnamed: 0,title,link,organization,rating,skills,difficulty,duration
0,IBM Data Analyst,https://www.coursera.org/professional-certific...,IBM,4.6,"Data Storytelling, Dashboard, Data Visualizati...",Beginner,3 - 6 Months
1,Meta Data Analyst,https://www.coursera.org/professional-certific...,Meta,4.7,"Data Storytelling, Business Metrics, Key Perfo...",Beginner,3 - 6 Months
2,Google Data Analytics,https://www.coursera.org/professional-certific...,Google,4.8,"Data Storytelling, Rmarkdown, Data Literacy, D...",Beginner,3 - 6 Months
3,Excel Basics for Data Analysis,https://www.coursera.org/learn/excel-basics-da...,IBM,4.8,"Excel Formulas, Microsoft Excel, Data Cleansin...",Beginner,1 - 3 Months
4,Microsoft Power BI Data Analyst,https://www.coursera.org/professional-certific...,Microsoft,4.6,"Power BI, Microsoft Excel, Data Analysis, Data...",Beginner,3 - 6 Months
...,...,...,...,...,...,...,...
440,Developing Interpersonal Skills,https://www.coursera.org/learn/interpersonal-s...,IBM,4.7,"Active Listening, Interpersonal Communications...",Beginner,1 - 4 Weeks
441,Storytelling and influencing: Communicate with...,https://www.coursera.org/learn/communicate-wit...,Macquarie University,4.8,"Overcoming Objections, Influencing, Persuasive...",Mixed,1 - 3 Months
442,Communication Skills for Engineers,https://www.coursera.org/specializations/leade...,Rice University,4.7,"Proposal Writing, Oral Expression, Presentatio...",Beginner,3 - 6 Months
443,Improve Communication with Genial.ly,https://www.coursera.org/projects/improve-comm...,Coursera Project Network,4.7,"Train The Trainer, Virtual Environment, Techni...",Beginner,Less Than 2 Hours


In [4]:
data.to_csv("coursera_courses.csv", index=False)

In [9]:
import pandas as pd
data = pd.read_csv('job_title_des.csv')
data["Job Title"].value_counts()


Job Title
Backend Developer          169
JavaScript Developer       166
Java Developer             161
Node js developer          160
Software Engineer          160
iOS Developer              159
PHP Developer              156
Flutter Developer          155
DevOps Engineer            155
Machine Learning           152
Django Developer           152
Network Administrator      145
Database Administrator     139
Full Stack Developer       138
Wordpress Developer        132
Copywriter                  31
Social Media Strategist     28
User Researcher             27
Public Relations            24
HR Generalist               22
Tax Consultant              21
Data Analyst                19
Financial Analyst           18
Auditor                     15
QA Tester                   12
Career Counselor            11
Name: count, dtype: int64