In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Search parameters
title = "Data Analyst"
location = "Turkey"
start_list = [0, 12, 22, 32, 42, 52, 62, 72, 82, 92, 102, 112, 122, 132, 142, 152]

# User-Agent that acts like a real browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
}

# Collect job postings from all pages
all_page_jobs = []

for start in start_list:
    list_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords={title}&location={location}&start={start}"
    try:
        response = requests.get(list_url, headers=headers)
        list_soup = BeautifulSoup(response.text, "html.parser")
        page_jobs = list_soup.find_all("li")
        all_page_jobs.extend(page_jobs)
    except Exception as e:
        print(f"Sayfa {start} çekilirken hata: {e}")
    time.sleep(random.uniform(1.5, 3))  # Bot korumasını aşmak için bekle

# Gather Job IDs
id_list = []

for job in all_page_jobs:
    base_card_div = job.find("div", {"class": "base-card"})
    if base_card_div:
        try:
            job_id = base_card_div.get("data-entity-urn").split(":")[3]
            id_list.append(job_id)
        except:
            continue

# Gather posting details
job_list = []

for job_id in id_list:
    job_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"

    # Retry 3 times
    for attempt in range(3):
        job_response = requests.get(job_url, headers=headers)
        if job_response.status_code == 200 and "topcard__title" in job_response.text:
            break
        time.sleep(2)

    job_soup = BeautifulSoup(job_response.text, "html.parser")
    job_post = {"job_id": job_id, "url": f"https://www.linkedin.com/jobs/view/{job_id}/"}

    # Job Title
    try:
        job_post["job_title"] = job_soup.find("h2", class_="top-card-layout__title").text.strip()
    except:
        job_post["job_title"] = None

    # Company Name
    try:
        job_post["company_name"] = job_soup.find("a", class_="topcard__org-name-link").text.strip()
    except:
        job_post["company_name"] = None

    # Location
    try:
        job_post["location"] = job_soup.find("span", class_="topcard__flavor--bullet").text.strip()
    except:
        job_post["location"] = None

    # Date Posted
    try:
        job_post["day_posted"] = job_soup.find("span", class_="posted-time-ago__text").text.strip()
    except:
        job_post["day_posted"] = None

    # Number of Applicants
    try:
        job_post["num_applicants"] = job_soup.find("figcaption", class_="num-applicants__caption").text.strip()
    except:
        job_post["num_applicants"] = None

    # Description
    try:
        desc_div = job_soup.find("div", class_="show-more-less-html__markup")
        job_post["description"] = desc_div.get_text(separator="\n").strip() if desc_div else None
    except:
        job_post["description"] = None

    job_list.append(job_post)
    time.sleep(random.uniform(1.5, 3))  # Wait between the postings

# Export it to DataFrame
jobs_df = pd.DataFrame(job_list)


In [24]:
print(jobs_df[jobs_df["job_title"].isna()])

Empty DataFrame
Columns: [job_id, url, job_title, company_name, location, day_posted, num_applicants, description]
Index: []


In [25]:
jobs_df

Unnamed: 0,job_id,url,job_title,company_name,location,day_posted,num_applicants,description
0,4246219468,https://www.linkedin.com/jobs/view/4246219468/,İş Zekası ve Raporlama Kıdemli Uzmanı,KoçSistem,"Istanbul, Türkiye",2 weeks ago,Over 200 applicants,Türkiye’nin köklü ve lider bilgi teknolojileri...
1,4202827901,https://www.linkedin.com/jobs/view/4202827901/,Data Analyst,Migros One,"Istanbul, Türkiye",2 weeks ago,Over 200 applicants,"Who are we?\nOur business is technology, food ..."
2,4236348478,https://www.linkedin.com/jobs/view/4236348478/,Planlama ve Raporlama Uzmanı - Müşteri Deneyimi,sahibinden.com,"Istanbul, Türkiye",3 weeks ago,Over 200 applicants,sahibinden.com Dünyası'nda Biz;\n'Vay be!' ded...
3,4217289819,https://www.linkedin.com/jobs/view/4217289819/,Müşteri Deneyimi Analitiği/Araştırmaları Yetki...,Yapı Kredi,"Istanbul, Türkiye",1 month ago,Over 200 applicants,"Yapı Kredi olarak 1944’ten beri ilklerin, yeni..."
4,4244368488,https://www.linkedin.com/jobs/view/4244368488/,Analitik Model Proje ve Koordinasyon Uzmanı,Yapı Kredi,"Istanbul, Türkiye",2 weeks ago,Over 200 applicants,"Yapı Kredi olarak 1944’ten beri ilklerin, yeni..."
...,...,...,...,...,...,...,...,...
155,4229178894,https://www.linkedin.com/jobs/view/4229178894/,İki Dilli Editör,DataAnnotation,"İzmir, Türkiye",1 week ago,Over 200 applicants,"DataAnnotation, kaliteli yapay zeka geliştirme..."
156,4249602317,https://www.linkedin.com/jobs/view/4249602317/,AI Engineer,Argela Technologies,"Ankara, Türkiye",1 week ago,Over 200 applicants,Argela’s mission isn’t simply to provide solut...
157,4246235116,https://www.linkedin.com/jobs/view/4246235116/,Senior/Expert Data Scientist,dataSpecta,Türkiye,2 weeks ago,Over 200 applicants,Senior/Expert Data Scientist\nAbout the job:\n...
158,4239008250,https://www.linkedin.com/jobs/view/4239008250/,Yazılım Analisti,Aselsan,"Ankara, Türkiye",3 weeks ago,Over 200 applicants,ASELSAN Savunma Sistem Teknolojileri (SST) Sek...


In [13]:
tools = [
    "Python", "R", "SQL", "Power BI", "Excel", "Tableau", "Qlik",
    "Snowflake", "Looker", "Google Sheets", "SAP",
    "DAX", "ETL", "Hadoop", "Spark", "BigQuery", "ML", "Machine Learning",
    "Apache", "Airflow", "AWS", "Redshift", "PySpark", "Azure", "GCP", "Kafka",
    "dbt", "Docker", "Kubernetes", "NoSQL", "MongoDB", "Google Cloud"
]

In [None]:
import re

# Prepare desc_lower column
jobs_df["desc_lower"] = jobs_df["description"].fillna("").str.lower()

# Start all tool columns with false
for tool in tools:
    tool_col = tool.replace(" ", "_")

    if tool.lower() == "power bi":
        jobs_df[tool_col] = jobs_df["desc_lower"].apply(lambda x: bool(re.search(r"\bpower[\s\-]?bi\b", x)))

    elif tool == "R":
        jobs_df[tool] = jobs_df["desc_lower"].apply(lambda x: bool(re.search(r"\br\b", x)))

    elif tool == "SAP":
        jobs_df[tool] = jobs_df["desc_lower"].apply(lambda x: bool(re.search(r"\bsap\b", x)))

    # ML and Machine Learning are combined in one column (will not be processed separately)
    elif tool in ["ML", "Machine Learning"]:
        continue  # We'll take these two out of the loop

    else:
        jobs_df[tool_col] = jobs_df["desc_lower"].apply(lambda x: tool.lower() in x)

# 🔁 Combine ML and Machine Learning to process as a single column
jobs_df["ML"] = jobs_df["desc_lower"].apply(lambda x: bool(re.search(r"\bml\b", x)) or "machine learning" in x)

# Remove temporary column
jobs_df.drop(columns=["desc_lower"], inplace=True)



In [15]:
jobs_df

Unnamed: 0,job_id,url,job_title,company_name,location,day_posted,num_applicants,description,Python,R,...,Azure,GCP,Kafka,dbt,Docker,Kubernetes,NoSQL,MongoDB,Google_Cloud,ML
0,4246219468,https://www.linkedin.com/jobs/view/4246219468/,İş Zekası ve Raporlama Kıdemli Uzmanı,KoçSistem,"Istanbul, Türkiye",2 weeks ago,Over 200 applicants,Türkiye’nin köklü ve lider bilgi teknolojileri...,False,False,...,True,False,False,False,False,False,False,False,False,False
1,4202827901,https://www.linkedin.com/jobs/view/4202827901/,Data Analyst,Migros One,"Istanbul, Türkiye",2 weeks ago,Over 200 applicants,"Who are we?\nOur business is technology, food ...",True,False,...,False,False,False,False,False,False,False,False,False,False
2,4236348478,https://www.linkedin.com/jobs/view/4236348478/,Planlama ve Raporlama Uzmanı - Müşteri Deneyimi,sahibinden.com,"Istanbul, Türkiye",3 weeks ago,Over 200 applicants,sahibinden.com Dünyası'nda Biz;\n'Vay be!' ded...,False,False,...,False,False,False,False,False,False,False,False,False,False
3,4244368488,https://www.linkedin.com/jobs/view/4244368488/,Analitik Model Proje ve Koordinasyon Uzmanı,Yapı Kredi,"Istanbul, Türkiye",2 weeks ago,Over 200 applicants,"Yapı Kredi olarak 1944’ten beri ilklerin, yeni...",True,True,...,False,False,False,False,False,False,False,False,False,False
4,4251184405,https://www.linkedin.com/jobs/view/4251184405/,Growth Data Analyst,Trendyol Group,"Istanbul, Türkiye",1 week ago,Over 200 applicants,Ready to learn more about us?\nWe were founded...,True,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,4161496260,https://www.linkedin.com/jobs/view/4161496260/,"Data Analyst (Bangkok Based, relocation provided)",Agoda,"İstanbul, Türkiye",1 week ago,,About Agoda\nAgoda is an online travel booking...,True,True,...,False,False,False,False,False,False,False,False,False,False
156,4250895473,https://www.linkedin.com/jobs/view/4250895473/,Business Intelligence (BI) Developer – Power B...,dataSpecta,"Istanbul, Türkiye",1 week ago,Over 200 applicants,Business Intelligence (BI) Developer – Power B...,False,False,...,True,False,False,False,False,False,False,False,False,False
157,4221137395,https://www.linkedin.com/jobs/view/4221137395/,Data Scientist,Peak,"Istanbul, Türkiye",3 weeks ago,Over 200 applicants,We are looking for a Data Scientist who is pas...,True,False,...,False,False,False,True,False,False,False,False,False,True
158,4255681476,https://www.linkedin.com/jobs/view/4255681476/,Expert Data Engineer - Hybrid,Pegasus Airlines,"Istanbul, Türkiye",15 hours ago,,We are Pegasus Airlines!\nWe are Pegasus. We r...,True,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
relevant_keywords = [
    # English
    "analyst", "analytics", "data", "business intelligence", "insight", "reporting",
    # Turkish
    "veri", "analist", "raporlama", "iş zekası", "analizi", "gösterge paneli", "dashboard"
]


In [17]:
jobs_df["job_title_lower"] = jobs_df["job_title"].fillna("").str.lower()

jobs_df["relevant_by_title"] = jobs_df["job_title_lower"].apply(
    lambda x: any(keyword in x for keyword in relevant_keywords)
)


In [None]:
# Remove general tools like Excel from tool_columns
general_tools = ["Excel", "Google_Sheets"]
tool_columns = [col for col in jobs_df.columns
                if col not in [
                    'job_id', 'job_title', 'company_name', 'location',
                    'day_posted', 'num_applicants', 'description', 'url',
                    'job_title_lower', 'relevant_by_title', 'relevant_by_tools',
                    'tool_count', 'relevant'
                ] + general_tools]


In [None]:
# New tool count and filter
jobs_df["tool_count"] = jobs_df[tool_columns].sum(axis=1)
jobs_df["relevant_by_tools"] = jobs_df["tool_count"] > 0


In [20]:
jobs_df["relevant"] = jobs_df["relevant_by_title"] & jobs_df["relevant_by_tools"]
relevant_jobs = jobs_df[jobs_df["relevant"] == True].copy()
relevant_jobs = relevant_jobs.reset_index(drop=True)


In [21]:
relevant_jobs

Unnamed: 0,job_id,url,job_title,company_name,location,day_posted,num_applicants,description,Python,R,...,Kubernetes,NoSQL,MongoDB,Google_Cloud,ML,job_title_lower,relevant_by_title,tool_count,relevant_by_tools,relevant
0,4246219468,https://www.linkedin.com/jobs/view/4246219468/,İş Zekası ve Raporlama Kıdemli Uzmanı,KoçSistem,"Istanbul, Türkiye",2 weeks ago,Over 200 applicants,Türkiye’nin köklü ve lider bilgi teknolojileri...,False,False,...,False,False,False,False,False,i̇ş zekası ve raporlama kıdemli uzmanı,True,5,True,True
1,4202827901,https://www.linkedin.com/jobs/view/4202827901/,Data Analyst,Migros One,"Istanbul, Türkiye",2 weeks ago,Over 200 applicants,"Who are we?\nOur business is technology, food ...",True,False,...,False,False,False,False,False,data analyst,True,5,True,True
2,4236348478,https://www.linkedin.com/jobs/view/4236348478/,Planlama ve Raporlama Uzmanı - Müşteri Deneyimi,sahibinden.com,"Istanbul, Türkiye",3 weeks ago,Over 200 applicants,sahibinden.com Dünyası'nda Biz;\n'Vay be!' ded...,False,False,...,False,False,False,False,False,planlama ve raporlama uzmanı - müşteri deneyimi,True,3,True,True
3,4251184405,https://www.linkedin.com/jobs/view/4251184405/,Growth Data Analyst,Trendyol Group,"Istanbul, Türkiye",1 week ago,Over 200 applicants,Ready to learn more about us?\nWe were founded...,True,True,...,False,False,False,False,False,growth data analyst,True,6,True,True
4,4240888594,https://www.linkedin.com/jobs/view/4240888594/,Business Analytics Member,Allianz,"Istanbul, Türkiye",2 weeks ago,Over 200 applicants,"""Would you like to take your place in Allianz,...",False,False,...,False,False,False,False,False,business analytics member,True,1,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,4225142687,https://www.linkedin.com/jobs/view/4225142687/,Data Engineer,Sipay,"Istanbul, Türkiye",1 month ago,Over 200 applicants,"Sipay, Electronic Money and Payment Services I...",False,False,...,True,False,False,False,False,data engineer,True,6,True,True
109,4161496260,https://www.linkedin.com/jobs/view/4161496260/,"Data Analyst (Bangkok Based, relocation provided)",Agoda,"İstanbul, Türkiye",1 week ago,,About Agoda\nAgoda is an online travel booking...,True,True,...,False,False,False,False,False,"data analyst (bangkok based, relocation provided)",True,4,True,True
110,4250895473,https://www.linkedin.com/jobs/view/4250895473/,Business Intelligence (BI) Developer – Power B...,dataSpecta,"Istanbul, Türkiye",1 week ago,Over 200 applicants,Business Intelligence (BI) Developer – Power B...,False,False,...,False,False,False,False,False,business intelligence (bi) developer – power b...,True,9,True,True
111,4221137395,https://www.linkedin.com/jobs/view/4221137395/,Data Scientist,Peak,"Istanbul, Türkiye",3 weeks ago,Over 200 applicants,We are looking for a Data Scientist who is pas...,True,False,...,False,False,False,False,True,data scientist,True,9,True,True


In [None]:
import unicodedata

# Function to help normalize
def normalize(text):
    text = str(text)
    text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("utf-8")
    return text.lower()

cities = ["Istanbul", "Bursa", "Antalya", "Kocaeli", "Ankara"]

relevant_jobs["location"] = relevant_jobs["location"].apply(
    lambda x: next(
        (city for city in cities if city.lower() in normalize(x)), x
    )
)


In [None]:
relevant_jobs

Unnamed: 0,job_id,url,job_title,company_name,location,day_posted,num_applicants,description,Python,R,...,Azure,GCP,Kafka,dbt,Docker,Kubernetes,NoSQL,MongoDB,Google_Cloud,ML
0,4246219468,https://www.linkedin.com/jobs/view/4246219468/,İş Zekası ve Raporlama Kıdemli Uzmanı,KoçSistem,Istanbul,2 weeks ago,Over 200 applicants,Türkiye’nin köklü ve lider bilgi teknolojileri...,False,False,...,True,False,False,False,False,False,False,False,False,False
1,4202827901,https://www.linkedin.com/jobs/view/4202827901/,Data Analyst,Migros One,Istanbul,2 weeks ago,Over 200 applicants,"Who are we?\nOur business is technology, food ...",True,False,...,False,False,False,False,False,False,False,False,False,False
2,4236348478,https://www.linkedin.com/jobs/view/4236348478/,Planlama ve Raporlama Uzmanı - Müşteri Deneyimi,sahibinden.com,Istanbul,3 weeks ago,Over 200 applicants,sahibinden.com Dünyası'nda Biz;\n'Vay be!' ded...,False,False,...,False,False,False,False,False,False,False,False,False,False
3,4251184405,https://www.linkedin.com/jobs/view/4251184405/,Growth Data Analyst,Trendyol Group,Istanbul,1 week ago,Over 200 applicants,Ready to learn more about us?\nWe were founded...,True,True,...,False,False,False,False,False,False,False,False,False,False
4,4240888594,https://www.linkedin.com/jobs/view/4240888594/,Business Analytics Member,Allianz,Istanbul,2 weeks ago,Over 200 applicants,"""Would you like to take your place in Allianz,...",False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,4225142687,https://www.linkedin.com/jobs/view/4225142687/,Data Engineer,Sipay,Istanbul,1 month ago,Over 200 applicants,"Sipay, Electronic Money and Payment Services I...",False,False,...,False,False,True,False,True,True,False,False,False,False
109,4161496260,https://www.linkedin.com/jobs/view/4161496260/,"Data Analyst (Bangkok Based, relocation provided)",Agoda,Istanbul,1 week ago,,About Agoda\nAgoda is an online travel booking...,True,True,...,False,False,False,False,False,False,False,False,False,False
110,4250895473,https://www.linkedin.com/jobs/view/4250895473/,Business Intelligence (BI) Developer – Power B...,dataSpecta,Istanbul,1 week ago,Over 200 applicants,Business Intelligence (BI) Developer – Power B...,False,False,...,True,False,False,False,False,False,False,False,False,False
111,4221137395,https://www.linkedin.com/jobs/view/4221137395/,Data Scientist,Peak,Istanbul,3 weeks ago,Over 200 applicants,We are looking for a Data Scientist who is pas...,True,False,...,False,False,False,True,False,False,False,False,False,True


In [None]:
relevant_jobs["num_applicants"] = relevant_jobs["num_applicants"].replace({
    "Over 200 applicants": "200+",
    "Be among the first 25 applicants": "<25"
})


In [None]:
from datetime import date, timedelta

relevant_jobs["date"] = date.today() - timedelta(days=1)

In [None]:
# More inclusive keywords (less likely to mislead but meaningful words)
regex_patterns = [
    r"\bdata analyst\b",
    r"\bdata analysis\b",
    r"\bbusiness analyst\b",
    r"\bbusiness intelligence\b",
    r"\bbi analyst\b",
    r"\bbi developer\b",
    r"\banalytics specialist\b",
    r"\banalytics engineer\b",
    r"\bdata insights?\b",
    r"\binsight analyst\b",
    r"\breporting (specialist|analyst|assistant)\b",
    r"\bdata & analytics\b",
    r"\bdata and analytics\b",
    r"\bveri analitiği\b",
    r"\bveri analisti\b",
    r"\bveri yöneticisi\b",
    r"\braporlama uzmanı\b",
    r"\biş zekası\b",
    r"\banaliz ve raporlama\b",
    r"\bdata analytics\b",
]

In [None]:
def match_bi_or_analytics(title):
    title = str(title).lower()
    for pattern in regex_patterns:
        if re.search(pattern, title):
            return True
    return False

# Apply
relevant_jobs["is_bi_or_analytics"] = relevant_jobs["job_title"].apply(match_bi_or_analytics)

# Filtered data
filtered_jobs = relevant_jobs[relevant_jobs["is_bi_or_analytics"]].copy()
filtered_jobs.reset_index(drop=True, inplace=True)


In [6]:
filtered_jobs.to_csv("filtered_linkedin_jobs.csv", index=False, encoding="utf-8-sig")