# Libraries

In [21]:
# Import all necessary libraries for scraping and data handling
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import time
from math import ceil

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

import numpy as np
import ast
import re
import os

"""
Requests is used to send HTTP requests and fetch static web pages.
BeautifulSoup is used to parse HTML content and extract the required data.

Selenium and its related modules (webdriver, Service, Options, By, and
ChromeDriverManager) are used to automate the browser and scrape dynamic
content rendered by JavaScript.

Pandas and NumPy are used for data manipulation, cleaning, and analysis
using DataFrames and numerical operations.

The time module is used to control delays between requests to ensure
pages load properly.

The math module (ceil) is used for rounding operations when needed.

The ast module is used to safely convert string representations into
Python objects such as lists or dictionaries.

Regular expressions (re) are used for text pattern matching and
text cleaning.

The os module is used to handle file system operations such as creating
directories, managing paths, and saving output files.
"""



'\nRequests is used to send HTTP requests and fetch static web pages.\nBeautifulSoup is used to parse HTML content and extract the required data.\n\nSelenium and its related modules (webdriver, Service, Options, By, and\nChromeDriverManager) are used to automate the browser and scrape dynamic\ncontent rendered by JavaScript.\n\nPandas and NumPy are used for data manipulation, cleaning, and analysis\nusing DataFrames and numerical operations.\n\nThe time module is used to control delays between requests to ensure\npages load properly.\n\nThe math module (ceil) is used for rounding operations when needed.\n\nThe ast module is used to safely convert string representations into\nPython objects such as lists or dictionaries.\n\nRegular expressions (re) are used for text pattern matching and\ntext cleaning.\n\nThe os module is used to handle file system operations such as creating\ndirectories, managing paths, and saving output files.\n'

# Dictionaries

In [22]:
# Menu options offered to the user. Keys are what the user types at the
# prompt; values are the labels used when building the search.
_country_names = [
    "Egypt",
    "United Arab Emirates",
    "Saudi Arabia",
    "Oman",
    "United States",
    "Belgium",
    "Indonesia",
]
countries = {
    str(number): name for number, name in enumerate(_country_names, start=1)
}
countries["all"] = "all"  # extra option, listed last in the menu

_job_title_names = [
    "data analyst",
    "data scientist",
    "business analyst",
    "machine learning engineer",
    "Engineering Construction Civil Architecture",
]
job_titles_dict = {
    str(number): title for number, title in enumerate(_job_title_names, start=1)
}

---

# User Input

## Country Input

In [23]:
# Show the country menu, one "key: label" line per option
print("Select country:")
for option, label in countries.items():
    print(f"{option}: {label}")


# Normalise the answer so "ALL" / " all " match the "all" key
raw_country = input("Enter Country number from the list, or 'all' to select all countries : ")
country_choice = raw_country.strip().lower()

# Anything that is not a menu key falls back to option "1" (Egypt)
country_choice = country_choice if country_choice in countries else "1"


Select country:
1: Egypt
2: United Arab Emirates
3: Saudi Arabia
4: Oman
5: United States
6: Belgium
7: Indonesia
all: all


## Job Title Input

In [24]:
# Show the job title menu, one "key: label" line per option
print("\nSelect job title:")
for option, label in job_titles_dict.items():
    print(f"{option}: {label}")

raw_job = input("Enter number: ")
job_choice = raw_job.strip()

# Anything that is not a menu key falls back to option "1" ('data analyst')
job_choice = job_choice if job_choice in job_titles_dict else "1"


Select job title:
1: data analyst
2: data scientist
3: business analyst
4: machine learning engineer
5: Engineering Construction Civil Architecture


In [25]:
# Translate the validated menu keys into their labels
country_selected, job_selected = countries[country_choice], job_titles_dict[job_choice]

print(f"\nScraping '{job_selected}' in '{country_selected}'\n")


Scraping 'data analyst' in 'Egypt'



---

# Base URL

In [26]:
# Prepare search values for URL
from urllib.parse import quote

# Percent-encode the whole value rather than only replacing spaces, so that
# characters such as "&", "#" or "+" in a label cannot break the query string.
# Spaces still become "%20", so the current menu labels encode exactly as before.
job_query = quote(job_selected, safe="")
country_query = quote(country_selected, safe="")

print(f"Job query: {job_query}")
print(f"Country query: {country_query}")

Job query: data%20analyst
Country query: Egypt


In [27]:
# Build the search URL from the user's choices.
# Choosing "all" omits the country filter from the query string.
search_root = "https://wuzzuf.net/search/jobs/?a=hpb"
if country_selected.lower() == "all":
    country_filter = ""
else:
    country_filter = f"&filters%5Bcountry%5D%5B0%5D={country_query}"

base_url = f"{search_root}{country_filter}&q={job_query}"
print(base_url)
    

https://wuzzuf.net/search/jobs/?a=hpb&filters%5Bcountry%5D%5B0%5D=Egypt&q=data%20analyst


# Fetch and parse first page 

In [28]:
# Download and parse the first results page.
# A timeout is passed because requests otherwise waits indefinitely, which
# would hang the notebook on a stalled connection.
first_page = requests.get(base_url, timeout=30)
soup = BeautifulSoup(first_page.text, "html.parser")


# Get Total Pages

In [29]:
# Locate the results counter element (None when the page has no such element)
info = soup.find("li", attrs={"class": "css-18k4nsw"})
print(info)

<li class="css-18k4nsw">Showing 1 - 15 of 124</li>


In [30]:
# Detect total jobs and calculate number of pages
jobs_per_page = 15  # page size used to turn the job count into a page count
total_jobs = 0
total_pages = 1  # fallback whenever the counter is missing or unreadable

if not info:
    print("Couldn't detect total jobs. Assuming single page or empty.")
else:
    try:
        # The counter's last whitespace-separated token is the total,
        # e.g. "Showing 1 - 15 of 124" -> "124". Thousands separators are
        # removed so a value like "1,234" still parses.
        total_jobs = int(info.text.strip().split()[-1].replace(",", ""))
        total_pages = ceil(total_jobs / jobs_per_page)
    except (ValueError, IndexError):
        # Empty text (IndexError) or a non-numeric last token (ValueError):
        # keep the single-page fallback instead of hiding every error.
        total_jobs = 0
        total_pages = 1
    print(f"Total jobs: {total_jobs}")

print(f"Total pages: {total_pages}\n")

Total jobs: 124
Total pages: 9



---

# Scrape Main Page

In [31]:
# Accumulators for the listing-page scrape; each gets one entry per job card
(
    job_titles,
    job_links,
    company_names,
    work_modes,
    locations,
    job_types,
    second_div_data_all,
) = ([] for _ in range(7))

In [32]:
# Loop through each results page and scrape its job listings.
# Every job card appends exactly one entry to each accumulator list so the
# lists stay aligned row-for-row.

for start in range(total_pages):
    print(f"Scraping page {start + 1} / {total_pages}")

    # The page index is passed through the "start" query parameter
    path = f"{base_url}&start={start}"
    # Timeout so a stalled connection cannot hang the whole scrape
    page = requests.get(path, timeout=30)
    soup = BeautifulSoup(page.text, "html.parser")

    # ===== Job cards =====
    Job_cards = soup.find_all("div", class_="css-ghe2tq e1v1l3u10")
    if not Job_cards:
        # No cards on this page: stop paging
        break

    # ===== Containers for second div =====
    job_containers = soup.find_all("div", class_="css-1rhj4yg")

    for idx, job in enumerate(Job_cards):

        # -------- Job Title & Link --------
        # Use None (not np.nan) as the "missing" sentinel: NaN is truthy, so a
        # card without an <h2> used to reach a_tag.get_text() and crash.
        heading = job.find("h2")
        a_tag = heading.find("a") if heading is not None else None

        if a_tag is not None:
            title = a_tag.get_text(strip=True)
            link = a_tag.get("href")

            if link and link.startswith("http"):
                full_link = link
            elif link:
                # Relative href: prefix the site root
                full_link = "https://wuzzuf.net" + link
            else:
                full_link = np.nan
        else:
            title = np.nan
            full_link = np.nan

        job_titles.append(title)
        job_links.append(full_link)

        # -------- Company --------
        company_tag = job.find("a", class_="css-ipsyv7")
        company_names.append(company_tag.get_text(strip=True) if company_tag else np.nan)

        # -------- Work Mode --------
        work_mode_tag = job.find("span", class_="css-uofntu eoyjyou0")
        work_modes.append(work_mode_tag.get_text(strip=True) if work_mode_tag else np.nan)

        # -------- Location --------
        loc_tag = job.find("span", class_="css-16x61xq")
        locations.append(loc_tag.get_text(strip=True) if loc_tag else np.nan)

        # -------- Job Type --------
        jt_tag = job.find("span", class_="css-uc9rga eoyjyou0")
        job_types.append(jt_tag.get_text(strip=True) if jt_tag else np.nan)

        # -------- Second div (a + span) --------
        # Collect the unique texts of the links and direct-child spans found
        # in the second <div> of the container at the same position as the card.
        second_div_items = []
        if idx < len(job_containers):
            divs = job_containers[idx].find_all("div")
            if len(divs) > 1:
                container_div = divs[1]

                for a in container_div.find_all("a"):
                    text = a.get_text(strip=True)
                    if text and text not in second_div_items:
                        second_div_items.append(text)

                for span in container_div.find_all("span", recursive=False):
                    if not span.find_parent("a"):
                        text = span.get_text(strip=True)
                        if text and text not in second_div_items:
                            second_div_items.append(text)

                # Move the first "Yrs of Exp" entry to index 1. The break right
                # after the move makes mutating the list during iteration safe.
                for i, item in enumerate(second_div_items):
                    if "Yrs of Exp" in item:
                        second_div_items.insert(1, second_div_items.pop(i))
                        break

        second_div_data_all.append(second_div_items)
        

Scraping page 1 / 9
Scraping page 2 / 9
Scraping page 3 / 9
Scraping page 4 / 9
Scraping page 5 / 9
Scraping page 6 / 9
Scraping page 7 / 9
Scraping page 8 / 9
Scraping page 9 / 9


In [33]:
# Assemble the listing-page accumulators into one DataFrame (one row per job card)
main_page_columns = {
    "Job Title": job_titles,
    "Company Name": company_names,
    "Location": locations,
    "Job Type": job_types,
    "work_modes": work_modes,
    "all_second_divs": second_div_data_all,
    "job_link": job_links,
}
df_main_page = pd.DataFrame(main_page_columns)

print("Total jobs scraped:", df_main_page.shape[0])
df_main_page.head()

Total jobs scraped: 124


Unnamed: 0,Job Title,Company Name,Location,Job Type,work_modes,all_second_divs,job_link
0,Project Listing Specialist & Data Analyst for ...,armonia vita -,"Mansoura,Dakahlia,Egypt",Full Time,On-site,"[Entry Level, ·1 - 3 Yrs of Exp, ·Marketing/PR...",https://wuzzuf.net/jobs/p/8creyckjvc8z-project...
1,Business Data Analyst (Finance-Oriented),Mazzika Group -,"Dokki,Giza,Egypt",Full Time,On-site,"[Experienced, ·2 - 3 Yrs of Exp, ·Accounting/F...",https://wuzzuf.net/jobs/p/r377ssty41so-busines...
2,Data Analyst,Linrco Egypt -,"Nasr City,Cairo,Egypt",Full Time,On-site,"[Experienced, ·2 - 3 Yrs of Exp, ·Quality, Dat...",https://wuzzuf.net/jobs/p/ailkpyyoxt9a-data-an...
3,Commercial Specialist ( Junior Data Analyst ),realme -,"Nasr City,Cairo,Egypt",Full Time,On-site,"[Entry Level, ·1+ Yrs of Exp, ·Business Develo...",https://wuzzuf.net/jobs/p/btmdbqyuc8v8-commerc...
4,Financial planning and Analyst,Confidential -,"Cairo,Egypt",Full Time,On-site,"[Experienced, ·3 - 7 Yrs of Exp, ·Accounting/F...",https://wuzzuf.net/jobs/p/taf92lwlbjtm-financi...


---

# Scrape Job Details from Each Link 


## Collect Job Description & Requirements


In [34]:
# Accumulators filled by get_job_data (one entry per job link)
job_descriptions, job_requirements = [], []

In [35]:
def get_job_data(job_links, wait_time=1):
    """
    Scrape the "Job Description" and "Job Requirements" text of each job URL.

    One entry per input link is appended to the module-level lists
    ``job_descriptions`` and ``job_requirements`` (NaN when the link is
    missing, the request fails, or the section is not found). Because those
    lists are shared, calling this function again without resetting them
    accumulates entries.

    Parameters:
    - job_links: iterable of job posting URLs; missing links may be None, ""
      or NaN.
    - wait_time: seconds to pause after each request (default=1).

    Returns:
    - job_descriptions: list of job description texts.
    - job_requirements: list of job requirements texts.
    """
    wanted_titles = ("Job Description", "Job Requirements")

    for full_link in job_links:
        desc_text = np.nan
        req_text = np.nan

        # NaN is truthy, so test the type explicitly instead of `if full_link`
        if isinstance(full_link, str) and full_link:
            try:
                # Timeout so one stalled page cannot block the whole run
                job_page = requests.get(full_link, timeout=30)
                # Report HTTP errors instead of silently parsing an error page
                job_page.raise_for_status()
                job_soup = BeautifulSoup(job_page.text, "html.parser")

                for section in job_soup.find_all("section", class_="css-5pnqc5"):
                    h2 = section.find("h2")
                    if not h2:
                        continue

                    title = h2.get_text(strip=True)
                    if title not in wanted_titles:
                        continue

                    # The section's first <div> holds the body text
                    content_div = section.find("div")
                    text = content_div.get_text(" ", strip=True) if content_div else np.nan

                    if title == "Job Description":
                        desc_text = text
                    else:
                        req_text = text

            except Exception as e:
                # Best-effort scrape: report the failing URL and keep NaN for it
                print(f"Error scraping {full_link}: {e}")

            # Honour the documented delay between requests
            time.sleep(wait_time)

        # Always append so the outputs stay aligned with job_links
        job_descriptions.append(desc_text)
        job_requirements.append(req_text)

    return job_descriptions, job_requirements


In [36]:
# Scrape description/requirements text for every job link collected above;
# each returned list has one entry per link, in the same order.
descriptions, requirements = get_job_data(job_links)

In [37]:
# Pair every job link with its scraped description and requirements text
des_req_columns = {
    "job_link": job_links,
    "job_descriptions": descriptions,
    "job_requirements": requirements,
}
df_des_req = pd.DataFrame(des_req_columns)

print("Total jobs scraped:", df_des_req.shape[0])
df_des_req.head()


Total jobs scraped: 124


Unnamed: 0,job_link,job_descriptions,job_requirements
0,https://wuzzuf.net/jobs/p/8creyckjvc8z-project...,"Create and upload accurate, SEO-optimized prod...","Create and upload accurate, SEO-optimized prod..."
1,https://wuzzuf.net/jobs/p/r377ssty41so-busines...,Job Summary: We are looking for a business-ori...,"Minimum 2 years of experience in a business, f..."
2,https://wuzzuf.net/jobs/p/ailkpyyoxt9a-data-an...,"Design, develop, and maintain KPIs related to ...",• Bachelor’s degree in Business Administration...
3,https://wuzzuf.net/jobs/p/btmdbqyuc8v8-commerc...,Job Responsibilities: Data Analytics and Repor...,"Bachelor’s degree required, preferably in fiel..."
4,https://wuzzuf.net/jobs/p/taf92lwlbjtm-financi...,"Tasks, Responsibilities and Duties Experienced...","Bachelor's degree in Finance, Accounting, Econ..."


---

## Collect Experience, Career Level, Education, Salary, Categories & Skills


In [38]:
# Accumulators filled by scrape_job_details (one entry per job link)
(
    Experience_Needed,
    Career_Level,
    Education_Level,
    Salary,
    Job_Categories,
    Skills_And_Tools,
) = ([] for _ in range(6))

In [39]:
def scrape_job_details(urls, headless=True, wait_time=3):
    """
    Scrape job details from a list of URLs using Selenium.

    For every URL exactly one entry is appended to each of the module-level
    lists ``Experience_Needed``, ``Career_Level``, ``Education_Level``,
    ``Salary``, ``Job_Categories`` and ``Skills_And_Tools``:
    - a missing link (None, "" or NaN) yields NaN / [] defaults;
    - a page with some fields missing yields the available data plus
      NaN / [] for the missing fields.

    Parameters:
    - urls: iterable of job posting URLs.
    - headless: run Chrome without a visible window (default True).
    - wait_time: seconds to sleep after opening each page (default 3).

    Returns:
    - None; results are written to the module-level lists.
    """
    from selenium.common.exceptions import (
        NoSuchElementException,
        WebDriverException,
    )

    def _texts(parent, selector):
        """Return the text of every element under *parent* matching *selector*."""
        try:
            return [el.text for el in parent.find_elements(By.CSS_SELECTOR, selector)]
        except WebDriverException:
            # Lookup failed (e.g. element went stale): treat as "nothing found"
            return []

    # -------- Selenium setup --------
    options = Options()
    if headless:
        options.add_argument("--headless")  # run Chrome in headless mode
    options.add_argument("--disable-gpu")   # disable GPU usage
    options.add_argument("--no-sandbox")    # avoid sandbox issues

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options
    )

    # try/finally guarantees the browser is closed even when the loop is
    # interrupted or fails unexpectedly.
    try:
        for url in urls:
            # -------- Default values for each URL --------
            experience_needed = np.nan
            career_level = np.nan
            education_level = np.nan
            salary_info = np.nan
            job_categories_list = []
            skills_and_tools_list = []

            try:
                # NaN is truthy, so test the type explicitly instead of `if not url`
                if isinstance(url, str) and url:
                    driver.get(url)
                    time.sleep(wait_time)

                    try:
                        section = driver.find_element(By.CSS_SELECTOR, "section.css-pbzohz")
                    except NoSuchElementException:
                        # Main section missing: all values remain default
                        section = None

                    if section is not None:
                        # -------- Key/value pairs --------
                        for div in section.find_elements(By.CSS_SELECTOR, "div.css-1ajx53j"):
                            try:
                                key = div.find_element(By.CSS_SELECTOR, "span.css-720fa0").text.strip()
                                value = div.find_element(By.CSS_SELECTOR, "span.css-iu2m7n").text.strip()
                            except WebDriverException:
                                # This row lacks a key or value span: skip it
                                continue

                            if "Experience" in key:
                                experience_needed = value
                            elif "Career Level" in key:
                                career_level = value
                            elif "Education" in key:
                                education_level = value
                            elif "Salary" in key:
                                salary_info = value

                        # ===== Job Categories =====
                        job_categories_list = _texts(
                            section, "div.css-1fwfib5 ul li span.css-1vi25m1"
                        )

                        # ===== Skills and Tools =====
                        skills_and_tools_list = _texts(
                            section, "div.css-14zw0ku div.css-qe7mba span.css-1vi25m1"
                        )

            except WebDriverException as e:
                # Page failed to load or the browser errored: keep what we have
                print(f"Error scraping {url}: {e}")

            finally:
                # Always append so the lists stay aligned with `urls`
                Experience_Needed.append(experience_needed)
                Career_Level.append(career_level)
                Education_Level.append(education_level)
                Salary.append(salary_info)
                Job_Categories.append(job_categories_list)
                Skills_And_Tools.append(skills_and_tools_list)
    finally:
        # Close the browser after finishing, whether or not the loop completed
        driver.quit()


In [40]:
# Visit every job link with Selenium; results are appended to the
# module-level lists prepared above (the function itself returns nothing).
scrape_job_details(job_links)

In [41]:
# Combine the per-job detail accumulators into one DataFrame, with job_link as the last column
detail_columns = {
    "Experience_Needed": Experience_Needed,
    "Career_Level": Career_Level,
    "Education_Level": Education_Level,
    "Salary": Salary,
    "Job_Categories": Job_Categories,
    "Skills_And_Tools": Skills_And_Tools,
    "job_link": job_links,
}
df_details = pd.DataFrame(detail_columns)

print("Total jobs scraped:", df_details.shape[0])
df_details.head()

Total jobs scraped: 124


Unnamed: 0,Experience_Needed,Career_Level,Education_Level,Salary,Job_Categories,Skills_And_Tools,job_link
0,1 To 3 Years,Entry Level (Junior Level / Fresh Grad),Bachelor'S Degree,Confidential,[Marketing/PR/Advertising],"[E-Commerce Platform Management, Data Analysis...",https://wuzzuf.net/jobs/p/8creyckjvc8z-project...
1,2 To 3 Years,Experienced (Non-Manager),Bachelor'S Degree,Confidential,"[Accounting/Finance, Business Development, Ana...","[Financial Analysis, Data Visualization, Busin...",https://wuzzuf.net/jobs/p/r377ssty41so-busines...
2,2 To 3 Years,Experienced (Non-Manager),Not Specified,Confidential,[Quality],"[Data Analysis, performance Analysis, KPI's, Q...",https://wuzzuf.net/jobs/p/ailkpyyoxt9a-data-an...
3,More Than 1 Year,Entry Level (Junior Level / Fresh Grad),Bachelor'S Degree,Confidential,"[Business Development, Analyst/Research]","[Statistics, Market Research, Research, Financ...",https://wuzzuf.net/jobs/p/btmdbqyuc8v8-commerc...
4,3 To 7 Years,Experienced (Non-Manager),Bachelor'S Degree,Confidential,"[Accounting/Finance, Analyst/Research]","[Accounting, Commerce, Analyst, Analysis, Fina...",https://wuzzuf.net/jobs/p/taf92lwlbjtm-financi...


---

# Prepare, Clean & Reshape Data for Analysis


## Cleaning Main Page

In [42]:
# ----------------- Create a copy of the main page DataFrame -----------------
# Work on a copy so the raw scrape in df_main_page stays untouched.
df_main_page_clean = df_main_page.copy()
df_main_page_clean.head()

Unnamed: 0,Job Title,Company Name,Location,Job Type,work_modes,all_second_divs,job_link
0,Project Listing Specialist & Data Analyst for ...,armonia vita -,"Mansoura,Dakahlia,Egypt",Full Time,On-site,"[Entry Level, ·1 - 3 Yrs of Exp, ·Marketing/PR...",https://wuzzuf.net/jobs/p/8creyckjvc8z-project...
1,Business Data Analyst (Finance-Oriented),Mazzika Group -,"Dokki,Giza,Egypt",Full Time,On-site,"[Experienced, ·2 - 3 Yrs of Exp, ·Accounting/F...",https://wuzzuf.net/jobs/p/r377ssty41so-busines...
2,Data Analyst,Linrco Egypt -,"Nasr City,Cairo,Egypt",Full Time,On-site,"[Experienced, ·2 - 3 Yrs of Exp, ·Quality, Dat...",https://wuzzuf.net/jobs/p/ailkpyyoxt9a-data-an...
3,Commercial Specialist ( Junior Data Analyst ),realme -,"Nasr City,Cairo,Egypt",Full Time,On-site,"[Entry Level, ·1+ Yrs of Exp, ·Business Develo...",https://wuzzuf.net/jobs/p/btmdbqyuc8v8-commerc...
4,Financial planning and Analyst,Confidential -,"Cairo,Egypt",Full Time,On-site,"[Experienced, ·3 - 7 Yrs of Exp, ·Accounting/F...",https://wuzzuf.net/jobs/p/taf92lwlbjtm-financi...


In [43]:
# ----------------- Turn stringified "all_second_divs" values back into Python objects -----------------
def _parse_list_literal(value):
    """Evaluate string values as Python literals; return any other value unchanged."""
    if isinstance(value, str):
        return ast.literal_eval(value)
    return value


df_main_page_clean["all_second_divs"] = df_main_page_clean["all_second_divs"].apply(_parse_list_literal)


In [44]:
# ----------------- Break one all_second_divs list into structured fields -----------------
def split_all_second_divs(x):
    """
    Turn one 'all_second_divs' list into three fields.

    Input: list x of text items scraped for a job card.
    Output: pd.Series [Job Level, Experience Needed, Job Details]

    Rules:
    - Job Level is the first item (NaN for an empty list).
    - Experience Needed is the second item, but only when it contains
      'Yrs of Exp'; otherwise it is NaN.
    - Job Details is whatever follows the last item consumed above
      (an empty list when nothing is left).
    """
    item_count = len(x)

    level = np.nan
    if item_count > 0:
        level = x[0]

    # The experience entry, when present, always sits at index 1
    has_experience = item_count > 1 and "Yrs of Exp" in x[1]

    if has_experience:
        experience = x[1]
        first_detail = 2
    else:
        experience = np.nan
        first_detail = 1

    if item_count > first_detail:
        details = x[first_detail:]
    else:
        details = []

    return pd.Series([level, experience, details])

In [45]:
# ----------------- Apply the splitting function -----------------
# split_all_second_divs returns a 3-item Series per row, so apply() yields a
# 3-column frame that is assigned to the three new columns in order.
df_main_page_clean[["Job Level", "Experience Needed", "Job Details"]] = df_main_page_clean["all_second_divs"].apply(split_all_second_divs)


In [46]:

# ----------------- Clean 'Experience Needed' -----------------
# Remove the "·" separator that precedes the scraped text
df_main_page_clean["Experience Needed"] = df_main_page_clean["Experience Needed"].str.replace("·", "", regex=False)

# ----------------- Clean 'Company Name' -----------------
# Scraped names end with " -" (e.g. "Mazzika Group -"). Stripping only the
# dash leaves a trailing space behind, so strip whitespace again afterwards.
df_main_page_clean["Company Name"] = (
    df_main_page_clean["Company Name"]
    .str.rstrip()
    .str.rstrip("-")
    .str.rstrip()
)

# 'Job Details' is intentionally not cleaned: the column is dropped in the next step.


In [47]:
# ----------------- Remove helper columns that are no longer needed -----------------
helper_columns = ["all_second_divs", "Job Details"]
df_main_page_clean = df_main_page_clean.drop(columns=helper_columns)

# ----------------- Show the first rows of the result -----------------
df_main_page_clean.head()


Unnamed: 0,Job Title,Company Name,Location,Job Type,work_modes,job_link,Job Level,Experience Needed
0,Project Listing Specialist & Data Analyst for ...,armonia vita,"Mansoura,Dakahlia,Egypt",Full Time,On-site,https://wuzzuf.net/jobs/p/8creyckjvc8z-project...,Entry Level,1 - 3 Yrs of Exp
1,Business Data Analyst (Finance-Oriented),Mazzika Group,"Dokki,Giza,Egypt",Full Time,On-site,https://wuzzuf.net/jobs/p/r377ssty41so-busines...,Experienced,2 - 3 Yrs of Exp
2,Data Analyst,Linrco Egypt,"Nasr City,Cairo,Egypt",Full Time,On-site,https://wuzzuf.net/jobs/p/ailkpyyoxt9a-data-an...,Experienced,2 - 3 Yrs of Exp
3,Commercial Specialist ( Junior Data Analyst ),realme,"Nasr City,Cairo,Egypt",Full Time,On-site,https://wuzzuf.net/jobs/p/btmdbqyuc8v8-commerc...,Entry Level,1+ Yrs of Exp
4,Financial planning and Analyst,Confidential,"Cairo,Egypt",Full Time,On-site,https://wuzzuf.net/jobs/p/taf92lwlbjtm-financi...,Experienced,3 - 7 Yrs of Exp


## Create & Clean Skills and Categories DataFrames

In [48]:
# Independent copy holding only 'Skills_And_Tools' and 'job_link'
df_Skills_clean = df_details.loc[:, ['Skills_And_Tools', 'job_link']].copy()

# Independent copy holding only 'Job_Categories' and 'job_link'
df_Cat_clean = df_details.loc[:, ['Job_Categories', 'job_link']].copy()


In [49]:
# Preview the first two rows of the skills frame
df_Skills_clean.head(2)


Unnamed: 0,Skills_And_Tools,job_link
0,"[E-Commerce Platform Management, Data Analysis...",https://wuzzuf.net/jobs/p/8creyckjvc8z-project...
1,"[Financial Analysis, Data Visualization, Busin...",https://wuzzuf.net/jobs/p/r377ssty41so-busines...


In [50]:
# Preview the first two rows of the categories frame
df_Cat_clean.head(2)

Unnamed: 0,Job_Categories,job_link
0,[Marketing/PR/Advertising],https://wuzzuf.net/jobs/p/8creyckjvc8z-project...
1,"[Accounting/Finance, Business Development, Ana...",https://wuzzuf.net/jobs/p/r377ssty41so-busines...


In [51]:
def split_and_clean(df, column_name):
    """
    Expand a multi-valued column into one item per row.

    Each cell may be a real list, a string holding a list literal
    (e.g. "['A', 'B']"), or a plain comma-separated string. Every item is
    split on commas (ignoring commas inside parentheses) and stripped of
    surrounding spaces; empty items and empty cells produce no rows.

    List items are processed individually rather than through the list's
    string representation, so apostrophes inside an item (e.g. "KPI's") are
    preserved and no stray quote characters leak into the output.

    Parameters:
        df (pd.DataFrame): Input DataFrame; must contain `column_name`
            and 'job_link'.
        column_name (str): Name of the column to clean and split

    Returns:
        pd.DataFrame: Columns [column_name, 'job_link'] with one item per
        row and a fresh 0..n-1 index.
    """
    # Comma followed by optional spaces, unless a ")" comes before the next "("
    separator = re.compile(r',\s*(?![^(]*\))')

    def to_items(value):
        """Return the cleaned list of items contained in one cell."""
        if isinstance(value, (list, tuple)):
            raw_items = [str(v) for v in value]
        elif isinstance(value, str):
            text = value.strip()
            parsed = None
            if text.startswith("[") and text.endswith("]"):
                try:
                    parsed = ast.literal_eval(text)
                except (ValueError, SyntaxError):
                    parsed = None
            if isinstance(parsed, list):
                raw_items = [str(v) for v in parsed]
            else:
                # Plain text: drop leftover list punctuation before splitting
                raw_items = [re.sub(r"[\[\]']", "", text)]
        elif value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return []
        else:
            raw_items = [str(value)]

        items = []
        for raw in raw_items:
            for part in separator.split(raw):
                part = part.strip()
                if part:
                    items.append(part)
        return items

    df_clean = df[[column_name, 'job_link']].copy()
    df_clean[column_name] = df_clean[column_name].apply(to_items)

    # One item per row; cells that produced no items explode to NaN
    df_clean = df_clean.explode(column_name)
    df_clean = df_clean[df_clean[column_name].notna()]

    return df_clean.reset_index(drop=True)


In [52]:
# Expand 'Skills_And_Tools' so each skill gets its own row (job_link repeated per skill)
df_Skills_clean = split_and_clean(df_details, 'Skills_And_Tools')
print(df_Skills_clean.head())


                 Skills_And_Tools  \
0  E-Commerce Platform Management   
1                   Data Analysis   
2                 Microsoft Excel   
3                   sheets google   
4                             SEO   

                                            job_link  
0  https://wuzzuf.net/jobs/p/8creyckjvc8z-project...  
1  https://wuzzuf.net/jobs/p/8creyckjvc8z-project...  
2  https://wuzzuf.net/jobs/p/8creyckjvc8z-project...  
3  https://wuzzuf.net/jobs/p/8creyckjvc8z-project...  
4  https://wuzzuf.net/jobs/p/8creyckjvc8z-project...  


In [53]:
# Expand 'Job_Categories' so each category gets its own row (job_link repeated per category)
df_Cat_clean = split_and_clean(df_details, 'Job_Categories')
print(df_Cat_clean.head())


             Job_Categories                                           job_link
0  Marketing/PR/Advertising  https://wuzzuf.net/jobs/p/8creyckjvc8z-project...
1        Accounting/Finance  https://wuzzuf.net/jobs/p/r377ssty41so-busines...
2      Business Development  https://wuzzuf.net/jobs/p/r377ssty41so-busines...
3          Analyst/Research  https://wuzzuf.net/jobs/p/r377ssty41so-busines...
4                   Quality  https://wuzzuf.net/jobs/p/ailkpyyoxt9a-data-an...


In [54]:
# Scalar job details only: the two list-valued columns are left out here
df_details_clean = df_details.drop(columns=["Job_Categories", "Skills_And_Tools"])
df_details_clean

Unnamed: 0,Experience_Needed,Career_Level,Education_Level,Salary,job_link
0,1 To 3 Years,Entry Level (Junior Level / Fresh Grad),Bachelor'S Degree,Confidential,https://wuzzuf.net/jobs/p/8creyckjvc8z-project...
1,2 To 3 Years,Experienced (Non-Manager),Bachelor'S Degree,Confidential,https://wuzzuf.net/jobs/p/r377ssty41so-busines...
2,2 To 3 Years,Experienced (Non-Manager),Not Specified,Confidential,https://wuzzuf.net/jobs/p/ailkpyyoxt9a-data-an...
3,More Than 1 Year,Entry Level (Junior Level / Fresh Grad),Bachelor'S Degree,Confidential,https://wuzzuf.net/jobs/p/btmdbqyuc8v8-commerc...
4,3 To 7 Years,Experienced (Non-Manager),Bachelor'S Degree,Confidential,https://wuzzuf.net/jobs/p/taf92lwlbjtm-financi...
...,...,...,...,...,...
119,3 To 5 Years,Experienced (Non-Manager),Not Specified,Confidential,https://wuzzuf.net/jobs/p/bwcxadoallmg-rd-form...
120,3 To 6 Years,Manager,Bachelor'S Degree,15000 To 30000 EGP Per Month,https://wuzzuf.net/jobs/p/wruicly6zelr-senior-...
121,8 To 12 Years,Manager,Not Specified,Confidential,https://wuzzuf.net/jobs/p/cxs5lmimsm6f-chief-a...
122,2 To 5 Years,Experienced (Non-Manager),Bachelor'S Degree,Confidential,https://wuzzuf.net/jobs/p/vcmure4mgiky-ai-depl...


# Save Data

In [None]:
def save_df_to_csv(
    df,
    df_name,  # label of the DataFrame, used as the first part of the file name
    job_selected,
    country_selected,
    stage,
    save_path=r"D:data/output"
):
    """
    Save a DataFrame as a numbered CSV file without overwriting earlier runs.

    The file is named "<n>_<df_name>_<job>_<country>_<stage>.csv" (spaces in
    the job and country replaced by underscores), where <n> is one more than
    the highest number already used for exactly that name in `save_path`.

    Parameters:
    - df: DataFrame to write.
    - df_name: short label identifying the DataFrame.
    - job_selected: job title used for the search.
    - country_selected: country used for the search.
    - stage: suffix placed at the end of the name (e.g. "raw" or "clean").
    - save_path: output directory, created if it does not exist.
      NOTE(review): the default "D:data/output" has no slash after the drive
      letter, so on Windows it resolves relative to the current directory of
      drive D: - confirm this is intended.

    Returns:
    - None; the file is written and its name printed.
    """
    os.makedirs(save_path, exist_ok=True)

    base_name = (
        f"{df_name}_"
        f"{job_selected.replace(' ', '_')}_"
        f"{country_selected.replace(' ', '_')}_{stage}.csv"
    )

    # Match "<number>_<base_name>" exactly. A suffix test alone would also
    # count files of another dataset whose name merely ends the same way
    # (e.g. "Main_Page_..." when saving "Page_..."), inflating the counter.
    numbered_name = re.compile(rf"(\d+)_{re.escape(base_name)}")

    numbers = []
    for f in os.listdir(save_path):
        match = numbered_name.fullmatch(f)
        if match:
            numbers.append(int(match.group(1)))

    next_number = max(numbers, default=0) + 1

    file_name = f"{next_number}_{base_name}"
    full_path = os.path.join(save_path, file_name)

    # utf-8-sig writes a BOM at the start of the file
    df.to_csv(full_path, index=False, encoding="utf-8-sig")
    print(f"Saved: {file_name}")


In [56]:

# Every (DataFrame, label, stage) combination to export, raw frames first
exports = [
    (df_main_page, "Main_Page", "raw"),
    (df_details, "Job_Details", "raw"),
    (df_des_req, "Description_Requirements", "raw"),
    (df_main_page_clean, "Main_Page", "clean"),
    (df_details_clean, "Job_Details", "clean"),
    (df_Skills_clean, "Skills_clean", "clean"),
    (df_Cat_clean, "Categories", "clean"),
]

for frame, label, stage_name in exports:
    save_df_to_csv(frame, label, job_selected, country_selected, stage_name)



Saved: 1_Main_Page_data_analyst_Egypt_raw.csv
Saved: 1_Job_Details_data_analyst_Egypt_raw.csv
Saved: 1_Description_Requirements_data_analyst_Egypt_raw.csv
Saved: 1_Main_Page_data_analyst_Egypt_clean.csv
Saved: 1_Job_Details_data_analyst_Egypt_clean.csv
Saved: 1_Skills_clean_data_analyst_Egypt_clean.csv
Saved: 1_Categories_data_analyst_Egypt_clean.csv
