In [55]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import datetime
import time
import re # Import the re module for regular expressions


In [56]:
def clean_text(text):
    if not text:
        return ""
    text = ' '.join(text.split())  # Remove extra whitespace
    return text.strip()

In [61]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
def scrape_jobs(url):
    base_url_first,base_url_scend=url
    page = 1
    data=[]
    while True:
        url = f"{base_url_first}{page}{base_url_scend}"
        print(url)
        r = requests.get(url, headers=headers)
        print(r.status_code)

        if r.status_code != 200:
            break
            
        soup = BeautifulSoup(r.text, 'html.parser')
        job_list=soup.find('ul', class_='job-list')
        jobs = job_list.find_all('li', class_='post-id')
        print(len(jobs))
        if not jobs:
            break

              
        for job in jobs:
                try:
                    job_id = job.get('id', '')
                    
                    title_elem = job.find('a', class_='titreJob')
                    job_title = title_elem.get_text(strip=True) if title_elem else ''
                    job_url = title_elem['href'] if title_elem else ''
                    
                    company_img = job.find('img', class_='photo')
                    company_name = company_img['title'] if company_img else ''
                    
                    description = ''
                    requirements = ''
                    for info in job.find_all('div', class_='info'):
                        icon = info.find('i')
                        if not icon:
                            continue
                        if 'fa-industry' in icon.get('class', []):
                            description = clean_text(info.get_text(strip=True))
                        elif 'fa-binoculars' in icon.get('class', []):
                            requirements = clean_text(info.get_text(strip=True))
                    #print('1',requirements,'2',description)
                    
                    date_info = job.find('em', class_='date')
                    posted_date = ''
                    end_date = ''
                    if date_info:
                        date_text = date_info.get_text()
                        date_parts = [part.strip() for part in date_text.split('|')[0].split() if part.strip()]
                        
                        if 'du' in date_parts:
                            du_index = date_parts.index('du')
                            if du_index + 1 < len(date_parts):
                                posted_date = date_parts[du_index + 1]
                        
                        if 'au' in date_parts:
                            au_index = date_parts.index('au')
                            if au_index + 1 < len(date_parts):
                                end_date = date_parts[au_index + 1]
                    
                    sector = []
                    for li in job.find_all('li'):
                        if 'Secteur' in li.get_text():
                            sector = [a.get_text(strip=True) for a in li.find_all('a')]
                            break
                    
                    contract_type = ''
                    remote_work = ''
                    for li in job.find_all('li'):
                        text = li.get_text()
                        if 'Type de contrat' in text:
                            contract_type = li.find('a').get_text(strip=True) if li.find('a') else ''
                            # Extract remote work info if present
                            if 'Télétravail' in text:
                                remote_parts = text.split('Télétravail :')
                                if len(remote_parts) > 1:
                                    remote_work = remote_parts[1].strip()
                            break
                    
                    job_data = {
                        'id': job_id,
                        'title': job_title,
                        'company_name': company_name,
                        'url': job_url,
                        'company_description': description,
                        'requirements': requirements,
                        'posted_date': posted_date,
                        'end_date': end_date,
                        'sector': sector,
                        'contract_type': contract_type,
                        'remote_work': remote_work,
                       
                    }
                    data.append(job_data)
                    
                except Exception as e:
                    print(f"Error processing job {job_id}: {str(e)}")
                    continue
               

                
        print(f"Page {page} scraped with {len(jobs)} jobs")
        #print(job_data)
            
        
        
            
        time.sleep(15)  
        page += 1
    return data 

In [63]:


def scrape_jobs_old(url_parts):
   
    base_url_first, base_url_second = url_parts
    page = 1
    all_job_listings = []

    while True:
        
        current_url = f"{base_url_first}{page}{base_url_second}"
        print(f"Attempting to fetch: {current_url}")

        try:
            r = requests.get(current_url, headers=headers, timeout=30) # Add a timeout for robustness
            print(f"Status Code for page {page}: {r.status_code}")

            if r.status_code != 200:
                print(f"Non-200 status code ({r.status_code}) received, breaking loop.")
                break # Exit the loop if the request was not successful

            soup = BeautifulSoup(r.text, 'html.parser')

            # Find the main job list container
            job_list_ul = soup.find('ul', class_='job-list job-list2')

            if not job_list_ul:
                print(f"No job list container found on page {page}, breaking loop.")
                break # Exit if no job list is found (e.g., end of pages)

            # Iterate through each job list item (li)
            page_job_count = 0
            jobs_elem = job_list_ul.find_all('li')

            for job_li in jobs_elem:
                job = {}

                title_tag = job_li.find('h2')
                if title_tag:
                    anchor_tag = title_tag.find('a')
                    if anchor_tag:
                        job['title'] = anchor_tag.get_text(strip=True)
                        job['url'] = anchor_tag['href'] if anchor_tag['href'].startswith('http') else f"http://www.rekrute.com{anchor_tag['href']}"

                date_info = job_li.find('em', class_='date')
                if date_info:
                    spans = date_info.find_all('span')
                    if len(spans) >= 2:
                        job['posted_date'] = spans[0].get_text(strip=True)
                        job['end_date'] = spans[1].get_text(strip=True)

                    num_positions_text_full = date_info.get_text(separator=' ', strip=True)
                    match = re.search(r'Postes proposés:\s*(\d+)', num_positions_text_full)
                    if match:
                        job['number_of_positions'] = match.group(1).strip()
                    else:
                        job['number_of_positions'] = ''


                info_divs = job_li.find_all('div', class_='info')

                for info_div in info_divs:
                    icon_tag = info_div.find('i')
                    icon_class = ''
                    if icon_tag and 'class' in icon_tag.attrs:
                        icon_class = ' '.join(icon_tag['class'])

                    if "fa-info-circle" in icon_class: 
                        sector_li = info_div.find('li', string=lambda text: text and "Secteur d'activité" in text)
                        if sector_li:
                            sector_link = sector_li.find('a')
                            if sector_link:
                                job['sector'] = sector_link.get_text(strip=True)

                        function_li = info_div.find('li', string=lambda text: text and "Fonction" in text)
                        if function_li:
                            functions = [a.get_text(strip=True) for a in function_li.find_all('a')]
                            if functions:
                                job['function'] = ', '.join(functions)

                    elif "fa-industry" in icon_class: # This div contains the company description
                        company_description_span = info_div.find('span')
                        if company_description_span:
                            job['company_description'] = company_description_span.get_text(strip=True)

                    elif "fa-binoculars" in icon_class: # This div contains what we'll now call 'requirements'
                        requirements_span = info_div.find('span')
                        if requirements_span:
                            job['requirements'] = requirements_span.get_text(strip=True)

                if job: # Check if the 'job' dictionary has any data
                    all_job_listings.append(job)
                page_job_count += 1

            if page_job_count == 0:
                print(f"No jobs found on page {page}, breaking loop.")
                break # Break if a page returns successfully but contains no job listings

        except requests.exceptions.RequestException as e:
            print(f"Request error on page {page}: {e}")
            break # Break on network or request-related errors
        except Exception as e:
            print(f"An unexpected error occurred on page {page}: {e}")
            break # Catch any other unexpected errors

        time.sleep(15) # Wait for 15 seconds to avoid overloading the server
        page += 1 # Move to the next page

    return all_job_listings

In [59]:
urls=[ ("https://www.rekrute.com/offres.html?p=","&s=3&o=1&sectorId%5B0%5D=24"),
    ("https://web.archive.org/web/20210226004856/https://www.rekrute.com/offres.html?p=","&s=3&o=1&sectorId%5B0%5D=24"),]

urls_old=[
    ("https://web.archive.org/web/20181109202217/http://www.rekrute.com:80/offres.html?s=3&p=","&o=1&sectorId[0]=24"),
    ("https://web.archive.org/web/20180906114435/http://www.rekrute.com:80/offres.html?s=3&p=","&o=1&sectorId[0]=24"),
    ("https://web.archive.org/web/20180706175502/http://www.rekrute.com:80/offres.html?s=3&p=","&o=1&sectorId[0]=24"),
    ("https://web.archive.org/web/20180506132020/http://www.rekrute.com:80/offres.html?s=3&p=","&o=1&sectorId[0]=24"),
    ("https://web.archive.org/web/20180306235847/http://www.rekrute.com:80/offres.html?s=3&p=","&o=1&sectorId[0]=24"),
    ("https://web.archive.org/web/20171229180112/http://www.rekrute.com:80/offres.html?s=3&p=","&o=1&sectorId[0]=24"),
    ("https://web.archive.org/web/20171228054159/http://www.rekrute.com:80/offres.html?s=3&p=","&o=1&sectorId[0]=24")
]


In [64]:
data1=[]
data2=[]
for url in urls:
    time.sleep(10)  
    data1+=scrape_jobs(url)
for url in urls_old:
    time.sleep(10)  
    data2+=scrape_jobs_old(url)


https://www.rekrute.com/offres.html?p=1&s=3&o=1&sectorId%5B0%5D=24
200
50
Error processing job 172122: 'title'
Error processing job 172119: 'title'
Error processing job 172116: 'title'
Page 1 scraped with 50 jobs
https://www.rekrute.com/offres.html?p=2&s=3&o=1&sectorId%5B0%5D=24
200
50
Error processing job 172113: 'title'
Error processing job 171898: 'title'
Page 2 scraped with 50 jobs
https://www.rekrute.com/offres.html?p=3&s=3&o=1&sectorId%5B0%5D=24
200
50
Error processing job 171089: 'title'
Error processing job 170634: 'title'
Page 3 scraped with 50 jobs
https://www.rekrute.com/offres.html?p=4&s=3&o=1&sectorId%5B0%5D=24
200
50
Page 4 scraped with 50 jobs
https://www.rekrute.com/offres.html?p=5&s=3&o=1&sectorId%5B0%5D=24
200
29
Page 5 scraped with 29 jobs
https://www.rekrute.com/offres.html?p=6&s=3&o=1&sectorId%5B0%5D=24
200
0
https://web.archive.org/web/20210226004856/https://www.rekrute.com/offres.html?p=1&s=3&o=1&sectorId%5B0%5D=24
200
50
Error processing job 121613: 'title'
Erro

In [65]:
df1 = pd.DataFrame(data1)
df1.describe()

Unnamed: 0,id,title,company_name,url,company_description,requirements,posted_date,end_date,sector,contract_type,remote_work
count,451,451.0,451,451.0,451,451,451,451,451,451,451
unique,286,229.0,59,407.0,81,269,49,63,9,6,4
top,171420,,Sofrecom Maroc,,Vous êtes en quête de nouveaux challenges et v...,about the role Vos principales missions seront...,22/05/2025,09/07/2025,[Informatique],CDI,Hybride
freq,2,45.0,103,45.0,103,6,35,35,358,408,320


In [66]:
df2 = pd.DataFrame(data2)
df2.describe()

Unnamed: 0,title,url,posted_date,end_date,number_of_positions,company_description,requirements
count,1623,1623,1623,1623,1623,1623,1623
unique,475,495,59,66,15,118,445
top,Concepteur Développeur (H/F) | Casablanca (Maroc),http://www.rekrute.com/web/20250605110347/http...,22/05/2025,09/07/2025,1,Vous êtes en quête de nouveaux challenges et v...,about the role\n\n\tVos principales missions s...
freq,28,7,126,98,929,350,21


In [74]:
df1.head()

Unnamed: 0,id,title,company_name,url,company_description,requirements,posted_date,end_date,sector,contract_type,remote_work
0,172312,Ingénieur Full-Stack | Casablanca (Maroc),Deloitte Extended Services,/offre-emploi-ingenieur-full-stack-recrutement...,«Tous nos postes sont ouverts au télétravail.»...,Quel sera votre rôle dans la #TeamDeloitte ? D...,10/06/2025,10/08/2025,[Informatique],CDI,Hybride
1,172303,Test Manager Anglophone (H/F) | Rabat (Maroc),AXA Services Maroc,/offre-emploi-test-manager-anglophone-hf-recru...,AXA Services Maroc est le centre d'expertise e...,"Rattaché(e) au Responsable testing, le Test Ma...",10/06/2025,10/08/2025,[Informatique],CDI,Hybride
2,172286,Digital Campaign Production Manager | Casablan...,Orange Business,/offre-emploi-digital-campaign-production-mana...,Orange Business est l’entité du Groupe Orange ...,"About the role Within the ""Campaign Production...",10/06/2025,10/08/2025,"[Informatique, Telecom]",CDI,Hybride
3,170818,Click & Webmaster | Casablanca (Maroc),Orange Business,/offre-emploi-click-webmaster-recrutement-oran...,Orange Business est l’entité du Groupe Orange ...,"As a Webmaster & Digital analytics specialist,...",10/06/2025,21/06/2025,"[Informatique, Telecom]",CDI,Hybride
4,170817,Click User Support Manager | Casablanca (Maroc),Orange Business,/offre-emploi-click-user-support-manager-recru...,Orange Business est l’entité du Groupe Orange ...,Role is with Orange Wholesale Division -Orange...,10/06/2025,21/06/2025,"[Informatique, Telecom]",CDI,Hybride


In [73]:
df2.head()

Unnamed: 0,title,url,posted_date,end_date,number_of_positions,company_description,requirements
0,QA Software Architect - $5000/month - Remote W...,http://www.rekrute.com/web/20181109202217/http...,09/11/2018,09/01/2019,10,Who is Crossover? \n\nCrossover connects talen...,We're running an Online Hiring Event this comi...
1,L3 Product Support Engineer - $8300/month - Re...,http://www.rekrute.com/web/20181109202217/http...,09/11/2018,09/01/2019,10,Who is Crossover? \n\nCrossover connects talen...,We're running an Online Hiring Event this comi...
2,L2 Customer Support Architect - $5000/month - ...,http://www.rekrute.com/web/20181109202217/http...,09/11/2018,09/01/2019,10,Who is Crossover? \n\nCrossover connects talen...,We're running an Online Hiring Event this comi...
3,L1 Customer Support Engineer - $2500/month - R...,http://www.rekrute.com/web/20181109202217/http...,09/11/2018,09/01/2019,10,Who is Crossover? \n\nCrossover connects tal...,We're running an Online Hiring Event this comi...
4,Financial Analyst (English) - $2500/month - Re...,http://www.rekrute.com/web/20181109202217/http...,09/11/2018,09/01/2019,10,Who is Crossover? \n\nCrossover connects talen...,We're running an Online Hiring Event this comi...


In [75]:
df1.to_csv('rekrutedatasete1.csv', index=False)



In [76]:
df2.to_csv('rekrutedatasete2.csv', index=False)