## **Job Scrapper - LINKEDIN**

In [None]:
import csv
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import urllib.parse

In [None]:
API_KEY = "f3a2132b-8986-4975-b0d8-47d1c186e0dc"

In [None]:
BASE_URL = 'https://ng.linkedin.com'

In [None]:
def get_url(position, location):
    template = BASE_URL + '/jobs/search?keywords={}&location={}&pageNumber=24&position=1'
    url = template.format(position, location)
    return url

In [None]:
def get_scrapeops_url(url):
    payload = {'api_key': API_KEY, 'url': url}
    proxy_url = 'https://proxy.scrapeops.io/v1/?' + urllib.parse.urlencode(payload)
    return proxy_url

In [None]:
url = get_scrapeops_url(get_url('data analyst', 'lagos'))

In [None]:
response = requests.get(url)
response 

In [None]:
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
cards = soup.find('ul', 'jobs-search__results-list').find_all('div', 'base-card')

In [None]:
len(cards)

In [None]:
card = cards[0]
card

In [None]:
job_url = card.find('a', 'base-card__full-link').get('href')
job_url

In [None]:
job_title = card.find('h3', 'base-search-card__title').text.strip()
job_title

In [None]:
company = card.find('h4', 'base-search-card__subtitle').text.strip()
company

In [None]:
location = card.find('span', 'job-search-card__location').text.strip().split(',')
location = (location[-2] + ', ' + location[-1]).strip()
location

In [None]:
post_date_str = card.time.text.strip()
post_date_str

In [None]:
post_date = card.time.get('datetime')
post_date

### Generalize model for a single record

In [None]:
def extract_data(card):
    """Extracts job record from a single record"""
    job_url = card.find('a', 'base-card__full-link').get('href')

    job_title = card.find('h3', 'base-search-card__title').text.strip()

    company = card.find('h4', 'base-search-card__subtitle').text.strip()

    location = card.find('span', 'job-search-card__location').text.strip().split(',')

    if len(location) > 1:
        location = (location[-2] + ', ' + location[-1]).strip()
    else: 
        location = location[0].strip()
        
    post_date_str = card.time.text.strip()
    post_date = card.time.get('datetime')

    today = datetime.today().strftime('%Y-%m-%d')

    record = (job_url, job_title, company, location, post_date, post_date_str, today)

    return record

In [None]:
extract_data(card)

In [None]:
records = []

for card in cards:
    record = extract_data(card)
    records.append(record)

records

### Putting it all together

In [149]:
import csv
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import urllib.parse

API_KEY = "f3a2132b-8986-4975-b0d8-47d1c186e0dc"

BASE_URL = 'https://ng.linkedin.com'

def get_scrapeops_url(url):
    payload = {'api_key': API_KEY, 'url': url}
    proxy_url = 'https://proxy.scrapeops.io/v1/?' + urllib.parse.urlencode(payload)
    return proxy_url

def get_url(position, location):
    template = BASE_URL + '/jobs/search?keywords={}&location={}&pageNumber=24&position=1'
    url = template.format(position, location)
    return url

def extract_data(card):
    """Extracts job record from a single record"""
    job_url = card.find('a', 'base-card__full-link').get('href')

    job_title = card.find('h3', 'base-search-card__title').text.strip()

    company = card.find('h4', 'base-search-card__subtitle').text.strip()

    location = card.find('span', 'job-search-card__location').text.strip().split(',')

    if len(location) > 1:
        location = (location[-2] + ', ' + location[-1]).strip()
    else: 
        location = location[0].strip()
        
    post_date_str = card.time.text.strip()
    post_date = card.time.get('datetime')

    today = datetime.today().strftime('%Y-%m-%d')

    record = (job_title, job_url, company, location, post_date, post_date_str, today)

    return record

def main(position, location):
    """Main function"""

    records = []
    count = 0

    url = get_url(position, location)
    proxy_url = get_scrapeops_url(url)

    try:
        r = requests.get(proxy_url)
    
        

        print("Rerieving job data from", url)

        if(r.status_code != 200):
            print("Error retrieving job data from ", url)

        soup = BeautifulSoup(r.text, 'html.parser')
        jobs = soup.find('ul', 'jobs-search__results-list').find_all('div', 'base-card')
        for job in jobs:
            count += 1

            record = extract_data(job)
            print("Retrieved record", count)
            print(record)
            records.append(record)

        with open('./data/job_data_linkedin.csv', 'a', newline='') as csvfile:
            fieldnames = ['job_title', 'job_url', 'company', 'location', 'post_date_str', 'post_date', 'today']
            writer = csv.writer(csvfile)
            writer.writerow(fieldnames)
            writer.writerows(records)
            
    except requests.exceptions.ConnectionError:
        print('Connection Error')
        quit


main('data analyst', 'lagos')

Rerieving job data from https://ng.linkedin.com/jobs/search?keywords=data analyst&location=lagos&pageNumber=24&position=1
Retrieved record 1
('Data Analyst (Remote) at GoMoney', 'https://ng.linkedin.com/jobs/view/data-analyst-remote-at-gomoney-at-gomoney-3759763199?refId=LTLisuHJeUsxRcgrRGJC3A%3D%3D&trackingId=a56LUbMPGHWa6ken5vmcKQ%3D%3D&position=1&pageNum=0&trk=public_jobs_jserp-result_search-card', 'gomoney', 'Lagos State,  Nigeria', '2023-11-14', '2 weeks ago', '2023-12-04')
Retrieved record 2
('Data Analyst at Mopheth Nigeria Limited', 'https://ng.linkedin.com/jobs/view/data-analyst-at-mopheth-nigeria-limited-at-mopheth-nigeria-limited-3763893364?refId=LTLisuHJeUsxRcgrRGJC3A%3D%3D&trackingId=DgsouZJRRlIv1uM1TErFfg%3D%3D&position=2&pageNum=0&trk=public_jobs_jserp-result_search-card', 'Mopheth Nigeria Limited', 'Lagos State,  Nigeria', '2023-11-22', '1 week ago', '2023-12-04')
Retrieved record 3
('Data Analyst at BrandRegimen SFS Limited', 'https://ng.linkedin.com/jobs/view/data-ana

In [147]:
%reset -f