# LIBRARY IMPORTS

In [None]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests

# VARIABLE DECLARATIONS

In [40]:
# Scraper configuration shared by the cells below.
# (A `global` statement at module level has no effect, so none is used here.)
URL = "https://realpython.github.io/fake-jobs/"  # page requested by fetch_data()
HEADERS = {"User-Agent": "Mozilla/5.0"}  # headers sent with that request
JOBS = []  # placeholder; rebound to a DataFrame once the pipeline has run

# DATA REQUEST

In [41]:
def fetch_data(timeout=10):
  """Download the page at ``URL`` and parse it with BeautifulSoup.

  Args:
    timeout: Seconds to wait for the server before the request is aborted.

  Returns:
    A ``BeautifulSoup`` document (built with ``html.parser``) when the server
    answers with HTTP 200; otherwise the dict
    ``{"error": "Failed to fetch data"}``.

  Raises:
    requests.RequestException: If the request itself fails (connection
      problem, timeout, ...). Such errors are not converted into the error
      dict; they propagate to the caller.
  """
  # URL and HEADERS are only read here, so no `global` declaration is needed.
  # Without an explicit timeout, requests may wait indefinitely on a server
  # that never responds.
  response = requests.get(URL, headers=HEADERS, timeout=timeout)

  if response.status_code != 200:
    return {"error": "Failed to fetch data"}
  return BeautifulSoup(response.content, "html.parser")

# EXTRACTIONS

In [42]:
def data_extraction(data):
  """Collect the job cards of a parsed page into a DataFrame.

  Args:
    data: Parsed document exposing ``find_all`` (e.g. the BeautifulSoup
      object returned by ``fetch_data``).

  Returns:
    A ``pandas.DataFrame`` with one row per ``div.column.is-half`` card and
    the columns ``job_title``, ``company_name``, ``location`` and
    ``date_posted``. The columns exist even when no card is found, and a
    field whose element is absent from a card is stored as ``None``.
  """
  columns = ["job_title", "company_name", "location", "date_posted"]

  def _text_of(element):
    # `find` yields None when nothing matches; calling `.text` on that would
    # raise AttributeError and abort the whole extraction.
    return element.text.strip() if element is not None else None

  # One dict per job card; keys match `columns`.
  rows = [
    {
      "job_title": _text_of(card.find("h2", class_="title is-5")),
      "company_name": _text_of(card.find("h3", class_="subtitle is-6 company")),
      "location": _text_of(card.find("p", class_="location")),
      "date_posted": _text_of(card.find("time")),
    }
    for card in data.find_all("div", class_="column is-half")
  ]

  print("Data fetched successfully")
  # Passing the column list keeps the schema stable for an empty result, so
  # later steps can still address the columns by name.
  return pd.DataFrame(rows, columns=columns)

# TRANSFORMATIONS

In [43]:
def data_transformation(data):
  """Clean the extracted job rows and give the columns presentation names.

  Steps: parse ``date_posted``, derive ``year``, reformat the date as
  ``"%A, %d %B"``, split ``location`` into city and state, reorder the
  columns and rename them.

  Args:
    data: DataFrame with the columns ``job_title``, ``company_name``,
      ``location`` and ``date_posted``; a ``link`` column is optional.
      The input is not modified.

  Returns:
    A new DataFrame with the columns, in order: "Job Title", "Company Name",
    "Location (City)", "Location (State)",
    "Date Posted (Day, Month, Day of Week)", "Date Posted (Year)" and, when
    the input had a ``link`` column, "Link". Columns other than the ones
    listed above are dropped.
  """
  # Work on a copy so the caller's DataFrame is left untouched.
  result = data.copy()

  # CONVERT TIME TO DATETIME, THEN DERIVE YEAR AND DISPLAY STRING
  posted = pd.to_datetime(result["date_posted"])
  result["year"] = posted.dt.year
  result["date_posted"] = posted.dt.strftime("%A, %d %B")  # e.g. "Thursday, 08 April"

  # SPLIT LOCATION INTO CITY AND STATE
  # Split on the LAST ", " only, and force exactly two result columns, so a
  # location with no comma (state becomes missing) or with several commas
  # (extra parts stay in the city) cannot break the assignment.
  parts = result["location"].str.rsplit(", ", n=1, expand=True).reindex(columns=[0, 1])
  result["location"] = parts[0]
  result["location_state"] = parts[1]

  # REARRANGE COLUMNS (by name, so an optional "link" column cannot shift
  # the positions of the others)
  ordered = ["job_title", "company_name", "location", "location_state", "date_posted", "year"]
  if "link" in result.columns:
    ordered.append("link")
  result = result[ordered]

  # RENAME COLUMNS
  # NOTE(review): the date label says "Day, Month, Day of Week" while the
  # format above emits weekday, day, month. The label is kept unchanged to
  # preserve the existing output header.
  new_cols = {
    "job_title": "Job Title",
    "company_name": "Company Name",
    "location": "Location (City)",
    "location_state": "Location (State)",
    "date_posted": "Date Posted (Day, Month, Day of Week)",
    "year": "Date Posted (Year)",
    "link": "Link",
  }
  # A single rename call; mapping keys that are not present are ignored.
  result = result.rename(columns=new_cols)

  print("Data transformation completed")
  return result

# LOADING DATA

In [None]:
def load_data(data, path="fake_jobs - Dan Aremu.csv"):
  """Write a DataFrame to a CSV file without the index column.

  Args:
    data: DataFrame to save.
    path: Destination file; defaults to the file name this notebook has
      always written, so existing ``load_data(df)`` calls behave as before.
  """
  data.to_csv(path, index=False)
  print("Data saved successfully")

In [45]:
# Fetch only when `raw_data` is not defined yet, so re-running this cell
# reuses the page downloaded earlier in the session.
if "raw_data" not in locals():
  raw_data = fetch_data()

# fetch_data() reports failure by returning a dict with an "error" key.
# Attribute access (`raw_data.error`) raises AttributeError on that dict, and
# on a BeautifulSoup document it looks up an <error> tag instead of an error
# flag, so the result is checked explicitly.
fetch_failed = isinstance(raw_data, dict) and "error" in raw_data

if fetch_failed:
  print(raw_data["error"])
else:
  # Extract -> transform -> save.
  JOBS = data_extraction(raw_data)
  JOBS = data_transformation(JOBS)
  load_data(JOBS)

Data fetched successfully
Data transformation completed
Data saved successfully


In [46]:
# PRINT ALL TRANSFORMED JOBS LIST
JOBS

Unnamed: 0,Job Title,Company Name,Location (City),Location (State),"Date Posted (Day, Month, Day of Week)",Date Posted (Year)
0,Senior Python Developer,"Payne, Roberts and Davis",Stewartbury,AA,"Thursday, 08 April",2021
1,Energy engineer,Vasquez-Davidson,Christopherville,AA,"Thursday, 08 April",2021
2,Legal executive,"Jackson, Chambers and Levy",Port Ericaburgh,AA,"Thursday, 08 April",2021
3,Fitness centre manager,Savage-Bradley,East Seanview,AP,"Thursday, 08 April",2021
4,Product manager,Ramirez Inc,North Jamieview,AP,"Thursday, 08 April",2021
...,...,...,...,...,...,...
95,Museum/gallery exhibitions officer,"Nguyen, Yoder and Petty",Lake Abigail,AE,"Thursday, 08 April",2021
96,"Radiographer, diagnostic",Holder LLC,Jacobshire,AP,"Thursday, 08 April",2021
97,Database administrator,Yates-Ferguson,Port Susan,AE,"Thursday, 08 April",2021
98,Furniture designer,Ortega-Lawrence,North Tiffany,AA,"Thursday, 08 April",2021
