In [None]:
import psycopg2
import os
from dotenv import load_dotenv
from bs4 import BeautifulSoup

# load_dotenv() copies variables from a .env file (when one is found) into the
# process environment, so the credentials below never live in the notebook.
load_dotenv()

# Each setting is None when the corresponding variable is not defined.
database_name = os.getenv("DATABASE_NAME")
database_host = os.getenv("DATABASE_HOST")
database_port = os.getenv("DATABASE_PORT")
database_username = os.getenv("DATABASE_USERNAME")
database_password = os.getenv("DATABASE_PASSWORD")

# Single PostgreSQL connection shared by the cells that follow.
connection = psycopg2.connect(
    host=database_host,
    port=database_port,
    database=database_name,
    user=database_username,
    password=database_password,
)

In [None]:
def create_table_if_does_not_exist(conn, table_name: str, create_table_sql: str) -> int:
    """Run ``create_table_sql`` unless a table named ``table_name`` already exists.

    Existence is decided by looking ``table_name`` up in
    ``information_schema.tables``; the lookup is not restricted to a schema.

    Args:
        conn: Open database connection; it must provide ``cursor()``,
            ``commit()`` and ``rollback()``.
        table_name: Name to look up. Callers in this notebook pass it in lower
            case, which is how PostgreSQL stores unquoted identifiers.
        create_table_sql: Complete ``CREATE TABLE`` statement to execute when
            the table is missing.

    Returns:
        0 when the table was created or already existed, 1 when any step
        failed. Failures are printed and rolled back, never raised.
    """
    cursor = None
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT EXISTS(SELECT * FROM information_schema.tables WHERE table_name=%s);",
            (table_name,),
        )

        exists = cursor.fetchone()[0]

        if not exists:
            cursor.execute(create_table_sql)
            conn.commit()
            print(f"Table {table_name} created successfully")
        else:
            print(f"Table {table_name} already exists")

        return 0

    # psycopg2.DatabaseError derives from Exception, so this single clause
    # covers both database and non-database failures.
    except Exception as error:
        print(f"Error: {error}")
        # Roll back the connection we were given, not a notebook-level global.
        conn.rollback()
        return 1

    finally:
        # cursor is still None when conn.cursor() itself raised.
        if cursor is not None:
            cursor.close()

In [None]:
# Create scrapes_extracted table
# The table starts with only its primary key; the title and department_id
# columns are added by the ALTER TABLE cell further down.
create_scrapes_extracted_sql = "CREATE TABLE scrapes_extracted (scrape_id INT PRIMARY KEY)"

create_table_if_does_not_exist(
    create_table_sql=create_scrapes_extracted_sql,
    table_name="scrapes_extracted",
    conn=connection,
)

In [None]:
# Create departments table
# One row per distinct department name; department_id is generated by SERIAL.
create_departments_sql = (
    "CREATE TABLE Departments (\n"
    "department_id SERIAL PRIMARY KEY,\n"
    "department_name TEXT UNIQUE NOT NULL\n"
    ")"
)

# The lookup name is lower case because PostgreSQL folds the unquoted
# identifier "Departments" to lower case when the table is created.
create_table_if_does_not_exist(
    create_table_sql=create_departments_sql,
    table_name="departments",
    conn=connection,
)

In [None]:
# Add columns to scrapes_extracted
# Adds the title and department_id columns when they are missing and
# (re)creates the fk_department foreign key to Departments. Dropping the
# constraint first makes the cell safe to re-run.
cursor = connection.cursor()
try:
    cursor.execute("""ALTER TABLE scrapes_extracted
                   ADD COLUMN IF NOT EXISTS title TEXT,
                   ADD COLUMN IF NOT EXISTS department_id INTEGER;

                   ALTER TABLE scrapes_extracted DROP CONSTRAINT IF EXISTS fk_department;

                   ALTER TABLE scrapes_extracted ADD CONSTRAINT fk_department
                   FOREIGN KEY (department_id)
                   REFERENCES Departments(department_id)""")
    # Commit only when every statement succeeded.
    connection.commit()
# psycopg2.DatabaseError derives from Exception, so one clause is enough.
except Exception as error:
    print(f"Error: {error}")
    # Discard the partial change and leave the connection usable for later cells.
    connection.rollback()
finally:
    cursor.close()

In [None]:
def extract_job_title_from_html(html_content: str) -> str | None:
    soup = BeautifulSoup(markup=html_content, features="lxml")
    job_title = soup.find("h1")
    if job_title:
        return job_title.text.strip()
    else:
        return None
    
def extract_department_from_html(html_content: str) -> str | None:
    soup = BeautifulSoup(markup=html_content, features="lxml")
    department_name = soup.find("p", class_="csr-page-subtitle")
    if department_name:
        return department_name.text.strip()
    else:
        return None

In [None]:
# scrape job title
# Extract a title for every scrape that does not have one stored yet and save
# it in scrapes_extracted. The whole batch is one transaction: it is committed
# only when every row was processed, and rolled back on the first error.
cursor = connection.cursor()
try:
    # NOTE(review): scrape_ids <= 5 are skipped; the reason is not visible in
    # this notebook - confirm before changing the threshold.
    cursor.execute("""SELECT scrapes.scrape_id, scrapes.html
FROM scrapes
LEFT JOIN scrapes_extracted on scrapes.scrape_id = scrapes_extracted.scrape_id
WHERE scrapes_extracted.title IS NULL
AND scrapes.scrape_id > 5
""")
    # Materialise the result first: the same cursor is reused for the writes.
    scrape_rows = cursor.fetchall()

    for scrape_id, scrape_html in scrape_rows:
        job_title = extract_job_title_from_html(scrape_html)
        if not job_title:
            print(f"Job title not found for scrape_id {scrape_id}")
            continue

        print(job_title)
        # The SELECT also returns scrapes that already have a scrapes_extracted
        # row with a NULL title. A plain INSERT would violate the primary key
        # for those and abort the transaction, so upsert instead.
        cursor.execute(
            """INSERT INTO scrapes_extracted (scrape_id, title)
                       VALUES (%s, %s)
                       ON CONFLICT (scrape_id) DO UPDATE SET title = EXCLUDED.title""",
            (int(scrape_id), job_title),
        )

    connection.commit()
# psycopg2.DatabaseError derives from Exception, so one clause is enough.
except Exception as error:
    print(f"Error: {error}")
    # Leave the connection usable for later cells.
    connection.rollback()
finally:
    cursor.close()

In [None]:
# scrape job department

def get_or_create_department(department_name: str, cursor):
    """Return the ``department_id`` for ``department_name``, inserting it if new.

    Args:
        department_name: Department name to look up in ``Departments``.
        cursor: Open database cursor used for both the lookup and the insert.
            Nothing is committed here; that is left to the caller.

    Returns:
        The id of the existing row, or of the row that was just inserted.
    """
    name_params = (department_name,)

    cursor.execute("SELECT department_id from Departments WHERE department_name = %s", name_params)
    department_row = cursor.fetchone()

    if not department_row:
        # Unknown department: insert it and read the generated id back.
        cursor.execute("INSERT INTO Departments (department_name) VALUES (%s) RETURNING department_id", name_params)
        department_row = cursor.fetchone()

    return department_row[0]

def update_scrapes_extracted_with_department(scrape_id, department_id, cursor):
    """Set ``department_id`` on the ``scrapes_extracted`` row for ``scrape_id``.

    Runs a single UPDATE on ``cursor`` and does not commit. If no row has that
    ``scrape_id`` the statement changes nothing.
    """
    update_sql = "UPDATE scrapes_extracted SET department_id = %s WHERE scrape_id = %s"
    # Parameter order follows the placeholders: new value first, then the key.
    cursor.execute(update_sql, (department_id, scrape_id))

# Resolve a department for every scrape that has none stored yet. The whole
# batch is one transaction: it is committed only when every row was processed,
# and rolled back on the first error.
cursor = connection.cursor()
try:
    # NOTE(review): scrape_ids <= 5 are skipped; the reason is not visible in
    # this notebook - confirm before changing the threshold.
    cursor.execute("""SELECT scrapes.scrape_id, scrapes.html
FROM scrapes
LEFT JOIN scrapes_extracted on scrapes.scrape_id = scrapes_extracted.scrape_id
WHERE scrapes_extracted.department_id IS NULL
AND scrapes.scrape_id > 5
""")
    # Materialise the result first: the same cursor is reused for the writes.
    pending_scrapes = cursor.fetchall()

    for scrape_id, scrape_html in pending_scrapes:
        department = extract_department_from_html(scrape_html)
        if not department:
            print(f"Department not found for scrape_id {scrape_id}")
            continue

        print(department)
        department_id = get_or_create_department(department_name=department, cursor=cursor)
        print(department_id)
        update_scrapes_extracted_with_department(scrape_id=scrape_id, department_id=department_id, cursor=cursor)
        # The LEFT JOIN also returns scrapes with no scrapes_extracted row at
        # all; the UPDATE matches nothing for those, so report it.
        if cursor.rowcount == 0:
            print(f"No scrapes_extracted row for scrape_id {scrape_id}; department not stored")

    connection.commit()
# psycopg2.DatabaseError derives from Exception, so one clause is enough.
except Exception as error:
    print(f"Error: {error}")
    # Leave the connection usable for later cells.
    connection.rollback()
finally:
    cursor.close()