In [None]:
import psycopg2
import os
from dotenv import load_dotenv
import pandas as pd
from matplotlib.pyplot import subplots

# Load environment variables through python-dotenv before reading them below.
load_dotenv()

# Connection settings are taken from the environment; a variable that is not
# set comes back as None.
database_name = os.getenv("DATABASE_NAME")
database_host = os.getenv("DATABASE_HOST")
database_port = os.getenv("DATABASE_PORT")
database_username = os.getenv("DATABASE_USERNAME")
database_password = os.getenv("DATABASE_PASSWORD")

# One psycopg2 connection, passed to the query helper by the cells below.
connection = psycopg2.connect(
    host=database_host,
    port=database_port,
    database=database_name,
    user=database_username,
    password=database_password,
)

In [None]:
def sql_to_dataframe(conn, query) -> pd.DataFrame:
    """
    Run a SELECT query and return its result set as a pandas DataFrame.

    Args:
        conn: An open DB-API connection (a psycopg2 connection in this
            notebook). It is not closed by this function.
        query: The SQL text to execute. It is sent as-is, so it must not be
            assembled from untrusted input.

    Returns:
        A DataFrame with one row per result row and columns named after the
        cursor's column descriptions. If executing the query fails, the error
        is printed, the connection's transaction is rolled back, and the
        integer 1 is returned instead (legacy sentinel kept so existing
        callers are not broken).

    Raises:
        Any exception raised while fetching rows or reading the column
        descriptions is propagated; the cursor is still closed first.
    """
    cursor = conn.cursor()
    try:
        try:
            cursor.execute(query)
        except Exception as error:
            print(f"Error: {error}")
            # PostgreSQL rejects every later statement on this connection until
            # the failed transaction is rolled back, so reset it here.
            conn.rollback()
            return 1
        # The cursor yields a list of tuples; column names come from its description.
        rows = cursor.fetchall()
        colnames = [desc[0] for desc in cursor.description]
    finally:
        # Runs on success, on the early return above, and on fetch errors.
        cursor.close()
    return pd.DataFrame(rows, columns=colnames)

# Jobs live on Wednesdays

In [None]:
# Number of distinct scrapeid values per calendar date of checked_time, for
# rows checked after 2024-01-10 and excluding the single timestamp
# 2024-02-10 01:36:09.964627+00.
# The ordering is applied to the grouped result (an ORDER BY inside a subquery
# does not guarantee the order of the GROUP BY output), because the plot below
# joins the points in row order. The aliases pin the "date" and "count" column
# names that the following cells index by.
jobs_live_df = sql_to_dataframe(conn=connection, query="""
SELECT DATE(checked_time) AS date, COUNT(DISTINCT scrapeid) AS count
FROM sitemap_entries
WHERE checked_time > '2024-01-10'
  AND checked_time <> '2024-02-10 01:36:09.964627+00'
GROUP BY DATE(checked_time)
ORDER BY DATE(checked_time)
""")

In [None]:
# Parse the date column into pandas datetimes so the .dt accessor is available.
jobs_live_df["date"] = pd.to_datetime(jobs_live_df["date"])

# Keep only Wednesdays: dayofweek counts from Monday=0, so Wednesday is 2.
wednesdays_df = jobs_live_df.loc[
    lambda frame: frame["date"].dt.dayofweek == 2
]

In [None]:
# Line chart of the per-day count for each Wednesday, with red "+" markers.
fig, ax = subplots(figsize=(10, 6))
ax.plot(
    wednesdays_df["date"],
    wednesdays_df["count"],
    marker="+",
    markeredgecolor="red",
)

# Start the y-axis at zero so the counts are not visually exaggerated.
ax.set_ylim(bottom=0)
ax.set(
    title="Number of jobs live on Civil Service Jobs",
    ylabel="Number of jobs live",
)

# One tick per plotted Wednesday, rotated so the dates do not overlap.
ax.set_xticks(wednesdays_df["date"])
ax.tick_params(axis="x", rotation=70)

# Leave room under the axes for the rotated tick labels and the caption.
fig.subplots_adjust(bottom=0.25)
fig.text(
    0.5,
    0.01,
    "(Quick, non peer-reviewed visualisation based on unofficial once-daily web scraping. \n"
    " Civil Service Jobs data is available under the Open Government Licence v3.0)",
    ha="center",
)

# Total new jobs added per day

In [None]:
# One row per jcode: its distinct scrapeid count plus the earliest
# checked_time and updated_time recorded for it.
earliest_times_per_jcode_df = sql_to_dataframe(
    conn=connection,
    query=(
        "SELECT jcode, COUNT(DISTINCT scrapeid), "
        "MIN(checked_time) AS earliest_checked_time, "
        "MIN(updated_time) AS earliest_updated_time "
        "FROM sitemap_entries GROUP BY jcode"
    ),
)

In [None]:
# Convert the earliest update timestamps to timezone-aware UTC datetimes.
earliest_times_per_jcode_df["earliest_updated_time"] = pd.to_datetime(
    earliest_times_per_jcode_df["earliest_updated_time"],
    utc=True,
)

# Count how many rows (one per jcode) have their earliest update on each date.
df_counts_by_date = (
    earliest_times_per_jcode_df
    .groupby(earliest_times_per_jcode_df["earliest_updated_time"].dt.date)
    .size()
    .reset_index(name="count")
)

In [None]:
# Turn the grouped dates back into datetimes, drop everything before
# 2024-01-11, and index the counts by date.
df_counts_by_date = (
    df_counts_by_date
    .assign(
        earliest_updated_time=lambda frame: pd.to_datetime(
            frame["earliest_updated_time"]
        )
    )
    .loc[
        lambda frame: frame["earliest_updated_time"] >= pd.to_datetime("2024-01-11")
    ]
    .set_index("earliest_updated_time")
)

In [None]:
# Reindex to a daily frequency; days absent from the index get a count of 0.
df_counts_by_date = df_counts_by_date.asfreq(freq="D", fill_value=0)

In [None]:
# Line chart of the daily count of jcodes whose earliest update falls on that day.
fig, ax = subplots(figsize=(10, 6))
ax.plot(df_counts_by_date.index, df_counts_by_date["count"])

# Start the y-axis at zero so the counts are not visually exaggerated.
ax.set_ylim(bottom=0)
ax.set(
    title="Number of new jobs posted on Civil Service Jobs per day",
    ylabel="Number of new jobs posted",
)

# Leave room under the axes for the caption.
fig.subplots_adjust(bottom=0.15)
fig.text(
    0.5,
    0.01,
    "(Quick, non peer-reviewed visualisation based on unofficial once-daily web scraping. \n"
    " Civil Service Jobs data is available under the Open Government Licence v3.0)",
    ha="center",
)

# Total new jobs added per week

In [None]:
# One row per jcode: its distinct scrapeid count plus the earliest
# checked_time and updated_time recorded for it (input to the weekly totals).
earliest_times_per_jcode_df_for_week = sql_to_dataframe(
    conn=connection,
    query=(
        "SELECT jcode, COUNT(DISTINCT scrapeid), "
        "MIN(checked_time) AS earliest_checked_time, "
        "MIN(updated_time) AS earliest_updated_time "
        "FROM sitemap_entries GROUP BY jcode"
    ),
)

In [None]:
# Convert the earliest update timestamps to timezone-aware UTC datetimes.
earliest_times_per_jcode_df_for_week["earliest_updated_time"] = pd.to_datetime(
    earliest_times_per_jcode_df_for_week["earliest_updated_time"],
    utc=True,
)

# Bucket the rows (one per jcode) into weekly periods, count each bucket, then
# convert the period index back to timestamps before flattening to columns.
df_counts_by_week = (
    earliest_times_per_jcode_df_for_week
    .groupby(
        earliest_times_per_jcode_df_for_week["earliest_updated_time"].dt.to_period("W")
    )
    .size()
    .to_timestamp()
    .reset_index(name="count")
)

In [None]:
# Keep only weeks stamped between 2024-01-15 and 2024-03-24 (both inclusive)
# and index the counts by that week timestamp.
df_counts_by_week = (
    df_counts_by_week
    .assign(
        earliest_updated_time=lambda frame: pd.to_datetime(
            frame["earliest_updated_time"]
        )
    )
    .loc[
        lambda frame: (
            (frame["earliest_updated_time"] >= pd.to_datetime("2024-01-15"))
            & (frame["earliest_updated_time"] <= pd.to_datetime("2024-03-24"))
        )
    ]
    .set_index("earliest_updated_time")
)

In [None]:
# Reindex to a Monday-anchored weekly frequency; weeks absent from the index
# get a count of 0.
df_counts_by_week = df_counts_by_week.asfreq(freq="W-MON", fill_value=0)

In [None]:
# Line chart of the weekly count of jcodes whose earliest update falls in that
# week, with red "+" markers.
fig, ax = subplots(figsize=(14, 6))
ax.plot(
    df_counts_by_week.index,
    df_counts_by_week["count"],
    marker="+",
    markeredgecolor="red",
)

# Start the y-axis at zero so the counts are not visually exaggerated.
ax.set_ylim(bottom=0)
ax.set(
    title="Number of new jobs posted on Civil Service Jobs per week",
    ylabel="Number of new jobs posted",
)

# One tick per plotted week.
ax.set_xticks(df_counts_by_week.index)

# Leave room under the axes for the caption.
fig.subplots_adjust(bottom=0.15)
fig.text(
    0.5,
    0.01,
    "(Quick, non peer-reviewed visualisation based on unofficial once-daily web scraping. \n"
    " Civil Service Jobs data is available under the Open Government Licence v3.0)",
    ha="center",
)

# Jobs live on Wednesdays, for a specific department

In [None]:
# One row per jcode: its distinct scrapeid count plus the earliest
# checked_time and updated_time recorded for it.
# NOTE(review): this rebinds earliest_times_per_jcode_df_for_week with the same
# query used for the weekly totals; confirm the name and query are what the
# per-department section intends.
earliest_times_per_jcode_df_for_week = sql_to_dataframe(
    conn=connection,
    query=(
        "SELECT jcode, COUNT(DISTINCT scrapeid), "
        "MIN(checked_time) AS earliest_checked_time, "
        "MIN(updated_time) AS earliest_updated_time "
        "FROM sitemap_entries GROUP BY jcode"
    ),
)