# Extract Tasks

This notebook uses Amazon Bedrock batch inference to extract a list of tasks associated with
job roles advertised on Civil Service Jobs.


In [1]:
import pandas as pd

# Read the jobs extract and discard roles that have no department.
# The literal string "nan" is treated as a missing value: it is mapped to
# pd.NA so that dropna() removes those rows along with genuine nulls.
jobs_data = pd.read_parquet("./data/jobs.pq")
jobs_data = jobs_data.assign(
    department=jobs_data["department"].replace("nan", pd.NA)
).dropna(subset=["department"])

print("Large Departments")
# The ten departments with the most postings, most frequent first.
# NOTE: the variable name is misspelled ("deparments") but is kept as-is
# because later cells refer to it by this name.
department_counts = jobs_data.value_counts("department")
large_deparments = department_counts.head(10).index.to_list()
print("\n".join(large_deparments))


Large Departments
Ministry of Defence
HM Revenue and Customs
HM Prison & Probation Service
Home Office
Foreign, Commonwealth & Development Office
Ministry of Justice
Cabinet Office
Department for Education
Scottish Government
Department for Work and Pensions


`jobs.pq` contains a recent extract of the jobs posted on Civil Service Jobs. Each job
description contains tasks which can be extracted using an LLM. Each task can also be graded
according to its potential to be automated with technologies such as generative AI. Focussing on the
larger departments, shown above, we can extract a comprehensive list of tasks.

In [2]:
from typing import Dict
from extraction import create_job, skewer_department
import boto3
from dotenv import load_dotenv

# Load environment variables from a .env file before any boto3 session is
# created — presumably this is where the AWS credentials/region come from;
# confirm against the project's setup notes.
load_dotenv()
# Initialise boto3's module-level default session, then create an explicit
# Session object that is handed to the helper functions in later cells.
boto3.setup_default_session()
session = boto3.Session()


In [None]:
# NOTE: batch-job submission is commented out in this cell.
# Un-commenting it submits one job per entry in `large_deparments` via
# `create_job` and collects the returned values in `job_arns`, keyed by the
# kebab-cased department name from `skewer_department`.
# (A later cell hard-codes `job_arns`, presumably from an earlier run — confirm.)
# job_arns: Dict[str,str] = {}
# for department in large_deparments:
#     kebab = skewer_department(department)
#     department_jobs = jobs_data.loc[jobs_data['department'] == department].copy()
#     job_arns.update({kebab: create_job(department_jobs, kebab, "trial-1", session) })




In [3]:
# from pprint import pprint
# pprint(job_arns)

# ARNs of the Bedrock model-invocation (batch) jobs, keyed by the kebab-cased
# department name. Every ARN shares the same region/account prefix and differs
# only in its trailing job id, so the prefix is factored out.
_JOB_ARN_PREFIX = "arn:aws:bedrock:us-east-1:992382722318:model-invocation-job/"
_job_ids_by_department = {
    "cabinet-office": "jmeegk5rpit7",
    "department-for-education": "7jgo6har1v2k",
    "department-for-work-and-pensions": "1d5s3tp7cuje",
    "foreign-commonwealth-development-office": "e4s08jqd6jtt",
    "hm-prison-probation-service": "0et0ecsnl2mr",
    "hm-revenue-and-customs": "zmrn780b7af9",
    "home-office": "qupta2en20yg",
    "ministry-of-defence": "vxub08igmtcv",
    "ministry-of-justice": "133yd2h4amyo",
    "scottish-government": "hy9y2ah1kzx1",
}
job_arns = {
    department_key: _JOB_ARN_PREFIX + job_id
    for department_key, job_id in _job_ids_by_department.items()
}

In [None]:
from llmbo import StructuredBatchInferer
from extraction import TaskOutput


# Re-attach to each previously submitted batch job and pull its results.
# `instances` maps the kebab-cased department name to its recovered inferer.
session = boto3.Session()
instances = {}
for department, job_arn in job_arns.items():
    # Rebuild the inferer from the job ARN, parsing results as TaskOutput.
    sbi = StructuredBatchInferer.recover_structured_job(
        job_arn=job_arn,
        output_model=TaskOutput,
        region="us-east-1",
        session=session,
    )

    # Download first, then load — presumably load reads what was downloaded.
    sbi.download_results()
    sbi.load_results()

    instances[department] = sbi


In [None]:
# Report, per department, how many results were loaded and how many of them
# are populated (truthy) versus failed/empty (falsy).
for department, sbi in instances.items():
    total = len(sbi.instances)
    returned = sum(1 for item in sbi.instances if item)
    failed = total - returned
    # Label each line with the department key. The previous version printed
    # `job_arn`, a stale variable left over from an earlier loop, so every
    # line carried the same (last) ARN instead of the department name.
    print(f"{department} has {total} returns {returned}, fails {failed}")

cabinet-office has 7744 returns 7744, fails 0
department-for-education has 7055 returns 7055, fails 0
department-for-work-and-pensions has 6136 returns 6136, fails 0
foreign-commonwealth-development-office has 12828 returns 12826, fails 2
hm-prison-probation-service has 22930 returns 22930, fails 0
hm-revenue-and-customs has 23233 returns 23232, fails 1
home-office has 17887 returns 17887, fails 0
ministry-of-defence has 41259 returns 41258, fails 1
ministry-of-justice has 7875 returns 7875, fails 0
scottish-government has 6649 returns 6649, fails 0


In [5]:
from extraction import convert_tasks_output_to_dataframe

# Convert each department's loaded task output into a DataFrame, keyed by the
# same department name used in `instances`.
dataframes = {}
for dept_key, inferer in instances.items():
    dataframes[dept_key] = convert_tasks_output_to_dataframe(inferer.instances)

In [12]:
# For every department, write the full task list and a per-vacancy summary
# (count / mean / median / std of the exposure score) to ./for_drive/.
for dept_key, tasks_df in dataframes.items():
    tasks_df.to_csv(f"./for_drive/{dept_key}-list-of-tasks.csv")

    aggregations = ["count", "mean", "median", "std"]
    exposure_by_vacancy = tasks_df.groupby("vacancy_id")["exposure_score"]
    vacancy_summary = exposure_by_vacancy.agg(aggregations)
    vacancy_summary.to_csv(f"./for_drive/{dept_key}-task-summary.csv")

In [14]:
# For each large department, export its job adverts on their own and again
# with the per-vacancy exposure statistics joined on.
for department in large_deparments:
    kebab = skewer_department(department)

    in_department = jobs_data["department"] == department
    department_jobs = jobs_data.loc[in_department].copy()
    department_jobs.to_csv(f"./for_drive/{kebab}-jobs.csv")

    exposure_by_vacancy = dataframes[kebab].groupby("vacancy_id")["exposure_score"]
    stats = exposure_by_vacancy.agg(["count", "mean", "median", "std"])

    # Left join so that jobs without any extracted tasks are still exported.
    jobs_with_stats = department_jobs.merge(stats, on="vacancy_id", how="left")
    jobs_with_stats.to_csv(f"./for_drive/{kebab}-jobs-with-stats.csv")

# Running for one department

Use this section if you want to run the extraction for a single department.

In [None]:
# End-to-end run for a single department: submit the batch job, wait for it,
# load the results, and write the same four CSVs produced for the large
# departments in the cells above.
department = "Department for Business and Trade"
kebab = skewer_department(department)  # computed once; used in every output path
session = boto3.Session()
dept_jobs = jobs_data.loc[jobs_data["department"] == department].copy()

# Submit the job and keep its ARN so that the job recovered below is the one
# just created. Previously the returned ARN was ignored and a hard-coded ARN
# was recovered instead, so the cell submitted one job but polled another.
# To re-attach to an existing job without resubmitting, skip the create_job
# call and set `dept_arn` to the known ARN instead.
dept_arn = create_job(dept_jobs, kebab, "trial-1", session)
sbi = StructuredBatchInferer.recover_structured_job(
    job_arn=dept_arn,
    region="us-east-1",
    output_model=TaskOutput,
    session=session,
)

# Presumably poll_progress() waits for the job to finish — confirm against the
# llmbo documentation. Results are then downloaded and loaded.
sbi.poll_progress()
sbi.download_results()
sbi.load_results()

dept_jobs.to_csv(f"./for_drive/{kebab}-jobs.csv")

df = convert_tasks_output_to_dataframe(sbi.instances)
df.to_csv(f"./for_drive/{kebab}-list-of-tasks.csv")

# Per-vacancy summary of the exposure score.
stats = df.groupby("vacancy_id")["exposure_score"].agg(
    ["count", "mean", "median", "std"]
)
stats.to_csv(f"./for_drive/{kebab}-task-summary.csv")

# Left join so that jobs without any extracted tasks are still exported.
dept_jobs.merge(stats, on="vacancy_id", how="left").to_csv(
    f"./for_drive/{kebab}-jobs-with-stats.csv"
)