In [1]:
import glob
import os
import shutil
import tempfile
import zipfile

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Release page that is scraped for the "Download all data (zip)" link.
page = (
    'https://explore-education-statistics.service.gov.uk'
    '/find-statistics/apprenticeships-and-traineeships/2022-23'
)

In [3]:
# Fetch the release page and pull out the href of its "Download all data (zip)" link.
r = requests.get(page, timeout=60)
# Fail here on an HTTP error instead of parsing an error page and
# reporting a confusing "link not found" further down.
r.raise_for_status()
# Name the parser explicitly: with no parser argument BeautifulSoup picks
# whichever one happens to be installed and emits a warning.
soup = BeautifulSoup(r.text, 'html.parser')
zip_anchor = soup.find('a', string='Download all data (zip)')
if zip_anchor is None:
    # soup.find returns None when nothing matches; indexing that would
    # raise an opaque TypeError, so report what was actually missing.
    raise RuntimeError(f'No "Download all data (zip)" link found on {page}')
zip_link = zip_anchor['href']
zip_link

'https://content.explore-education-statistics.service.gov.uk/api/releases/576fdd7a-e24f-4e71-9c84-050f23ac4a23/files'

In [4]:
# Absolute path of the cached download, so it stays valid even if the
# working directory changes later in the session.
local_zip = os.path.abspath(os.path.join('..', 'working', 'data.zip'))

In [5]:
# Download the archive once; later runs reuse the cached copy at local_zip.
if not os.path.exists(local_zip):
    r = requests.get(zip_link, timeout=300)
    # Without this check an HTTP error body would be written to disk as the
    # "zip" and, because of the exists() guard above, reused on every later run.
    r.raise_for_status()
    # Create the cache directory if this is a fresh checkout.
    os.makedirs(os.path.dirname(local_zip), exist_ok=True)
    with open(local_zip, 'wb') as f:
        f.write(r.content)

In [6]:
# Unpack the cached archive into a throwaway directory, print what it
# contains, and load the one CSV this notebook uses into `data`.
# The extracted files are deleted when the outer `with` block exits.
with tempfile.TemporaryDirectory() as tmp:
    current_dir = os.getcwd()
    try:
        # Work inside the temp dir so the listing and the CSV path below are
        # relative to the archive root.
        os.chdir(tmp)
        # Bound as `archive`, not `zip`: names bound at cell level are kernel
        # globals, so `zip` would shadow the builtin for every later cell.
        with zipfile.ZipFile(local_zip, 'r') as archive:
            archive.extractall('.')
        print('\n'.join(glob.glob('**', recursive=True)))
        data = pd.read_csv('data/app-starts-since-202223-q3.csv')
    finally:
        # Always restore the kernel's working directory, even if extraction
        # or parsing fails.
        os.chdir(current_dir)

data-guidance
data-guidance/data-guidance.txt
data
data/app-pubsec-summary-2023.csv
data/app-tship-demographics-full-year-202122-q4.csv
data/app-redundancies-202223-jul.csv
data/app-latest-summary-full-year-202223-q3.csv
data/app-starts-since-202223-q3.csv
data/apps_narts_learner_detailed.csv
data/app-learner-detailed-202223-q3.csv
data/apps-geography-population-2223-q3.csv
data/app-learner-deprivation-part-202223-q3.csv
data/app-tship-start-comp-conv-2223-q2.csv
data/app-tship-demographics-202223-q3.csv
data/apps_narts_subject_and_level_detailed.csv
data/app-duration-staylength-emplength-202122-q4.csv
data/app-vacancies-adverts-2023-jul-20230710.csv
data/apps_narts_deprivation.csv
data/apps_narts_provider_level_fwk_std.csv
data/app-tship-provider-202223-q3.csv
data/app-learner-lldd-202223-q3.csv
data/app-learner-deprivation-starts-202223-q3.csv
data/apps_narts_provider_type.csv
data/app-geography-detailed-202223-q3.csv
data/app-latest-summary-202223-q3.csv
data/data-for-graphs-2122-20

In [7]:
# Keep only the 'Local authority' rows, drop the geography columns that are
# not needed, rename the LA columns to geo_code / geo_name, and add a `date`
# column derived from time_period.
filtered = (
  data
    .loc[data.geographic_level == 'Local authority', :]
    .drop(columns=['pcon_code', 'pcon_name', 'old_la_code', 'geographic_level'])
    .rename(columns={'new_la_code': 'geo_code', 'la_name': 'geo_name'})
    # The first four characters of time_period are parsed as a year in one
    # vectorised call (instead of one pd.to_datetime call per row), giving
    # 1 January of that year; the offset then moves it to 31 August of the
    # following year.
    # NOTE(review): if the intent is the end of an academic year (31 July),
    # months should be 7 rather than 8 -- confirm before changing.
    .assign(
        date=lambda df: pd.to_datetime(
            df.time_period.astype(str).str.slice(0, 4), format='%Y'
        ) + pd.DateOffset(months=8, years=1, days=-1)
    )
)
# Long format: one row per (geo_code, date), with the column name 'starts'
# in `variable` and its number in `value`.
export = (
  filtered
    .loc[:, ['date', 'geo_code', 'starts']]
    .melt(id_vars=['geo_code', 'date'])
)

In [8]:
# Single-file copy of the export.
export.to_parquet('../static/data.parquet')

# Partitioned copy, split into one sub-directory per geo_code value.
# Clear any previous output first: with the pyarrow engine a partitioned
# write adds new, uniquely named files to an existing dataset directory
# instead of replacing it, so re-running this cell would otherwise
# accumulate duplicate rows and keep partitions that no longer exist.
partitioned_dir = '../static/data'
if os.path.isdir(partitioned_dir):
    shutil.rmtree(partitioned_dir)
export.to_parquet(partitioned_dir, partition_cols=['geo_code'])

In [9]:
# Show the column dtypes of the exported frame.
export.dtypes

geo_code            object
date        datetime64[ns]
variable            object
value                int64
dtype: object