In [None]:
import torch
from transformers import BertForSequenceClassification
from dotenv import load_dotenv

from degree_inference.cah_data import CAHData
from degree_inference.predict import predict
from degree_inference.train import train

# Load variables from a .env file into the process environment before the
# BigQuery client library is imported below.
# NOTE(review): presumably this supplies Google Cloud credentials / project
# settings for the %%bigquery cells — confirm against the project's .env.
load_dotenv() 

from google.cloud import bigquery

# Load the extension that provides the %%bigquery cell magic used by the query
# cells later in this notebook.
%load_ext google.cloud.bigquery

In [None]:
# Dataset inspected by the label-frequency plot below: ILR data included,
# GPT inferences and augmentation switched off.
c = CAHData(
    include_ilr=True,
    include_gpt_inferences=False,
    augment=False,
)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Number of rows in the dataset for each distinct label (one entry per label).
counts = c.df['label'].value_counts()

# Histogram over labels: the x axis is "how many examples a label has", the
# y axis is "how many labels have that many examples".
fig, ax = plt.subplots()
ax.hist(counts, bins=15)
ax.set_title("Distribution of training examples")
ax.set_xlabel("# of examples")
ax.set_ylabel("CAH3 codes with this many examples")

# Render the figure explicitly so the cell does not echo the repr of the last
# matplotlib call as its output.
plt.show()

In [None]:
%%bigquery degree_name_to_hecos
-- Distinct (lower-cased degree subject, CAH level-3 code) pairs taken from the
-- qualifications attached to each application choice. Only qualifications with
-- a known degree level and a non-null CAH level-3 code are kept.
SELECT
  LOWER(cq.subject) AS text,
  cq.degree_subject_cah_l3 AS label
FROM
  `rugged-abacus-218110.dataform_ABS_2_dev.application_choice_details`
LEFT JOIN
  UNNEST(candidate_qualifications) AS cq
-- No join to the cah_categories_l3_v2 lookup is needed here: this query reads
-- no column from it, and a LEFT JOIN cannot remove rows, so the de-duplicated
-- result is the same without it.
WHERE
  degree_level IS NOT NULL
  AND degree_level != 'unknown'
  AND degree_subject_cah_l3 IS NOT NULL
-- Grouping by both output columns de-duplicates the pairs.
GROUP BY
  text,
  label

In [None]:
# Fine-tune one `bert-base-uncased` sequence classifier per training-data
# configuration and save each one under ./models/<run name>.
#
# After this cell finishes, `data`, `model` and `trainer` refer to the LAST run
# in TRAINING_RUNS; later cells in this notebook read `data` and `model`.

# (run name, epochs, CAHData keyword arguments). The run name is used both as
# the `comment` passed to `train` and as the directory name under ./models/.
TRAINING_RUNS = [
    ("cah", 30,
     dict(include_ilr=False, augment=False, include_gpt_inferences=False)),
    ("cah-ilr", 16,
     dict(include_ilr=True, augment=False, include_gpt_inferences=False)),
    ("cah-ilr-gpt", 12,
     dict(include_ilr=True, augment=False, include_gpt_inferences=True)),
    ("cah-ilr-gpt-augmented", 12,
     dict(include_ilr=True, augment=True, include_gpt_inferences=True)),
]


def pick_training_device():
    """Return the torch device to train on.

    Prefers Apple MPS (the device this notebook originally hard-coded), then
    CUDA, and finally falls back to the CPU so the cell still runs on machines
    without an accelerator.
    """
    # `torch.backends.mps` does not exist on older torch builds, hence getattr.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")


def train_and_save(run_name, epochs, target_device, **data_options):
    """Build the dataset, fine-tune a fresh BERT classifier on it and save it.

    Args:
        run_name: Passed to `train` as `comment` and used as the save directory
            name (./models/<run_name>).
        epochs: Passed to `train` as `epochs`.
        target_device: torch device the model is moved to before training.
        **data_options: Keyword arguments forwarded unchanged to `CAHData`.

    Returns:
        Tuple `(data, model, trainer)`: the `CAHData` instance, the model that
        was trained and saved, and whatever `train` returned.
    """
    run_data = CAHData(**data_options)
    # One output class per distinct value in the dataset's `label` column.
    run_model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=len(run_data.df['label'].unique()),
    )
    run_model.to(target_device)
    # NOTE(review): assumes `train` places its batches on the model's device
    # rather than on a hard-coded one — confirm in degree_inference.train.
    run_trainer = train(run_model, run_data, epochs=epochs, comment=run_name)
    run_model.save_pretrained(f"./models/{run_name}")
    return run_data, run_model, run_trainer


device = pick_training_device()
# Old name kept so any cell that still refers to `mps_device` keeps working.
mps_device = device

for run_name, run_epochs, run_data_options in TRAINING_RUNS:
    data, model, trainer = train_and_save(
        run_name, run_epochs, device, **run_data_options
    )

In [None]:
# Run the current `model` over every subject string returned by the
# `degree_name_to_hecos` query and display the predictions as the cell output.
out = predict(
    data,
    model,
    list(degree_name_to_hecos['text']),
)
out

In [None]:
# Re-saves the current `model` into ./models/cah-ilr-gpt-augmented, the same
# directory the training cell already writes for its final run.
# NOTE(review): looks redundant unless `model` was changed after training — confirm.
model.save_pretrained("./models/cah-ilr-gpt-augmented")

In [None]:
# Saves the current `model` a second time, into ./cah-ilr-gpt-augmented in the
# working directory rather than under ./models/.
# NOTE(review): confirm this extra copy is intentional.
model.save_pretrained("./cah-ilr-gpt-augmented")

In [None]:
# Show the distinct values of the `label` column in the current dataset; the
# number of these values is what the model cells pass as `num_labels`.
data.df['label'].unique()


In [None]:
%%bigquery unlabelled
-- Random sample of up to 100 distinct degree subjects that have no CAH level-3
-- code, restricted to qualifications with a known degree level and rows whose
-- nationality_group is "British".
SELECT
  cq.subject AS degree_subject,
  cq.degree_subject_cah_l3,
  cah_codes.name AS cah_category_name
FROM
  `rugged-abacus-218110.dataform_ABS_2_dev.application_choice_details`
LEFT JOIN
  UNNEST(candidate_qualifications) AS cq
-- NOTE(review): the WHERE clause keeps only rows where degree_subject_cah_l3
-- IS NULL, so this equality join looks like it can never match and
-- cah_category_name will always be NULL — confirm whether the column is needed.
LEFT JOIN
  `rugged-abacus-218110.dfe_reference_data.cah_categories_l3_v2` AS cah_codes
  ON cah_codes.id = degree_subject_cah_l3
WHERE
  degree_level IS NOT NULL
  AND degree_level != 'unknown'
  AND nationality_group = "British"
  AND degree_subject_cah_l3 IS NULL
-- Grouping by every output column de-duplicates the rows before sampling.
GROUP BY
  degree_subject,
  degree_subject_cah_l3,
  cah_category_name
-- Random ordering: the 100 sampled rows differ on every execution.
ORDER BY RAND()
LIMIT 100


In [None]:
# Inference on the sampled unlabelled degree subjects.

# Restore a fine-tuned checkpoint from disk, with the classification head sized
# to the number of distinct labels in the current `data`.
# NOTE(review): no cell shown in this notebook writes to this directory —
# confirm it exists and matches the label set of `data`.
checkpoint_dir = './30-epoch-gpt2-ilr-augmented-1e-5/'
label_count = len(data.df['label'].unique())
model = BertForSequenceClassification.from_pretrained(
    checkpoint_dir,
    num_labels=label_count,
)
model.to("cpu")

# Predict for every sampled subject and print one prediction per line.
out = predict(data, model, list(unlabelled['degree_subject']))
for row in out:
    print(row)