#### Calibrate decision thresholds for company classification

The following notebook is designed to provide a quick way to run precision metrics on the classified DataFrames output by the OpenAI batch processing. 
The high-level implementation follows this logic: 
1. Merge the binary and probabilistic results
2. Apply a boolean mask to the binary classification 'answer' column
3. Transform the 'Probability' and 'answer' columns to numpy arrays y_true and y_pred
4. Visualise the PR curve to identify consistencies in the model's decision boundary 

In other words, we aim to visualise how well the model's binary classification boundaries align with its probabilistic prediction confidence. This should improve the overall performance of the model when running real test data by ensuring that the cut-off threshold is set to minimise the actual decision uncertainty. 


In [None]:
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dotenv import load_dotenv

# The star import stays ahead of the sklearn import (as in the original order)
# so that the explicitly imported sklearn names below take precedence over any
# same-named objects the star import may bring in.
from precision_recall import *
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    confusion_matrix,
    precision_recall_curve,
    precision_score,
    recall_score,
)

In [None]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Data directory comes from the DATA_DIR environment variable so that no
# machine-specific path is hard-coded in the notebook.
DATA_PATH = os.getenv("DATA_DIR")
if DATA_PATH is None:
    # Fail here with an actionable message instead of the opaque TypeError
    # that os.path.join(None, ...) would raise on the next line.
    raise RuntimeError(
        "Environment variable DATA_DIR is not set; "
        "define it in the shell or in a .env file before running this notebook."
    )

# CSV read into `df_prob` and Excel workbook read into `df_binary` further down.
file_path_1 = os.path.join(DATA_PATH, 'Electronics_batch_P_test.csv')
file_path_2 = os.path.join(DATA_PATH, 'Electronics_batch_test2.xlsx')


In [None]:
# Binary ('answer') results. The file is an .xlsx workbook, so it has to be
# read with read_excel; read_csv cannot parse the Excel container format.
df_binary = pd.read_excel(file_path_2)

# Probabilistic results (CSV) carrying the 'Probability' column.
df_prob = pd.read_csv(file_path_1)

# Attach each organisation's probability to its binary answer.
# how='left' keeps every row of the binary frame; validate='many_to_one'
# raises if an org_ID occurs more than once in the probability frame, which
# would otherwise silently duplicate rows and skew every metric downstream.
merged_df = pd.merge(
    df_binary,
    df_prob[['org_ID', 'Probability']],
    on='org_ID',
    how='left',
    validate='many_to_one',
)

# Keep only the columns used for calibration, scores and labels first.
merged_df = merged_df[
    [
        'Probability',
        'answer',
        'org_ID',
        'organisation_name',
        'description',
        'short_description',
        'y_true',
    ]
]


In [None]:
# Read the workbook at `file_path_2` and bind it to `merged_df`.
# NOTE(review): this rebinds `merged_df`, so any frame built under that name
# in an earlier cell is replaced here — confirm this is intended.
merged_df = pd.read_excel(file_path_2)

# Pull the 'y_true' and 'answer' columns out as NumPy arrays.
y_true, y_pred = (
    merged_df[column].to_numpy() for column in ('y_true', 'answer')
)

In [None]:
label_map = {'no': 0, 'yes': 1}

# Encode from the source column instead of from `y_pred` itself: re-encoding
# `y_pred` in place made the cell fail with KeyError when run a second time,
# because the array then already held 0/1 instead of 'no'/'yes'.
encoded_answers = merged_df['answer'].map(label_map)
if encoded_answers.isna().any():
    # .map() turns unknown labels into NaN; surface them instead of letting
    # NaN leak into the metrics.
    unexpected = list(merged_df.loc[encoded_answers.isna(), 'answer'].unique())
    raise ValueError(f"Unexpected values in 'answer' column: {unexpected}")
y_pred = encoded_answers.to_numpy(dtype=int)

# NOTE(review): precision_recall_curve derives its thresholds from the distinct
# values of the second argument, so hard 0/1 predictions give at most two
# thresholds. If a full threshold sweep is wanted, a continuous score (e.g. a
# probability column) would have to be passed here — confirm intent.
precision, recall, thresholds = precision_recall_curve(y_true, y_pred)

# Plot the PR curve (precision vs recall)
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(recall, precision, marker='.', label='PR curve')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
# Sample size is taken from the data so the title cannot go stale.
ax.set_title(f'PR curve electronics n={len(y_true)}')
ax.legend()
ax.grid(True, linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Recompute the PR curve so this cell does not rely on the plotting cell.
precision, recall, thresholds = precision_recall_curve(y_true, y_pred)

# F1 = harmonic mean of precision and recall at every curve point; the small
# epsilon in the denominator avoids division by zero when both are 0.
f1_numerator = 2 * (precision * recall)
f1_denominator = precision + recall + 1e-8
f1_scores = f1_numerator / f1_denominator

# Index of the curve point with the highest F1 and its threshold.
best_idx = f1_scores.argmax()
best_threshold = thresholds[best_idx]

print(
    f"optimal threshold (F1): {best_threshold:.3f}",
    f"Precision: {precision[best_idx]:.3f}",
    f"Recall:    {recall[best_idx]:.3f}",
    f"F1:        {f1_scores[best_idx]:.3f}",
    sep="\n",
)

In [None]:
# Single source of truth for the cut-off, used for both the binarisation and
# the figure title so the two cannot drift apart.
DECISION_THRESHOLD = 0.5

# Binarise the predictions at the chosen cut-off.
y_pred_binary = (y_pred >= DECISION_THRESHOLD).astype(int)

# Compute confusion matrix.
# labels=[0, 1] pins the matrix to 2x2 even when one class is absent from the
# data; otherwise the result could be 1x1 and would not match the two display
# labels below.
cm = confusion_matrix(y_true, y_pred_binary, labels=[0, 1])

# Display confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Negative", "Positive"])
disp.plot(cmap="Blues")
plt.title(f"Confusion Matrix (threshold={DECISION_THRESHOLD})")
plt.show()

In [None]:
# The result of `precision_recall` is handed to ConfusionMatrixDisplay below.
# NOTE(review): this presumes the helper returns a confusion matrix — confirm
# against its definition in the `precision_recall` module.
cm = precision_recall(y_true, y_pred)

# Build the display and draw it in one expression; .plot() hands back the
# display object, so `disp` is bound to the same object as before.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=["Negative", "Positive"],
).plot(cmap="Blues")
plt.title("Confusion Matrix")
plt.show()

##### Generate a validation dataset composed of multiple ecosystems

The following code aims to achieve the following for each ecosystem: 
1. Construct an unbalanced labelled dataset containing companies in agrifood, tourism, and textiles
2. Dummy code each ecosystem in the 'source' column to generate the y_true array (according to each ecosystem classification run)
3. Boolean transform on the 'answer' column from 'yes' / 'no' to 1 / 0 to generate the y_pred array
4. Compute precision metrics on the arrays 
5. Generate a visualisation as a TP, FP, FN, TN bar plot

In [None]:
# NOTE(review): `file_path` is not assigned in any cell visible in this
# notebook (only `file_path_1` / `file_path_2` are) — make sure it is defined
# before this cell runs, otherwise a fresh-kernel run stops here.

# Open the workbook in a context manager so the file handle is released as soon
# as every sheet has been parsed, instead of staying open for the kernel's
# lifetime.
with pd.ExcelFile(file_path) as xls:
    # One DataFrame per worksheet, keyed by sheet name.
    dfs = {sheet_name: xls.parse(sheet_name) for sheet_name in xls.sheet_names}

# Convenience handles for the three ecosystem sheets.
agri_df = dfs['Agrifood']
tourism_df = dfs['Tourism']
textile_df = dfs['Textiles']

In [None]:
# For each classification run (outer key), the 0/1 ground-truth code assigned
# to every value of the 'source' column: 1 for the run's own ecosystem, 0 for
# the others.
industry_dummy = {
    'Agrifood': {'Agrifood': 1, 'Tourism': 0, 'Textiles': 0},
    'Tourism':  {'Agrifood': 0, 'Tourism': 1, 'Textiles': 0},
    'Textiles': {'Agrifood': 0, 'Tourism': 0, 'Textiles': 1}
}

label_map_pred = {'no': 0, 'yes': 1}

# Precision per sector, filled inside the loop (previously left empty).
precision_scores = {}

for industry in ['Agrifood', 'Tourism', 'Textiles']:
    label_df = dfs[industry][['org_ID', 'answer', 'source']]

    # Map predictions and ground truth to 0/1.
    encoded_pred = label_df['answer'].map(label_map_pred)
    encoded_true = label_df['source'].map(industry_dummy[industry])

    # .map() yields NaN for any value missing from the mapping; stop with a
    # clear message rather than passing NaN on to the metric functions.
    if encoded_pred.isna().any() or encoded_true.isna().any():
        bad_answers = list(label_df.loc[encoded_pred.isna(), 'answer'].unique())
        bad_sources = list(label_df.loc[encoded_true.isna(), 'source'].unique())
        raise ValueError(
            f"Sheet '{industry}' contains unmapped values: "
            f"answer={bad_answers}, source={bad_sources}"
        )

    y_pred = encoded_pred.to_numpy(dtype=int)
    y_true = encoded_true.to_numpy(dtype=int)

    print(f"\nSector: {industry}")
    precision_recall(y_true, y_pred)

    # zero_division=0 reports 0 precision (without a warning) for a sector in
    # which the model never answered 'yes'.
    precision_scores[industry] = precision_score(y_true, y_pred, zero_division=0)

    

In [None]:

# Works on whatever `y_true` / `y_pred` arrays are currently bound in the kernel.
actual_pos, actual_neg = (y_true == 1), (y_true == 0)
predicted_pos, predicted_neg = (y_pred == 1), (y_pred == 0)

# One boolean mask per confusion-matrix cell.
tp = actual_pos & predicted_pos
fn = actual_pos & predicted_neg
fp = actual_neg & predicted_pos
tn = actual_neg & predicted_neg

# Tag every sample with its two-letter outcome code; samples matching none of
# the masks keep the empty string.
outcomes = np.zeros_like(y_true, dtype='<U2')
for tag, mask in (('TP', tp), ('FN', fn), ('FP', fp), ('TN', tn)):
    outcomes[mask] = tag

# Tally the tags; a Counter returns 0 for a tag that never occurs.
counts = Counter(outcomes)
labels = ['TP', 'FP', 'FN', 'TN']
values = [counts[label] for label in labels]

fig, ax = plt.subplots(figsize=(7, 5))
ax.bar(labels, values, color=['green', 'orange', 'red', 'blue'])
ax.set_title('Model prediction outcome')
ax.set_ylabel('Count')
plt.show()
