In [17]:
import pandas as pd
from openai import AsyncOpenAI
from dotenv import load_dotenv
import asyncio

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API key picked up by AsyncOpenAI() in later cells — confirm.
load_dotenv()

True

In [None]:
import json

def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Each non-empty line is parsed as one JSON document. Blank or
    whitespace-only lines (e.g. stray empty lines at the end of the file)
    are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        path: Path of the JSONL file to read (opened as UTF-8 text).

    Returns:
        list: One parsed object per non-empty line, in file order.
    """
    records = []
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records


# Path to Grammatical Range and Accuracy JSONL file
# NOTE(review): absolute, machine-specific path; the file has a .json
# extension but is read line by line as JSONL.
file_path = '/home/ducanh/nvidia-llm-pipeline/unslot/ft_qwen/outputs/gap.json'
gra_data = load_jsonl(file_path)

# Path to Lexical Resource JSONL file
file_path = 'ielts_speech_ft_preds/peft_prediction_lr_llama_070525.jsonl'
lr_data = load_jsonl(file_path)


In [4]:
# Linear Regression
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import pandas as pd

def get_linest(y_values_poly, x_values_poly, degree=1):
    """Fit a least-squares polynomial that maps x values onto y values.

    Note the argument order: the targets (y) come first, the predictors (x)
    second.

    Args:
        y_values_poly: Sequence of target values the fit should reproduce.
        x_values_poly: Sequence of predictor values, one per target value.
        degree: Degree of the fitted polynomial. Defaults to 1 (a straight
            line), which is the value that used to be hard-coded here.

    Returns:
        tuple: ``(coef_, intercept_)`` of the fitted ``LinearRegression``.
        ``coef_`` holds one coefficient per polynomial feature, ordered by
        increasing power of x. The entry for power 0 is 0 because
        ``PolynomialFeatures`` adds a constant column while the regression
        also fits its own intercept; the constant term is therefore carried
        by ``intercept_``.
    """
    # Convert to NumPy arrays; sklearn expects a 2-D (n_samples, 1) feature matrix
    X_poly = np.array(x_values_poly).reshape(-1, 1)
    Y_poly = np.array(y_values_poly)

    # Expand x into the columns [1, x, x**2, ..., x**degree]
    poly = PolynomialFeatures(degree=degree)
    X_poly_transformed = poly.fit_transform(X_poly)

    # Ordinary least squares on the expanded features
    model_poly = LinearRegression()
    model_poly.fit(X_poly_transformed, Y_poly)

    return model_poly.coef_, model_poly.intercept_

def get_mapping(coefs, intercept, x_values):
    """Apply a fitted polynomial to each x value and snap it to a 1-9 score.

    Args:
        coefs: Polynomial coefficients ordered by increasing power
            (``coefs[k]`` multiplies ``x**k``).
        intercept: Constant added to the polynomial value.
        x_values: Iterable of raw scores to map.

    Returns:
        list: For every x value, the polynomial result rounded with the
        built-in ``round`` and clamped to the inclusive range 1..9.
    """
    mapped_scores = []
    for x_val in x_values:
        # Evaluate intercept + sum_k coefs[k] * x**k
        fitted = intercept + sum(coef * x_val ** power for power, coef in enumerate(coefs))
        rounded = round(fitted)
        # Clamp to the valid band range
        mapped_scores.append(max(1, min(9, rounded)))
    return mapped_scores

In [8]:
def _build_score_schema(schema_name, field, description):
    """Return the strict JSON schema describing a single integer score field."""
    return {
        "name": schema_name,
        "schema": {
            "type": "object",
            "properties": {
                field: {
                    "type": "integer",
                    "description": description,
                },
            },
            "required": [field],
            "additionalProperties": False,
        },
        "strict": True,
    }


async def _extract_score(input_text, schema_name, field, description, client=None):
    """Ask the chat model to convert free-text output into one integer score.

    Args:
        input_text: Text containing the score (sent as the user message).
        schema_name: Name of the JSON schema passed as ``response_format``.
        field: Key of the integer property to request and return.
        description: Description attached to that property in the schema.
        client: Optional client exposing ``chat.completions.create``. When
            omitted, an ``AsyncOpenAI`` client is created for this call and
            closed afterwards so its connections are not leaked.

    Returns:
        The value stored under ``field`` in the model's JSON reply.

    Raises:
        ValueError: If the reply carries no content (for example a refusal).
    """
    owns_client = client is None
    if owns_client:
        client = AsyncOpenAI()
    try:
        response_score = await client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role":"system", "content":"Convert the score into JSON."},{"role":"user", "content":input_text}],
            temperature=0,
            top_p=1,
            max_tokens=4096,
            response_format={"type": "json_schema", "json_schema": _build_score_schema(schema_name, field, description)},
        )
    finally:
        # Only close clients created here; a caller-supplied client stays open for reuse
        if owns_client:
            await client.close()

    message = response_score.choices[0].message
    if message.content is None:
        # json.loads(None) would raise an opaque TypeError; surface the reason instead
        raise ValueError(f"Model returned no content for '{field}': {getattr(message, 'refusal', None)}")
    return json.loads(message.content)[field]


async def get_gra_score_pred(input_text, client=None):
    """Extract the Grammatical Range and Accuracy score from prediction text.

    Args:
        input_text: Prediction text that contains the score.
        client: Optional shared client; when omitted a temporary
            ``AsyncOpenAI`` client is created and closed for this call.

    Returns:
        The ``grammatical_range_and_accuracy`` value from the model's JSON reply.

    Raises:
        ValueError: If the model reply has no content.
    """
    return await _extract_score(
        input_text,
        "grammatical_range_and_accuracy_score",
        "grammatical_range_and_accuracy",
        "The overall score representing the grammatical range and accuracy.",
        client=client,
    )


async def get_lr_score_pred(input_text, client=None):
    """Extract the Lexical Resource score from prediction text.

    Args:
        input_text: Prediction text that contains the score.
        client: Optional shared client; when omitted a temporary
            ``AsyncOpenAI`` client is created and closed for this call.

    Returns:
        The ``lexical_resource`` value from the model's JSON reply.

    Raises:
        ValueError: If the model reply has no content.
    """
    return await _extract_score(
        input_text,
        "lexical_resource_score",
        "lexical_resource",
        "The overall score representing the lexical resource.",
        client=client,
    )

In [None]:
# Assuming the JSONL file has 'input', 'label', and 'prediction' fields
gra_input = [gra_item['input'] for gra_item in gra_data]
gra_target = [float(gra_item['label'].split(" ")[-1]) for gra_item in gra_data]
# Getting score from prediction string using OpenAI, use own prediction extraction method if available
gra_pred_tasks = [get_gra_score_pred(gra_item['prediction']) for gra_item in gra_data]
gra_preds = await asyncio.gather(*gra_pred_tasks)
# Prediction score extraction end
gra_linest = get_linest(gra_target, gra_preds)

lr_input = [lr_item['input'] for lr_item in lr_data]
lr_target = [float(lr_item['label'].split(" ")[-1]) for lr_item in lr_data]
# Getting score from prediction string using OpenAI, use own prediction extraction method if available
lr_pred_tasks = [get_lr_score_pred(lr_item['prediction']) for lr_item in lr_data]
lr_preds = await asyncio.gather(*lr_pred_tasks)
# Prediction score extraction end
lr_linest = get_linest(lr_target, lr_preds)

gra_df = pd.DataFrame({
    "input": gra_input,
    "target": gra_target,
    "pred": gra_preds,
    "acc":[item_target-1<=item<=item_target+1 for item_target,item in zip(gra_target,gra_preds)],
    "pred_mapped": get_mapping(gra_linest[0], gra_linest[1], gra_preds),
    "acc_mapped": [item_target-1<=item<=item_target+1 for item_target,item in zip(gra_target,get_mapping(gra_linest[0], gra_linest[1], gra_preds))],
})

lr_df = pd.DataFrame({
    "input": lr_input,
    "target": lr_target,
    "pred": lr_preds,
    "acc":[item_target-1<=item<=item_target+1 for item_target,item in zip(lr_target,lr_preds)],
    "pred_mapped": get_mapping(lr_linest[0], lr_linest[1], lr_preds),
    "acc_mapped": [item_target-1<=item<=item_target+1 for item_target,item in zip(lr_target,get_mapping(lr_linest[0], lr_linest[1], lr_preds))],
})

In [None]:
# Fitted mapping parameters (coefficients, intercept) for each rubric
gra_coefs, gra_intercept = gra_linest[0], gra_linest[1]
lr_coefs, lr_intercept = lr_linest[0], lr_linest[1]
print(f"Grammatical Range and Accuracy mapping coef and intercept:\nCoefficients: {gra_coefs}\nIntercept: {gra_intercept}")
print()
print(f"Lexical Resource mapping coef and intercept:\nCoefficients: {lr_coefs}\nIntercept: {lr_intercept}")

# Accuracy = accurate predictions / total predictions, as a percentage with 2 decimals
gra_unmapped_pct = round(gra_df['acc'].sum()/gra_df['acc'].count()*100,2)
gra_mapped_pct = round(gra_df['acc_mapped'].sum()/gra_df['acc_mapped'].count()*100,2)
lr_unmapped_pct = round(lr_df['acc'].sum()/lr_df['acc'].count()*100,2)
lr_mapped_pct = round(lr_df['acc_mapped'].sum()/lr_df['acc_mapped'].count()*100,2)
print(f"Grammatical Range and Accuracy rubric accuracy:\nUnmapped: {gra_unmapped_pct}%\nMapped: {gra_mapped_pct}%")
print()
print(f"Lexical Resource rubric accuracy:\nUnmapped: {lr_unmapped_pct}%\nMapped: {lr_mapped_pct}%")

Grammatical Range and Accuracy mapping coef and intercept:
Coefficients: [0.         0.72480106]
Intercept: 3.217506631299734

Lexical Resource mapping coef and intercept:
Coefficients: [0.        1.0825701]
Intercept: 2.782020572072704


Grammatical Range and Accuracy rubric accuracy:
Unmapped: 31.48%
Mapped: 64.81%

Lexical Resource rubric accuracy:
Unmapped: 12.96%
Mapped: 66.67%


In [None]:
# Write each rubric's evaluation table to its own Excel workbook
for result_df, excel_path in (
    (gra_df, "ielts_speech_ft_preds/GRA evaluation result.xlsx"),
    (lr_df, "ielts_speech_ft_preds/LR evaluation result.xlsx"),
):
    result_df.to_excel(excel_path)