## Rate with LLM

In [None]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.ndimage import gaussian_filter
from scipy.stats import pearsonr

In [None]:
from langchain.schema import HumanMessage, SystemMessage
from langchain_ollama.llms import OllamaLLM

In [None]:
# Load the edited Mohler dataset CSV and preview its first five rows.
m0 = pd.read_csv(filepath_or_buffer="./mohler_dataset_edited.csv")
display(m0.head(n=5))

In [None]:
# System message sent with every grading request: it defines the grader role
# and the 0-to-1 grading tiers the model is asked to follow.
system_prompt = """
You are an automated grader specialized in evaluating student responses. You follow a grading scale from 0 to 1.  

General Grading Scale:  
1. **Score = 1.0**: All **essential concepts** (EC) are present and clearly explained, no major errors, and the answer is complete.  
2. **Score of 0.8-0.9**: The ECs are present, but the answer has slight inaccuracies or minor omissions.  
3. **Score of 0.6-0.7**: Some ECs are missing or ambiguously formulated, showing general understanding but incompleteness.  
4. **Score of 0.3-0.5**: The answer is partial or unclear, with several ECs missing or poorly addressed.  
5. **Score of 0.1-0.2**: Largely off-topic or very incomplete, with only a small correct element.  
6. **Score = 0**: No answer or entirely off-topic.  

You may apply **bonuses** for clarity, precision, or mentioning additional important concepts and **penalties** for serious confusion or contradictions.  

At the end, you must **assign a score from 0 to 1** based on the grading scale.
"""

In [None]:
# User message template. It is filled with str.format() and expects three
# placeholders: {example_input}, {example_output} and {current_input}.
# The model is asked to answer with a dictionary wrapped in <out></out>.
user_prompt_template = """
User (or Instruction)  
Task:  
1. Read the **question** posed to the student.  
2. Review the **expected answer** (i.e., the reference solution).  
3. Examine the **student's response**.  
4. Assign a grade (a floating-point number between 0 and 1) according to the **rules** and **grading tiers** above.  

### Elements to Evaluate  
- Presence of **essential concepts** (mention them if possible)  
- Accuracy and clarity of the formulation  
- Any confusion or errors  
- The relevance of any secondary concepts (bonus or penalty)  

### Example Input Format  
<in>  
    {example_input} 
</in>  

### Expected Final Output Format  
<out>  
    {example_output} 
</out>  

Ensure that the expected response is enclosed between <out></out> and is a dictionary where the keys follow the format `"student x grade"`, where `"x"` is the student's number and the value is the grade you gave to him. Do not add any additional comments to the output. Only return:  
<out> [YOUR RESPONSE HERE] </out>  

Here is the current input:  
<in>  
{current_input}
</in>  
"""

In [None]:
# Two-student example substituted into the {example_input} placeholder of the
# user prompt template to illustrate the input layout.
example_input ="""
    Student 1:  
    - **Question**: [What is the role of a prototype program in problem solving?]  
    - **Expected answer**: [To address major issues in the creation of the program. There is no way to account for all possible bugs in the program, but it is possible to prove the program is tangible.]  
    - **Student's response**: [To simulate the behaviour of portions of the desired software product.]  

    Student 2:  
    - **Question**: [What is a pointer?]
    - **Expected answer**: [The address of a location in memory.] 
    - **Student's response**: [A pointer is a variable that points to the address location of another variable.  Represented by (*).]"""

In [None]:
# Example reply substituted into the {example_output} placeholder. The keys use
# the "student x grade" wording that the user prompt template requires, so the
# example no longer contradicts the instruction it illustrates.
example_output = """{"student 1 grade": 0.5, "student 2 grade": 1}"""

In [None]:
# Grader model served by Ollama ("llama3.1").
# NOTE(review): OllamaLLM is LangChain's text-completion wrapper; its invoke()
# is expected to return a plain string rather than a message object - confirm
# against the code that parses the responses.
llm_llama = OllamaLLM(model="llama3.1")

In [None]:
# Build one prompt entry per dataset row, numbered from 1, together with the
# reference score of that row.
# NOTE: rows containing "#NAME?" are not filtered out (no such filter is active).
etudiants = []
expected_scores = []
counter = 0
for row in range(len(m0)):
    question = m0.loc[row, "question"]
    reference = m0.loc[row, "desired_answer"]
    answer = m0.loc[row, "student_answer"]
    entry = f"""
    Etudiant {row + 1}:
    - **Question** : [{question}]
    - **Réponse attendue** : [{reference}]
    - **Réponse de l’étudiant** : [{answer}]
    """
    etudiants.append(entry)
    expected_scores.append(m0.loc[row, 'score_avg'])
    counter += 1

In [None]:
# Split the students into consecutive batches of 10. For each batch keep the
# chat messages, the formatted user prompt and the matching reference scores.
prompts = []
scores_groups = []
messages_group = []
for start in range(0, len(etudiants), 10):
    stop = start + 10
    candidates = "\n".join(etudiants[start:stop])
    user_prompt = user_prompt_template.format(
        example_input=example_input,
        example_output=example_output,
        current_input=candidates,
    )
    prompts.append(user_prompt)
    scores_groups.append(expected_scores[start:stop])
    messages_group.append([
        SystemMessage(content=system_prompt),
        HumanMessage(content=user_prompt),
    ])

In [None]:
import pickle

In [None]:
# Set `run` to True to query the model and (re)write the cache file;
# set it to False to reload previously saved responses from that file.
run = True
if run:
    # Responses are keyed by 1-based batch number, in the order of messages_group.
    responses_llama = {
        batch_number: llm_llama.invoke(messages)
        for batch_number, messages in enumerate(messages_group, start=1)
    }
    with open("mohler_llama_grade.pkl", "wb") as file:
        pickle.dump(responses_llama, file)
else:
    # NOTE: unpickling can execute arbitrary code; only load a cache file that
    # was produced by this notebook.
    with open("mohler_llama_grade.pkl", "rb") as file:
        responses_llama = pickle.load(file)

In [None]:
import re

def extract_number(text):
    """Return the first run of digits found in ``text`` as an int, or None."""
    found = re.search(r'\d+', text)
    if found is None:
        return None
    return int(found.group(0))

def extract_brace_content(text):
    """Return the first ``{...}`` span of ``text``, braces included.

    The span runs from the first ``{`` to the first ``}`` that follows it and
    may cover several lines.

    Args:
        text: String to search.

    Returns:
        The matched substring, or None when ``text`` has no ``{`` or no ``}``
        after it.
    """
    start_pos = text.find("{")
    if start_pos == -1:
        return None
    # Search for the closing brace only after the opening one, so a stray "}"
    # earlier in the text cannot produce an empty or inverted slice.
    end_pos = text.find("}", start_pos)
    if end_pos == -1:
        return None
    return text[start_pos:end_pos + 1]

In [None]:
def extract_grades(data):
    """Parse the raw model replies into per-batch and per-student grades.

    Args:
        data: Mapping from 1-based batch number (1..len(data)) to the model
            reply for that batch. A reply may be a plain string or an object
            exposing its text through a ``content`` attribute.

    Returns:
        A dict with three entries:
        ``"llm_score"``: one list of grades per successfully parsed batch,
        ordered by student number;
        ``"students_llm_score"``: dict mapping student number to grade;
        ``"failed_indexes"``: batch numbers whose reply could not be parsed.
    """
    llm_scores = []
    students_llm_scores = {}
    failed_index = []

    for batch_number in range(1, len(data) + 1):
        try:
            response = data[batch_number]
            # Completion models return a str, chat models a message with .content.
            response_text = getattr(response, "content", response)
            for token in ('```json', '```', '<out>', '</out>', '\\'):
                response_text = response_text.replace(token, '')

            response_dict = json.loads(extract_brace_content(response_text))

            # Order by the numeric student id so "student 10" follows
            # "student 2" instead of preceding it lexicographically.
            graded = sorted(
                ((extract_number(key), grade) for key, grade in response_dict.items()),
                key=lambda pair: pair[0],
            )
        except (KeyError, IndexError, AttributeError, TypeError, ValueError):
            # Missing batch, unparsable JSON, non-dict payload or a key
            # without a student number: record the batch and move on.
            failed_index.append(batch_number)
            continue

        # Only commit a batch once it has been parsed completely.
        students_llm_scores.update(graded)
        llm_scores.append([grade for _, grade in graded])

    return {
        "llm_score": llm_scores,
        "students_llm_score": students_llm_scores,
        "failed_indexes": failed_index,
    }


In [None]:

# Parse each model's replies, align the grades with the dataset rows and report
# the correlation with the reference scores.
models_responses = {'llama': responses_llama}
models_students_scores = {}
reference_scores = np.concatenate(scores_groups)
for model_name in models_responses:
    out_data = extract_grades(models_responses[model_name])
    students_llm_scores: dict = out_data['students_llm_score']
    failed_indexes = out_data['failed_indexes']

    print(f"\n----- Model: {model_name} -----\n")
    print("Failed indexes : [", *failed_indexes, "]")

    # Place every grade at the row of its student number (student k -> row k-1)
    # so a missing or failed reply leaves NaN instead of shifting the grades
    # that follow or producing an array of the wrong length.
    students_scores = np.full(len(reference_scores), np.nan)
    for student_id, grade in students_llm_scores.items():
        if not isinstance(student_id, int) or not 1 <= student_id <= len(students_scores):
            continue
        try:
            students_scores[student_id - 1] = float(grade)
        except (TypeError, ValueError):
            continue  # non-numeric grade: keep NaN for this student
    models_students_scores[model_name] = students_scores

    graded = ~np.isnan(students_scores)
    print(f"Graded students : {graded.sum()} / {len(students_scores)}")
    if graded.sum() >= 2:
        print("Correlation: ", pearsonr(students_scores[graded], reference_scores[graded]))
    else:
        print("Correlation:  not enough graded students")
    


In [None]:
def rmse(y_pred, y_true):
    """Return the root-mean-square error between predictions and targets.

    Squared errors that are NaN are ignored when averaging (``np.nanmean``).
    """
    squared_errors = np.square(y_true - y_pred)
    return np.sqrt(np.nanmean(squared_errors))

### Gaussian smoothing and linear interpolation

In [None]:
# Scatter the LLM grades against the reference scores, one panel per model.
plt.figure(figsize=(12, 4))
# Gaussian jitter on both axes so that overlapping points stay visible.
noise_grade = np.random.normal(0, 0.01, len(models_students_scores['llama']))
noise_score = np.random.normal(0, 0.1, len(m0['score_avg']))

for panel, (model_name, grades) in enumerate(models_students_scores.items(), start=1):
    plt.subplot(1, 3, panel)
    plt.scatter(
        grades + noise_grade,
        m0['score_avg'] + noise_score,
        c='black',
        s=2,
    )
    plt.title(f'Score distribution with grade for {model_name}')
    plt.xlabel('LLM Grade')
    plt.ylabel('Score')
    plt.xlim(-0.10, 1.10)
    plt.ylim(-0.50, 5.20)

# Render the figure.
plt.tight_layout()
plt.show()


In [None]:
# Distinct values of the 'id' column (used below as question identifiers).
i_set = m0['id'].unique()

In [None]:
# For each model, scatter LLM grade against reference score for six questions.
for model_name, grades in models_students_scores.items():
    print(f"Distribution of scores per question with {model_name.upper()}")
    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12, 6), constrained_layout=True)

    shown_questions = i_set[[1, 2, 3, 77, 78, 79]]
    for ax, question_id in zip(axes.ravel(), shown_questions):
        mask = m0['id'] == question_id
        # Small jitter on both axes; the x jitter is drawn first, then the y jitter.
        x_jitter = np.random.normal(0, 0.001, mask.sum())
        y_jitter = np.random.normal(0, 0.01, mask.sum())
        ax.scatter(
            grades[mask] + x_jitter,
            m0.loc[mask, 'score_avg'] + y_jitter,
            s=20,
            marker='o',
            color='black',
            alpha=0.5,
        )
        ax.set_title(f"Question {question_id}")
        ax.set_xlabel("LLM Grade")
        ax.set_ylabel("Score")
        ax.set_xlim(-0.10, 1.10)
        ax.set_ylim(-0.20, 5.20)
    plt.show()
    print()


In [None]:
# For six example questions, treat every other answer (ranked by LLM grade) as
# observed and predict the reference score of the remaining answers by linear
# interpolation over the observed ones.
for model_name, llm_grades in models_students_scores.items():
    print(f"\nScores prediction with {model_name.upper()}")
    # m1 is m0 without 'desired_answer', with the LLM grades inserted as
    # column 'res0' just before the 'question' column.
    m1 = m0.drop(columns=['desired_answer'])
    m1.insert(m1.columns.get_loc('question'), 'res0', llm_grades)

    q = m1['id'].unique()  # distinct question identifiers

    fig, axes = plt.subplots(2, 3, figsize=(12, 6), constrained_layout=True)

    # Smoothing width; not used in this cell because no Gaussian filter is applied.
    window = 0.4

    res = []
    # Positions 69..74 of q, i.e. the 70th to 75th questions.
    for ax, q_i in zip(axes.flatten(), range(69, 75)):
        question_id = q[q_i]
        per_question = m1[m1['id'] == question_id]
        df = per_question.sort_values('res0')[['res0', 'score_avg']]

        # Even positions are observed, odd positions are held out.
        i = np.arange(len(df)) % 2 == 0
        observed = df.loc[i]
        held_out = df.loc[~i]

        # The observed grades are shifted by 1e-9; the "smoothed" scores are
        # the raw observed scores since no filter is applied here.
        observed_res0 = observed['res0'].values + 1e-9
        smoothed_scores = observed['score_avg'].values

        ax.plot(held_out['res0'], held_out['score_avg'], 'o-', color='blue', label='Scores bruts non observés')
        ax.plot(observed_res0, smoothed_scores, 'o-', color='red', label='Scores lissés observés')

        # Linear interpolation of the held-out scores from the observed points.
        predicted_scores = np.interp(held_out['res0'].values, observed_res0, smoothed_scores)

        ax.plot(held_out['res0'], predicted_scores, 'o-', color='green', label='Scores prédits')
        ax.set_ylim(-0.2, 5.2)
        ax.set_xlim(-0.10, 1.10)
        ax.set_title(f'Question {question_id}')
        ax.set_xlabel('Similarité')
        ax.set_ylabel('Score')
        ax.legend()

        # Pearson correlation between predicted and actual held-out scores.
        corr = np.corrcoef(predicted_scores, held_out['score_avg'].values)[0, 1]
        res.append(corr)

    plt.show()
    # Per-question correlations for the six plotted questions.
    print(res)


In [None]:
# Grid-search the Gaussian smoothing width (sigma) that minimises the mean
# per-question RMSE on the held-out answers (odd positions after sorting by
# LLM grade); the even positions are the observed points.
for model_name in models_students_scores:
    print(f"\nCalcul de l'erreur avec {model_name}")
    # m1 is m0 without 'desired_answer', with the LLM grades inserted as
    # column 'res0' just before the 'question' column.
    m1 = m0.drop(columns=['desired_answer'])
    m1.insert(m1.columns.get_loc('question'), 'res0', models_students_scores[model_name])

    q = m1['id'].unique()  # distinct question identifiers

    # Candidate widths: 0.1 up to about 4.0 in steps of 0.1.
    window_values = np.arange(0.1, 4.1, 0.1)

    # The filtering, sorting and even/odd split do not depend on the window,
    # so they are prepared once per question instead of once per window.
    splits = []
    for question_id in q:
        df = m1[m1['id'] == question_id].sort_values('res0')[['res0', 'score_avg']]
        i = np.arange(len(df)) % 2 == 0
        splits.append((
            df.loc[i, 'res0'].values,
            # gaussian_filter returns its input dtype, so force float to avoid
            # truncating the smoothed values if the scores are integers.
            df.loc[i, 'score_avg'].values.astype(float),
            df.loc[~i, 'res0'].values,
            df.loc[~i, 'score_avg'].values,
        ))

    res3 = []
    for window in window_values:
        rmse_per_question = []
        for observed_res0, observed_scores, unobserved_res0, actual_scores in splits:
            smoothed_scores = gaussian_filter(observed_scores, sigma=window)
            predicted_scores = np.interp(unobserved_res0, observed_res0, smoothed_scores)
            rmse_per_question.append(rmse(predicted_scores, actual_scores))
        res3.append(rmse_per_question)

    # Rows: windows, columns: questions.
    res3 = np.array(res3)
    mean_rmse_per_window = res3.mean(axis=1)

    # Pair every window with its mean RMSE for display.
    result = np.vstack((window_values, mean_rmse_per_window))

    display(pd.DataFrame(data=[['Valeurs des fenêtres (window)', *result[0]], ['RMSE moyens', *result[1]]]))

    # Window with the lowest mean RMSE.
    optimal_window = window_values[np.argmin(mean_rmse_per_window)]
    print(f"Fenêtre optimale : {optimal_window:.1f}")
    print(f"RMSE Min: {np.min(result[1])}")


In [None]:
# Smoothing width per model; optimal_window must already be defined by a
# window search cell run earlier.
windows = {'llama': optimal_window}
for model_name in models_students_scores:
    print(f"\nCorrelation evaluation with {model_name}")

    # m1 is m0 without 'desired_answer', with the LLM grades inserted as
    # column 'res0' just before the 'question' column.
    m1 = m0.drop(columns=['desired_answer'])
    m1.insert(m1.columns.get_loc('question'), 'res0', models_students_scores[model_name])

    q = m1['id'].unique()  # distinct question identifiers

    # Predict the held-out scores of every question and collect the results.
    res = []
    for quest in q:
        df = m1[m1['id'] == quest].sort_values('res0')[['res0', 'score_avg']]

        # Even positions are observed, odd positions are held out.
        i = np.arange(len(df)) % 2 == 0
        observed = df.loc[i]
        held_out = df.loc[~i]

        observed_res0 = observed['res0'].values
        observed_scores = observed['score_avg'].values
        smoothed_scores = gaussian_filter(observed_scores, sigma=windows[model_name])

        # Linear interpolation of the held-out scores from the smoothed points.
        predicted_scores = np.interp(held_out['res0'].values, observed_res0, smoothed_scores)

        res.append(dict(
            pred=predicted_scores,
            observed=held_out['score_avg'].values,
            q=np.repeat(quest, len(predicted_scores)),
            i=np.flatnonzero(~i),
        ))

    # Pool predictions and actual scores over all questions.
    res_pred = np.concatenate([entry['pred'] for entry in res])
    res_observed = np.concatenate([entry['observed'] for entry in res])

    # Pearson correlation between pooled predictions and actual scores.
    correlation, _ = pearsonr(res_pred, res_observed)

    print(f"Corrélation entre les scores prédits et observés : {correlation:.3f}")


In [None]:
def choosen_index(size: int, number_of_items_to_select=15):
    """Return a boolean mask of length ``size`` with randomly chosen entries set.

    Exactly ``number_of_items_to_select`` distinct positions, drawn without
    replacement with ``np.random.choice``, are True; all others are False.
    """
    picked = np.random.choice(size, size=number_of_items_to_select, replace=False)
    mask = np.zeros(size, dtype=bool)
    mask[picked] = True
    return mask

### 15 observations chosen at random

In [None]:

# Grid-search the Gaussian smoothing width (sigma) when 15 randomly chosen
# answers per question are observed and the remaining answers are held out.
for model_name in models_students_scores:
    print(f"\nCalcul de l'erreur avec {model_name}")
    # m1 is m0 without 'desired_answer', with the LLM grades inserted as
    # column 'res0' just before the 'question' column.
    m1 = m0.drop(columns=['desired_answer'])
    m1.insert(m1.columns.get_loc('question'), 'res0', models_students_scores[model_name])

    q = m1['id'].unique()  # distinct question identifiers

    # Candidate widths: 0.1 up to about 4.0 in steps of 0.1.
    window_values = np.arange(0.1, 4.1, 0.1)

    # Draw the random observed/held-out split once per question so that every
    # window is scored on the same split; re-drawing it per window would mix
    # sampling noise into the comparison between windows.
    splits = []
    for question_id in q:
        df = m1[m1['id'] == question_id].sort_values('res0')[['res0', 'score_avg']]
        i = choosen_index(len(df), 15)
        splits.append((
            df.loc[i, 'res0'].values,
            # gaussian_filter returns its input dtype, so force float to avoid
            # truncating the smoothed values if the scores are integers.
            df.loc[i, 'score_avg'].values.astype(float),
            df.loc[~i, 'res0'].values,
            df.loc[~i, 'score_avg'].values,
        ))

    res3 = []
    for window in window_values:
        rmse_per_question = []
        for observed_res0, observed_scores, unobserved_res0, actual_scores in splits:
            smoothed_scores = gaussian_filter(observed_scores, sigma=window)
            predicted_scores = np.interp(unobserved_res0, observed_res0, smoothed_scores)
            rmse_per_question.append(rmse(predicted_scores, actual_scores))
        res3.append(rmse_per_question)

    # Rows: windows, columns: questions.
    res3 = np.array(res3)
    mean_rmse_per_window = res3.mean(axis=1)

    # Pair every window with its mean RMSE for display.
    result = np.vstack((window_values, mean_rmse_per_window))

    display(pd.DataFrame(data=[['Valeurs des fenêtres (window)', *result[0]], ['RMSE moyens', *result[1]]]))

    # Window with the lowest mean RMSE.
    optimal_window = window_values[np.argmin(mean_rmse_per_window)]
    print(f"Fenêtre optimale : {optimal_window:.1f}")
    print(f"RMSE Min: {np.min(result[1])}")


### Leave one out

In [None]:

# Grid-search the Gaussian smoothing width (sigma) when all answers but one
# randomly chosen answer per question are observed.
# NOTE(review): a single random answer is held out per question; this is not
# an exhaustive leave-one-out over every answer - confirm this is intended.
for model_name in models_students_scores:
    print(f"\nCalcul de l'erreur avec {model_name}")
    # m1 is m0 without 'desired_answer', with the LLM grades inserted as
    # column 'res0' just before the 'question' column.
    m1 = m0.drop(columns=['desired_answer'])
    m1.insert(m1.columns.get_loc('question'), 'res0', models_students_scores[model_name])

    q = m1['id'].unique()  # distinct question identifiers

    # Candidate widths: 0.1 up to about 4.0 in steps of 0.1.
    window_values = np.arange(0.1, 4.1, 0.1)

    # Draw the held-out answer once per question so that every window is
    # scored on the same split.
    splits = []
    for question_id in q:
        df = m1[m1['id'] == question_id].sort_values('res0')[['res0', 'score_avg']]
        i = choosen_index(len(df), len(df) - 1)
        splits.append((
            df.loc[i, 'res0'].values,
            # gaussian_filter returns its input dtype, so force float to avoid
            # truncating the smoothed values if the scores are integers.
            df.loc[i, 'score_avg'].values.astype(float),
            df.loc[~i, 'res0'].values,
            df.loc[~i, 'score_avg'].values,
        ))

    res3 = []
    for window in window_values:
        rmse_per_question = []
        for observed_res0, observed_scores, unobserved_res0, actual_scores in splits:
            smoothed_scores = gaussian_filter(observed_scores, sigma=window)
            predicted_scores = np.interp(unobserved_res0, observed_res0, smoothed_scores)
            # Call the rmse() helper rather than assigning to a variable named
            # `rmse`, which would replace the helper with a float and break
            # any cell that calls it afterwards.
            rmse_per_question.append(rmse(predicted_scores, actual_scores))
        res3.append(rmse_per_question)

    # Rows: windows, columns: questions.
    res3 = np.array(res3)
    mean_rmse_per_window = res3.mean(axis=1)

    # Pair every window with its mean RMSE for display.
    result = np.vstack((window_values, mean_rmse_per_window))

    display(pd.DataFrame(data=[['Valeurs des fenêtres (window)', *result[0]], ['RMSE moyens', *result[1]]]))

    # Window with the lowest mean RMSE.
    optimal_window = window_values[np.argmin(mean_rmse_per_window)]
    print(f"Fenêtre optimale : {optimal_window:.1f}")
    print(f"RMSE Min: {np.min(result[1])}")


In [None]:
# Smoothing width per model; optimal_window must already be defined by a
# window search cell run earlier.
windows = {'llama': optimal_window}
for model_name in models_students_scores:
    print(f"\nCorrelation evaluation with {model_name}")

    # m1 is m0 without 'desired_answer', with the LLM grades inserted as
    # column 'res0' just before the 'question' column.
    m1 = m0.drop(columns=['desired_answer'])
    m1.insert(m1.columns.get_loc('question'), 'res0', models_students_scores[model_name])

    q = m1['id'].unique()  # distinct question identifiers

    # Predict the held-out scores of every question and collect the results.
    res = []
    for quest in q:
        df = m1[m1['id'] == quest].sort_values('res0')[['res0', 'score_avg']]

        # Randomly observe all but 15 answers; the other 15 are held out.
        i = choosen_index(len(df), len(df) - 15)
        observed = df.loc[i]
        held_out = df.loc[~i]

        observed_res0 = observed['res0'].values
        observed_scores = observed['score_avg'].values
        smoothed_scores = gaussian_filter(observed_scores, sigma=windows[model_name])

        # Linear interpolation of the held-out scores from the smoothed points.
        predicted_scores = np.interp(held_out['res0'].values, observed_res0, smoothed_scores)

        res.append(dict(
            pred=predicted_scores,
            observed=held_out['score_avg'].values,
            q=np.repeat(quest, len(predicted_scores)),
            i=np.flatnonzero(~i),
        ))

    # Pool predictions and actual scores over all questions.
    res_pred = np.concatenate([entry['pred'] for entry in res])
    res_observed = np.concatenate([entry['observed'] for entry in res])

    # Pearson correlation between pooled predictions and actual scores.
    correlation, _ = pearsonr(res_pred, res_observed)

    print(f"Corrélation entre les scores prédits et observés : {correlation:.3f}")
