## Importo librerie e mi connetto al Drive

In [1]:
!pip install transformers
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    TFAutoModel,
    TFAutoModelWithLMHead
)
from google.colab import drive
from google.colab import files
import json

drive.mount('/content/gdrive')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m70.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

## Importo il codice delle librerie necessario per SPP

In [None]:
# IMPORT FILES
# tutti gli import vari

# r2de.utils.math
import numpy as np

DISCRIMINATION_COEFFICIENT = 1.7


def item_response_function(difficulty, skill, discrimination, guess, slip) -> float:
    """
    Computes the logistic function for the given arguments and returns a float. The initial np.product is necessary for
    the multidimensional case.
    """
    return np.product(
        np.add(
            guess,
            np.divide(
                1.0 - np.add(guess, slip),
                1.0 + np.exp(-DISCRIMINATION_COEFFICIENT * np.multiply(discrimination, np.subtract(skill, difficulty)))
            )
        )
    )


def inverse_item_response_function(difficulty, skill, discrimination, guess, slip) -> float:
    """
    Computes 1 - logistic function for the given arguments and returns a float.
    """
    return 1.0 - item_response_function(difficulty, skill, discrimination, guess, slip)


def information_function(b, theta, discrimination, guess=0, slip=0) -> float:
    """
    Information function of a question: I(theta) = (P'(theta))**2/(P(theta)*Q(theta)), where Q(theta) = 1 - P(theta)
    """
    return np.divide(
        np.square(derivative_item_response_function(b, theta, discrimination, guess, slip)),
        (
                item_response_function(b, theta, discrimination, guess, slip)
                * inverse_item_response_function(b, theta, discrimination, guess, slip)
        )
    )


def derivative_item_response_function(b, theta, discrimination, guess, slip) -> float:
    """
    Computes the derivative of the item_response_function.
    """
    x = np.exp(-DISCRIMINATION_COEFFICIENT * discrimination * (theta[0]-b[0]))
    return np.divide((1.-guess-slip) * x * (-DISCRIMINATION_COEFFICIENT) * discrimination, np.square(1 + x**2))


# data folder
DATA_PATH = 'data/'

# filenames
ANSWERS_TEXT_FILENAME = 'answers_texts.csv'
DETAILED_QS_ANSWERS_FILENAME = 'detailed_quiz_session_answer.csv'
DS_GTE_FILENAME = 'DS_GTE.csv'
DS_VAL_FILENAME = 'DS_VAL.csv'
QUESTION_COUNT_FILENAME = 'questions_counts.csv'

# headers
CORRECT_HEADER = 'IsCorrect'
COUNT_HEADER = 'count'
DESCRIPTION_HEADER = 'description'
DIFFICULTY_KEY = 'difficulty'
DISCRIMINATION_KEY = 'discrimination'
FEATURES_HEADER = 'features'
ID_HEADER = 'id'
QUESTION_ID_HEADER = 'QuestionId'
QUESTION_TEXT_HEADER = 'question_text'
TARGET_DIFFICULTY_HEADER = 'target_difficulty'
TARGET_DISCRIMINATION_HEADER = 'target_discrimination'
#TIMESTAMP_HEADER = 'time_stamp'
TIMESTAMP_HEADER = 'AnswerId'
USER_ID_HEADER = 'UserId'

# values used in the IRT estimation
DIFFICULTY_MIN = -5.0
DIFFICULTY_MAX = 5.0
DEFAULT_DISCRIMINATION = 1.0
DISCRIMINATION_MIN = -1.0
DISCRIMINATION_MAX = 2.5
DEFAULT_GUESS = 0.0
DEFAULT_SLIP = 0.0

DETAILED_QS_ANSWERS_COLUMNS = [
    USER_ID_HEADER,
    TIMESTAMP_HEADER,
    CORRECT_HEADER,
    QUESTION_ID_HEADER,
    QUESTION_TEXT_HEADER,
]
QUESTION_COUNT_COLUMNS = [QUESTION_ID_HEADER, COUNT_HEADER]
ANSWERS_TEXT_COLUMNS = [CORRECT_HEADER, DESCRIPTION_HEADER, ID_HEADER, QUESTION_ID_HEADER]

from sklearn.metrics import (
    confusion_matrix,
    mean_absolute_error,
    mean_squared_error
)
def gen_output(predicted_results, true_res):  # this function is used only in the performance prediction script
    tn, fp, fn, tp = confusion_matrix(true_res, predicted_results).ravel()
    output_str = ''
    output_str += 'acc : %.3f | ' % ((tp+tn)/(tp+tn+fp+fn))
    output_str += 'prec correct : %.3f | ' % (tp/(tp+fp))
    output_str += 'rec correct : %.3f | ' % (tp/(tp+fn))
    output_str += 'prec wrong : %.3f | ' % (tn/(tn+fn))
    output_str += 'rec wrong: %.3f ' % (tn/(tn+fp))
    return output_str

## Funzioni per calcolo SPP

In [None]:
import numpy as np
import pandas as pd


def perform_user_irt_prediction(
        interactions_df: pd.DataFrame,
        difficulty_dict: dict,
        discrimination_dict: dict,
        difficulty_range: (DIFFICULTY_MIN, DIFFICULTY_MAX),
        theta_increment=0.1,
        initial_theta=(DIFFICULTY_MAX+DIFFICULTY_MIN)/2,
        guess=DEFAULT_GUESS,
        slip=DEFAULT_SLIP,
) -> list:
    """
    :param interactions_df: dataframe containing all the interactions between users and items
    :param difficulty_dict: dictionary containing the difficulty of each item
    :param discrimination_dict: dictionary containing the discrimination of each item
    :param difficulty_range: tuple containing min and max difficulty
    :param theta_increment: the granularity of the skill level we are interested in
    :param initial_theta: starting skill level for the estimation
    :param guess: guess factor to use in the IRT model
    :param slip: slip factor to use in the IRT model
    :return: the list containing the predicted results for the interactions in the input dataframe
    """
    predicted_result = []
    estimated_theta = [initial_theta]
    thetas = np.arange(difficulty_range[0], difficulty_range[1] + theta_increment, theta_increment)
    log_likelihood = np.zeros(len(thetas), dtype=float)
    information_func = np.zeros(len(thetas), dtype=float)
    list_loglikelihood = np.zeros(len(thetas), dtype=object)
    list_information_function = np.zeros(len(thetas), dtype=object)
    for idx, theta in enumerate(thetas):
        list_loglikelihood[idx] = []
        list_information_function[idx] = []

    for true_result, item_id in interactions_df[[CORRECT_HEADER, QUESTION_ID_HEADER]].values:
        if item_id in difficulty_dict.keys() and item_id in discrimination_dict.keys():
            difficulty = [difficulty_dict[item_id]]
            discrimination = discrimination_dict[item_id]
        else:
            difficulty = [(DIFFICULTY_MAX+DIFFICULTY_MIN)/2]
            discrimination = DEFAULT_DISCRIMINATION
            print("[INFO] Question with ID %s was not known. Manually set latent traits" % item_id)

        predicted_result.append(item_response_function(difficulty, estimated_theta, discrimination, guess, slip))

        func = item_response_function if true_result == 1 else inverse_item_response_function
        for idx, theta in enumerate(thetas):
            item_log_likelihood = np.log(func(difficulty, [theta], discrimination, guess, slip))
            list_loglikelihood[idx].append(item_log_likelihood)
            log_likelihood[idx] = np.sum(list_loglikelihood[idx])

            item_information = information_function(difficulty, [theta], discrimination, guess, slip)
            list_information_function[idx].append(item_information)
            information_func[idx] = np.sum(list_information_function[idx])

        estimated_theta = [thetas[np.argmax(log_likelihood)]]

    return predicted_result


def irt_prediction_with_update(
        interactions_df: pd.DataFrame,
        difficulty_dict: dict,
        discrimination_dict: dict,
        user_id_list: list,
        difficulty_range=(DIFFICULTY_MIN, DIFFICULTY_MAX),
        theta_increment=0.1,
        initial_theta=(DIFFICULTY_MAX + DIFFICULTY_MIN) / 2,
        guess=DEFAULT_GUESS,
        slip=DEFAULT_SLIP,
) -> list:
    """
    :param interactions_df: dataframe containing all the interactions between users and items
    :param difficulty_dict: dictionary containing the difficulty of each item
    :param discrimination_dict: dictionary containing the discrimination of each item
    :param user_id_list:
    :param difficulty_range: tuple containing min and max difficulty
    :param theta_increment: the granularity of the skill level we are interested in
    :param initial_theta: starting skill level for the estimation
    :param guess: guess factor to use in the IRT model
    :param slip: slip factor to use in the IRT model
    :return: the list containing the predicted results for all the interactions and students in the input dataframe
    """
    predicted_result = []
    for user_id in user_id_list:  # performance prediction is done for all the students, one at a time
        predicted_result.extend(
            perform_user_irt_prediction(
                interactions_df=interactions_df[interactions_df[USER_ID_HEADER] == user_id],
                difficulty_dict=difficulty_dict,
                discrimination_dict=discrimination_dict,
                difficulty_range=difficulty_range,
                theta_increment=theta_increment,
                initial_theta=initial_theta,
                guess=guess,
                slip=slip,
            )
        )
    return predicted_result

## Calcolo del SPP

In [None]:
"""
Given the ground truth latent traits and the values estimated with the best performing model, this script performs the
evaluation of the accuracy on the performance prediction task. As described in the paper, these methods for predicting
the performance are used:
- ground truth IRT latent traits
- default latent traits (difficulty=0, discrimination=1)
- latent traits of test questions estimated with R2DE and default latent traits for train questions
- latent traits of test questions estimated with R2DE and IRT latent traits for train questions (this is the real-world
    scenario, as the new questions will be used to assess students together with previously existing - and therefore
    calibrated - questions)
Results are saved in the output file specified at the beginning of the script.
"""
import pandas as pd
import pickle


output_filename = '/content/gdrive/MyDrive/Thesis/data/utilities/output-performance-prediction_split50.txt'
filename = '/content/gdrive/MyDrive/Thesis/data/utilities/neurIPS_edu_competition_text_split_numbers.csv'
file = open(output_filename, 'w')


# get the dataset to perform the prediction on and sort it
df_VAL = pd.read_csv('/content/gdrive/MyDrive/Thesis/data/utilities/train_task_numbers50.csv').sort_values([USER_ID_HEADER, 'AnswerId'], ascending=True)


df = pd.read_csv(filename)
irt_diff_dict = df.set_index('id')['difficulty'].to_dict()

print(irt_diff_dict)

list_question_ids = list(df_VAL['QuestionId'].unique())

irt_diff_dict_copy = irt_diff_dict.copy()
for elem in irt_diff_dict_copy:
  if elem not in list_question_ids:
    del irt_diff_dict[elem]

#list_question_ids = [i for i in range(0, 948)]

# create the "default" dictionary
default_diff_dict = {q_id: (DIFFICULTY_MAX+DIFFICULTY_MIN)/2 for q_id in list_question_ids}
default_discr_dict = {q_id: DEFAULT_DISCRIMINATION for q_id in list_question_ids}

# collect the list of users
user_id_list = list(df_VAL[USER_ID_HEADER].unique())

# collect the true results
true_results = df_VAL[CORRECT_HEADER].values

print("[INFO] Prediction with IRT estimated latent traits...")
irt_predicted_results = irt_prediction_with_update(df_VAL, irt_diff_dict, default_discr_dict, user_id_list)
pickle.dump(irt_predicted_results, open('/content/gdrive/MyDrive/Thesis/data/utilities/performance-prediction-irt_split.p', 'wb'))
print("[INFO] Done")


print("[INFO] Below, the results of performance prediction:")
irt_predicted_results = [x >= 0.5 for x in irt_predicted_results]


output_string = 'IRT estimated latent traits: '
output_string += gen_output(irt_predicted_results, true_results)
print(output_string)
file.write(output_string)

file.close()

{653: -0.3973178869022932, 149: 1.2761733800356294, 640: 0.8204619541008116, 352: 1.174900247438427, 531: 1.0039448131651298, 597: 0.2754523511515671, 924: -2.600702985409529, 813: -0.802307899314151, 322: 0.6090414416215015, 521: -0.0934783065022318, 158: 0.7935315152819272, 847: -1.399186725270489, 1: 0.2653064901643404, 443: 0.4295764646618265, 831: -0.5055975332077302, 117: 1.975239533990934, 585: -0.3913767362174725, 208: 1.7775653176350674, 733: -1.7922209414719772, 788: -0.5510044719953365, 190: -0.4562498186542816, 99: 0.3197944227509171, 793: 1.0843733828601458, 692: 0.0815462392114802, 633: 0.2598950691022788, 419: 0.8490615330311113, 747: -0.2040076762730907, 544: -0.0565784853213766, 893: 0.2654857530750677, 145: 0.4002651149363758, 219: 0.4988290809497429, 426: 1.0179123120026714, 374: -0.2814351283455658, 213: -0.2503493781365937, 888: 0.986020034604196, 881: -1.482128623712864, 347: -0.6813687680955439, 789: 0.9494760410894376, 628: 1.126505210221935, 742: -1.33724869670

## Data Exploration on Logs File

In [None]:
df = pd.read_csv('/content/gdrive/MyDrive/Thesis/data/utilities/train_task_3_4.csv')

print(df.head())


unique_user_ids = df['UserId'].nunique()
print("\nNumber of unique UserId: ", unique_user_ids)


unique_ans_ids = df['AnswerId'].nunique()
print("\nNumber of unique Answer: ", unique_ans_ids)

unique_question_ids = df['QuestionId'].nunique()
print("\nNumber of unique QuestionId: ", unique_question_ids)

question_counts = df.groupby('UserId')['QuestionId'].count()

import matplotlib.pyplot as plt
# Plotting the distribution
plt.figure(figsize=(10, 6))
plt.hist(question_counts, bins=20, edgecolor='black')
plt.xlabel('Number of Questions')
plt.ylabel('Number of Users')
plt.title('Distribution of Number of Questions per User')
plt.show()

# Print the minimum number of questions per user
min_questions = question_counts.min()
print("Minimum number of questions per user:", min_questions)


answer_counts = df.groupby('UserId')['AnswerId'].count()


# Plotting the distribution
plt.figure(figsize=(10, 6))
plt.hist(question_counts, bins=20, edgecolor='black')
plt.xlabel('Number of Answers')
plt.ylabel('Number of Users')
plt.title('Distribution of Number of Answers per User')
plt.show()

# Print the minimum number of questions per user
min_questions = question_counts.min()
print("Minimum number of Answers per user:", min_questions)

In [None]:
df = pd.read_csv('/content/gdrive/MyDrive/Thesis/data/utilities/train_task_3_4.csv')

print(df.head())


user_counts = df.groupby('QuestionId')['UserId'].nunique()

# Plotting the distribution
plt.figure(figsize=(10, 6))
plt.hist(user_counts, bins=100, edgecolor='black')
plt.xlabel('Number of Users')
plt.ylabel('Number of Questions')
plt.title('Distribution of Number of Users per Question')
plt.show()

# Print the minimum number of questions per user
min_users = user_counts.min()
print("Minimum number of user per question:", min_users)

avg_users_per_question = df.groupby('QuestionId')['UserId'].nunique().mean()
print("Average number of users per question:", int(avg_users_per_question))

question_counts = df.groupby('QuestionId')['UserId'].nunique()
questions_less_than_10_users = (question_counts < 5).sum()
print("Number of questions with less than 5 users:", questions_less_than_10_users)

question_counts = df.groupby('QuestionId')['UserId'].nunique()
questions_less_than_10_users = (question_counts < 10).sum()
print("Number of questions with less than 10 users:", questions_less_than_10_users)

question_counts = df.groupby('QuestionId')['UserId'].nunique()
questions_less_than_10_users = (question_counts < 30).sum()
print("Number of questions with less than 30 users:", questions_less_than_10_users)


question_counts = df.groupby('QuestionId')['UserId'].nunique()
questions_less_than_10_users = (question_counts < 50).sum()
print("Number of questions with less than 50 users:", questions_less_than_10_users)

In [None]:
question_counts = df.groupby('QuestionId')['UserId'].transform('nunique')
df = df[question_counts >= 50]

print("Filtered DataFrame:")
print(df)

question_counts = df.groupby('QuestionId')['UserId'].nunique()
questions_less_than_10_users = (question_counts < 5).sum()
print("Number of questions with less than 5 users:", questions_less_than_10_users)

question_counts = df.groupby('QuestionId')['UserId'].nunique()
questions_less_than_10_users = (question_counts < 10).sum()
print("Number of questions with less than 10 users:", questions_less_than_10_users)

question_counts = df.groupby('QuestionId')['UserId'].nunique()
questions_less_than_10_users = (question_counts < 30).sum()
print("Number of questions with less than 30 users:", questions_less_than_10_users)


question_counts = df.groupby('QuestionId')['UserId'].nunique()
questions_less_than_10_users = (question_counts < 50).sum()
print("Number of questions with less than 50 users:", questions_less_than_10_users)

unique_user_ids = df['UserId'].nunique()
print("\nNumber of unique UserId: ", unique_user_ids)


unique_ans_ids = df['AnswerId'].nunique()
print("\nNumber of unique Answer: ", unique_ans_ids)

unique_question_ids = df['QuestionId'].nunique()
print("\nNumber of unique QuestionId: ", unique_question_ids)

df.to_csv('/content/gdrive/MyDrive/Thesis/data/utilities/train_task_50+.csv')

In [3]:
algebra_interactions = pd.read_csv('/content/gdrive/MyDrive/Thesis/data/utilities/train_task.csv')
print(algebra_interactions.shape)
algebra_interactions_pre = pd.read_csv('/content/gdrive/MyDrive/Thesis/data/utilities/train_task_50+.csv')
print(algebra_interactions_pre.shape)
#print(algebra_df[0]==algebra_df_pre[0])

# Count the number of distinct QuestionId values
distinct_count = algebra_interactions['QuestionId'].nunique()
print(distinct_count)

distinct_count = algebra_interactions_pre['QuestionId'].nunique()
print(distinct_count)

(1382727, 6)
(1382362, 7)
948
900
