In [None]:
# install all the libraries 
!sudo apt-get install tesseract-ocr

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 19 not upgraded.
Need to get 4,850 kB of archives.
After this operation, 16.3 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1 [1,598 kB]
Get:2 http://archive.ubuntu.com/ubuntu focal/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu focal/universe amd64 tesseract-ocr amd64 4.1.1-2build2 [262 kB]
Fetched 4,850 kB in 2s (2,745 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/Fro

In [None]:
# Install the pytesseract Python package, pinned to version 0.3.9.
!pip install pytesseract==0.3.9

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytesseract==0.3.9
  Downloading pytesseract-0.3.9-py2.py3-none-any.whl (14 kB)
Collecting Pillow>=8.0.0
  Downloading Pillow-9.4.0-cp38-cp38-manylinux_2_28_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Pillow, pytesseract
  Attempting uninstall: Pillow
    Found existing installation: Pillow 7.1.2
    Uninstalling Pillow-7.1.2:
      Successfully uninstalled Pillow-7.1.2
Successfully installed Pillow-9.4.0 pytesseract-0.3.9


In [None]:
!pip install pyirt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyirt
  Downloading pyirt-0.3.4-py3-none-any.whl (16 kB)
Collecting python-decouple
  Downloading python_decouple-3.7-py3-none-any.whl (9.9 kB)
Installing collected packages: python-decouple, pyirt
Successfully installed pyirt-0.3.4 python-decouple-3.7


In [None]:
# Import all libraries
import cv2
import pytesseract
from google.colab import drive
from google.colab import files
import os
import json

# Mount Google Drive at /content/gdrive so files under that path can be read and written.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
def get_text(image_directory, difficulties):
  """
  OCR every question image in a directory and save the text with its difficulty.

  Files are expected to be named ``<question id>.<extension>`` where the id is
  an integer. For each such file the image is read with OpenCV, passed to
  pytesseract, and the recognised text is cut at the first question mark.
  The resulting list of ``{'id', 'question', 'difficulty'}`` records is written
  to ``difficulties.json`` on Drive and offered as a browser download.

  :param image_directory: path of the directory holding the question images.
  :param difficulties: mapping from integer question id to its difficulty.
  :return: None
  :raises ValueError: if a file with a numeric name cannot be read as an image.
  :raises KeyError: if a question id has no entry in ``difficulties``.
  """
  output_path = "/content/gdrive/MyDrive/Thesis/data/difficulties.json"
  dataset = []

  for img_name in os.listdir(image_directory):
    try:
      question_id = int(os.path.splitext(img_name)[0])
    except ValueError:
      # Stray entries (hidden files, checkpoint folders, ...) are not question images.
      print('[WARN] skipping %s: name is not a question id' % img_name)
      continue

    filename = os.path.join(image_directory, img_name)
    # Load the image; cv2.imread returns None instead of raising on failure
    img = cv2.imread(filename)
    if img is None:
      raise ValueError('Could not read image file: %s' % filename)

    # Apply OCR (Optical Character Recognition) on the image
    text = pytesseract.image_to_string(img)

    # Keep only the text up to the first question mark, then collapse every
    # run of whitespace (newlines included) into a single space
    text = text.split('?')[0] + '?'
    text = ' '.join(text.split())

    dataset.append({
        'id': question_id,
        'question': text,
        'difficulty': difficulties[question_id],
    })

  # os.listdir gives no ordering guarantee; sort so the saved file is reproducible
  dataset.sort(key=lambda element: element['id'])

  # Save the dataset as JSON on Drive
  with open(output_path, "w") as final:
    json.dump(dataset, final, indent=4)

  # Download the JSON file through the browser
  files.download(output_path)

  return

In [None]:
import pandas as pd
from pyirt import irt



# Default bounds of the difficulty range; irt_estimation passes this range to
# pyirt as both theta_bnds and beta_bnds.
DIFFICULTY_MIN = -5.0
DIFFICULTY_MAX = 5.0
# Default value used for both ends of the discrimination range (alpha_bnds).
DEFAULT_DISCRIMINATION = 1.0
# Default guess parameter assigned to every question.
DEFAULT_GUESS = 0.0
# Keys of the per-question dictionary built by irt_estimation.
DIFFICULTY_KEY = 'difficulty'
DISCRIMINATION_KEY = 'discrimination'
# Column names read from the interactions DataFrame.
USER_ID_HEADER = 'UserId'
CORRECT_HEADER = 'IsCorrect'
QUESTION_ID_HEADER = 'QuestionId'



def question_irt_estimation(
        interactions_df: pd.DataFrame,
        difficulty_range=(DIFFICULTY_MIN, DIFFICULTY_MAX),
        discrimination_range=(DEFAULT_DISCRIMINATION, DEFAULT_DISCRIMINATION),
        guess=DEFAULT_GUESS
) -> dict:
    """
    Runs the IRT estimation and returns only the difficulty of each question.

    The difficulties are also written to ``q_diff.json`` on Drive and offered
    as a browser download. The saved file and the returned dictionary hold the
    same data.

    :param interactions_df: interactions with the UserId, QuestionId and IsCorrect columns.
    :param difficulty_range: (min, max) bounds forwarded to irt_estimation.
    :param discrimination_range: (min, max) discrimination bounds forwarded to irt_estimation.
    :param guess: guess parameter forwarded to irt_estimation.
    :return: dictionary mapping each question id (as int) to its estimated difficulty.
    """
    output_path = "/content/gdrive/MyDrive/Thesis/data/q_diff.json"

    _, question_params = irt_estimation(interactions_df, difficulty_range, discrimination_range, guess)
    # Normalise the ids to plain ints so they are valid JSON keys and match
    # the integer ids callers use for look-ups.
    difficulties = {
        int(question): difficulty
        for question, difficulty in question_params[DIFFICULTY_KEY].items()
    }
    print('[INFO] estimated the difficulty of %d questions' % len(difficulties))

    # Save the difficulties as JSON on Drive
    with open(output_path, "w") as final:
        json.dump(difficulties, final, indent=2)

    # Download the JSON file through the browser
    files.download(output_path)
    return difficulties


def irt_estimation(
        interactions_df: pd.DataFrame,
        difficulty_range=(DIFFICULTY_MIN, DIFFICULTY_MAX),
        discrimination_range=(DEFAULT_DISCRIMINATION, DEFAULT_DISCRIMINATION),
        guess=DEFAULT_GUESS
) -> tuple:
    """
    Given the input interactions between a set of students and a set of questions, performs with the irt method from
    pyirt the IRT estimation of the latent traits of students and questions.

    :param interactions_df: interactions with the UserId, QuestionId and IsCorrect columns.
    :param difficulty_range: (min, max) bounds, used for both theta_bnds and beta_bnds.
    :param discrimination_range: (min, max) bounds used for alpha_bnds.
    :param guess: guess parameter assigned to every question in interactions_df.
    :return: tuple ``(user_dict, question_dict)``. ``user_dict`` holds the per-user output of pyirt;
        ``question_dict`` has the keys DIFFICULTY_KEY and DISCRIMINATION_KEY, each mapping
        a question id to the corresponding value.
    :raises ValueError: if the pyirt estimation fails; the original exception is chained as the cause.
    """
    interactions_list = [
        tuple(row)
        for row in interactions_df[[USER_ID_HEADER, QUESTION_ID_HEADER, CORRECT_HEADER]].values
    ]
    # if there are some items with only correct or only wrong answers, pyirt crashes:
    # count the distinct correctness values observed per question ...
    question_count_per_correctness = interactions_df.groupby([QUESTION_ID_HEADER, CORRECT_HEADER])\
        .size().reset_index().groupby(QUESTION_ID_HEADER).size().reset_index().rename(columns={0: 'cnt'})
    list_q_to_add = list(question_count_per_correctness[question_count_per_correctness['cnt'] == 1][QUESTION_ID_HEADER])
    print('[INFO] %d questions filled in' % len(list_q_to_add))
    # ... and give every such question one correct and one wrong answer from two synthetic users.
    # NOTE(review): 'p_good' / 'p_bad' may show up in the returned user dictionary - confirm whether callers care.
    interactions_list.extend([('p_good', itemID, True) for itemID in list_q_to_add])
    interactions_list.extend([('p_bad', itemID, False) for itemID in list_q_to_add])

    try:
        item_params, user_params = irt(
            interactions_list,
            theta_bnds=difficulty_range,
            beta_bnds=difficulty_range,
            alpha_bnds=discrimination_range,
            in_guess_param={q: guess for q in interactions_df[QUESTION_ID_HEADER].unique()},
            max_iter=100
        )
    except Exception as exc:
        # Chain the original error explicitly so the real cause stays visible in the traceback.
        raise ValueError(
            "Problem in irt_estimation. Check if there are items with only correct/wrong answers."
        ) from exc

    question_dict = {
        # Difficulty is stored as the negated pyirt 'beta'
        # (presumably because of pyirt's sign convention - TODO confirm against the pyirt docs).
        DIFFICULTY_KEY: {question: -params['beta'] for question, params in item_params.items()},
        DISCRIMINATION_KEY: {question: params['alpha'] for question, params in item_params.items()},
    }
    user_dict = dict(user_params)
    return user_dict, question_dict

In [None]:
# Load the student-question interactions (UserId, QuestionId and IsCorrect columns are used).
interactions_df = pd.read_csv('/content/gdrive/MyDrive/Thesis/data/train_task.csv')
# Estimate the difficulty of every question with IRT.
difficulties = question_irt_estimation(interactions_df)
# OCR the question images and store each question's text together with its difficulty.
image_directory = '/content/gdrive/MyDrive/Thesis/data/images'
get_text(image_directory, difficulties)

[INFO] 1 questions filled in


100%|██████████| 4920/4920 [00:14<00:00, 345.91it/s]
100%|██████████| 948/948 [00:00<00:00, 2440.66it/s]
100%|██████████| 948/948 [00:01<00:00, 532.62it/s]
100%|██████████| 4920/4920 [00:02<00:00, 1945.10it/s]
100%|██████████| 4920/4920 [00:15<00:00, 320.97it/s]
100%|██████████| 948/948 [00:00<00:00, 2430.97it/s]
100%|██████████| 948/948 [00:01<00:00, 801.13it/s]
100%|██████████| 4920/4920 [00:01<00:00, 3264.20it/s]
100%|██████████| 4920/4920 [00:14<00:00, 335.73it/s]
100%|██████████| 948/948 [00:00<00:00, 2562.41it/s]
100%|██████████| 948/948 [00:01<00:00, 849.21it/s]
100%|██████████| 4920/4920 [00:01<00:00, 3386.07it/s]


{0: 0.40438169856317846, 1: 0.17829739249826282, 2: 0.6939062785593891, 3: -1.5267676840997713, 4: 0.6482899742571356, 5: 1.9271645972128066, 6: -0.012510786952367093, 7: 1.0165337930126177, 8: 0.6408866765677722, 9: 0.8506778469282537, 10: 0.6455244786767814, 11: 0.3265869242493822, 12: 0.6973680863412833, 13: -0.8222554565305124, 14: -0.6993001010808941, 15: 0.006669058992515857, 16: -0.31031228258266036, 17: 0.4990037725757405, 18: 1.064538534468595, 19: -0.49347233220763215, 20: -0.44594125627635306, 21: 0.5234042319926628, 22: -0.445932415451189, 23: 0.5395188736615878, 24: -0.46048387829795595, 25: -1.5675154260652207, 26: -0.06470094403364118, 27: 0.3764785764041615, 28: 0.30292493189489916, 29: -0.44083676240727193, 30: 0.14558775497434948, 31: -0.7210740346650588, 32: 0.7028122218626591, 33: 1.3555710718644542, 34: 0.16502908739894562, 35: 0.008073597236174767, 36: 0.6197256302123214, 37: -0.3836353984356596, 38: 0.18144065077032293, 39: 0.4556071990920633, 40: 0.8448656846199

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{0: 0.40438169856317846, 1: 0.17829739249826282, 2: 0.6939062785593891, 3: -1.5267676840997713, 4: 0.6482899742571356, 5: 1.9271645972128066, 6: -0.012510786952367093, 7: 1.0165337930126177, 8: 0.6408866765677722, 9: 0.8506778469282537, 10: 0.6455244786767814, 11: 0.3265869242493822, 12: 0.6973680863412833, 13: -0.8222554565305124, 14: -0.6993001010808941, 15: 0.006669058992515857, 16: -0.31031228258266036, 17: 0.4990037725757405, 18: 1.064538534468595, 19: -0.49347233220763215, 20: -0.44594125627635306, 21: 0.5234042319926628, 22: -0.445932415451189, 23: 0.5395188736615878, 24: -0.46048387829795595, 25: -1.5675154260652207, 26: -0.06470094403364118, 27: 0.3764785764041615, 28: 0.30292493189489916, 29: -0.44083676240727193, 30: 0.14558775497434948, 31: -0.7210740346650588, 32: 0.7028122218626591, 33: 1.3555710718644542, 34: 0.16502908739894562, 35: 0.008073597236174767, 36: 0.6197256302123214, 37: -0.3836353984356596, 38: 0.18144065077032293, 39: 0.4556071990920633, 40: 0.8448656846199

In [None]:
def get_difficulties(filename='/content/gdrive/MyDrive/Thesis/data/diff_only.json'):
  """
  Load the stored difficulties from a JSON file and return them.

  :param filename: path of the JSON file to read; defaults to ``diff_only.json`` on Drive.
  :return: the parsed JSON content.
  :raises json.JSONDecodeError: if the file does not contain valid JSON (for
      example when it is empty); the message names the offending file.
  """
  with open(filename, encoding='utf-8') as f:
    try:
      return json.load(f)
    except json.JSONDecodeError as exc:
      # Re-raise the same exception type, but say which file is broken.
      raise json.JSONDecodeError(
          'invalid JSON in %s: %s' % (filename, exc.msg), exc.doc, exc.pos
      ) from exc

# Load and show the difficulties stored in diff_only.json.
get_difficulties()

Data type before reconstruction :  <class 'str'>


JSONDecodeError: ignored

In [None]:
from pandas.core.generic import DataFrameFormatter
import pandas as pd

# Load the error-rate table from Drive
df = pd.read_csv('/content/gdrive/MyDrive/Thesis/data/error_rates.csv')

# Keep the rows whose IsCorrect value equals 1.0
correct_answers = df.loc[df['IsCorrect'].eq(1.0)]

# Take the question ids of those rows
question_ids = correct_answers.loc[:, 'QuestionId']

# Show the selected ids
print(question_ids)

660    660
841    841
847    847
Name: QuestionId, dtype: int64
