In [1]:
!pip install datasets
!pip install -U sentence-transformers

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#read data
import pandas as pd
import numpy as np
miscon_df = pd.read_csv('misconception_mapping.csv')
train_df = pd.read_csv('train_data.csv')
train_df = train_df.sort_values(by='QuestionId')
test_df = pd.read_csv('test_data.csv')
test_df = test_df.sort_values(by='QuestionId')

In [4]:
#process data to obtain question-answer pairs
def process_df(data_df):
  df = pd.DataFrame()

  misconception_map = pd.Series(miscon_df.MisconceptionName.values, index=miscon_df.MisconceptionId).to_dict()
  rows = []
  for _, row in data_df.iterrows():
      incorrect_answers = {
          'A': (row['AnswerAText'], row['MisconceptionAId']),
          'B': (row['AnswerBText'], row['MisconceptionBId']),
          'C': (row['AnswerCText'], row['MisconceptionCId']),
          'D': (row['AnswerDText'], row['MisconceptionDId'])
      }
      for answer_key in ['A', 'B', 'C', 'D']:
          if answer_key == row['CorrectAnswer']:
              continue

          answer_text, misconception_id = incorrect_answers[answer_key]

          misconception_name = misconception_map.get(misconception_id, "Unknown")

          if not misconception_name == "Unknown":
            rows.append({
                'QuestionId': row['QuestionId'],
                'SubjectName': row['SubjectName'],
                'ConstructName': row['ConstructName'],
                'QuestionText': row['QuestionText'],
                'AnswerText': answer_text,
                'MisconceptionId': misconception_id,
                'MisconceptionName': misconception_name
            })

  df = pd.DataFrame(rows)

  return df

In [5]:
train_df = process_df(train_df)
test_df = process_df(test_df)

In [32]:
import random

def get_random_unseen_misconception(column_name, keyword, df):
    seen_misconceptions = set(df[df[column_name] == keyword]['MisconceptionName'])

    all_misconceptions = set(df['MisconceptionName'].unique())

    unseen_misconceptions = all_misconceptions - seen_misconceptions

    if unseen_misconceptions:
        return list(unseen_misconceptions)
    else:
        return []

In [23]:
#prepare data
from sentence_transformers import InputExample
import pandas as pd


subject_train_examples = []
for _, row in train_df.iterrows():
    query_text = f"{row['SubjectName']}."
    positive_example = row['MisconceptionName']

    subject_train_examples.append(InputExample(texts=[query_text, positive_example], label=1.0))

    unseen_miscons = get_random_unseen_misconception('SubjectName', row['SubjectName'], train_df)
    negative_samples = random.sample(unseen_miscons, min(len(unseen_miscons), 3))
    for neg_sub in negative_samples:
        subject_train_examples.append(InputExample(texts=[query_text, neg_sub], label=0.0))


In [24]:
from sentence_transformers import SentenceTransformer, losses, util
from torch.utils.data import DataLoader
from datasets import Dataset
import logging

In [25]:
subject_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

train_dataloader = DataLoader(subject_train_examples, shuffle=True, batch_size=16)

train_loss = losses.CosineSimilarityLoss(subject_model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [26]:
#train model
subject_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=20,
    warmup_steps=int(len(train_dataloader) * 0.1),
    show_progress_bar=True
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
500,0.0909
1000,0.0713
1500,0.0572
2000,0.0546
2500,0.0472
3000,0.0467
3500,0.0401
4000,0.0415
4500,0.0359
5000,0.0374


In [33]:
construct_train_examples = []
for _, row in train_df.iterrows():
    query_text = f"{row['ConstructName']}."
    positive_example = row['MisconceptionName']

    construct_train_examples.append(InputExample(texts=[query_text, positive_example], label=1.0))

    unseen_miscons = get_random_unseen_misconception('ConstructName', row['ConstructName'], train_df)
    negative_samples = random.sample(unseen_miscons, min(len(unseen_miscons), 3))
    for neg_sub in negative_samples:
        construct_train_examples.append(InputExample(texts=[query_text, neg_sub], label=0.0))
construct_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

train_dataloader = DataLoader(construct_train_examples, shuffle=True, batch_size=16)

train_loss = losses.CosineSimilarityLoss(construct_model)

#train model
construct_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=20,
    warmup_steps=int(len(train_dataloader) * 0.1),
    show_progress_bar=True
)

Step,Training Loss
500,0.0843
1000,0.0612
1500,0.0556
2000,0.047
2500,0.0455
3000,0.0395
3500,0.0387
4000,0.0341
4500,0.0338
5000,0.0312


In [180]:
train_examples = []
for _, row in train_df.iterrows():
    query_text = f"{row['SubjectName']}. {row['ConstructName']}. {row['QuestionText']} [SEP] {row['AnswerText']}"
    positive_example = row['MisconceptionName']

    train_examples.append(InputExample(texts=[query_text, positive_example], label=1.0))

    #hard negative: from same question but different answer
    #hard_neg = train_df[(train_df['QuestionText'] == row['QuestionText']) & (train_df['MisconceptionName'] != row['MisconceptionName'])]
    #for _, hard_neg_row in hard_neg.iterrows():
        #hard_negative_example = hard_neg_row['MisconceptionName']
        #train_examples.append(InputExample(texts=[query_text, hard_negative_example], label=0.0))
        #print('appended hard neg')
    #random negative: from different questions
    rand_neg = train_df[(train_df['QuestionText'] != row['QuestionText']) & (train_df['MisconceptionName'] != row['MisconceptionName'])].sample(10)
    for _, rand_neg_row in rand_neg.iterrows():
        rand_negative_example = rand_neg_row['MisconceptionName']
        train_examples.append(InputExample(texts=[query_text, rand_negative_example], label=0.0))
        #print('appended rand neg')


In [122]:
all_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

train_loss = losses.CosineSimilarityLoss(all_model)

#train model
all_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=20,
    warmup_steps=int(len(train_dataloader) * 0.1),
    show_progress_bar=True
)

Step,Training Loss
500,0.0453
1000,0.0377
1500,0.0337
2000,0.0337
2500,0.0305
3000,0.0319
3500,0.027
4000,0.0251
4500,0.0265
5000,0.0242


KeyboardInterrupt: 

In [131]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)

all_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    warmup_steps=int(len(train_dataloader) * 0.1),
    show_progress_bar=True,
    optimizer_params={'lr': 1e-5}
)

Step,Training Loss
500,0.0091
1000,0.0099
1500,0.01
2000,0.0088
2500,0.0092
3000,0.0087
3500,0.0087
4000,0.0083


In [112]:
question_train_examples = []
for _, row in train_df.iterrows():
    query_text = f"{row['QuestionText']} [SEP] {row['AnswerText']}"
    positive_example = row['MisconceptionName']

    question_train_examples.append(InputExample(texts=[query_text, positive_example], label=1.0))

    #hard negative: from same question but different answer
    #hard_neg = train_df[(train_df['QuestionText'] == row['QuestionText']) & (train_df['MisconceptionName'] != row['MisconceptionName'])]
    #for _, hard_neg_row in hard_neg.iterrows():
        #hard_negative_example = hard_neg_row['MisconceptionName']
        #question_train_examples.append(InputExample(texts=[query_text, hard_negative_example], label=0.0))
        #print('appended hard neg')
    #random negative: from different questions
    rand_neg = train_df[(train_df['QuestionText'] != row['QuestionText']) & (train_df['MisconceptionName'] != row['MisconceptionName'])].sample(3)
    for _, rand_neg_row in rand_neg.iterrows():
        rand_negative_example = rand_neg_row['MisconceptionName']
        question_train_examples.append(InputExample(texts=[query_text, rand_negative_example], label=0.0))
        #print('appended rand neg')


question_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

train_dataloader = DataLoader(question_train_examples, shuffle=True, batch_size=16)

train_loss = losses.CosineSimilarityLoss(question_model)

#train model
question_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=20,
    warmup_steps=int(len(train_dataloader) * 0.1),
    show_progress_bar=True
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg
appended rand neg

Step,Training Loss
500,0.0883
1000,0.0662
1500,0.0556
2000,0.0492
2500,0.0447
3000,0.0407
3500,0.0371
4000,0.035
4500,0.0326
5000,0.0306


In [60]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

def apk(actual, predicted, k=25):
    if not actual:
        return 0.0

    actual = [actual]

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    #print(score / min(len(actual), k))
    return score / min(len(actual), k)

def mapk(actual, predicted, k=25):
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

In [182]:
def bi_encoder_top_25_miscons(row, model, misconception_embeddings):
    query = row['SubjectName'] + '. ' + row['ConstructName'] + '. The question is ' + row['QuestionText'] + ' The student thinks the answer is ' + row['AnswerText']
    with torch.no_grad():
        query_embedding = model.encode(query, convert_to_tensor=True, device='cuda')
    misconception_embeddings = misconception_embeddings.to('cuda')
    #calculate similarities and retrieve top-k misconceptions
    similarities = torch.nn.functional.cosine_similarity(query_embedding, misconception_embeddings)
    top_k_indices = torch.topk(similarities, k=25, largest=True).indices.cpu().numpy()
    #print(torch.topk(similarities, k=25, largest=True))
    return top_k_indices
    #topk = similarities.topk(25)
    #return list(zip(topk.values.tolist(), topk.indices.tolist()))

def evaluate_bi_encoder(model):
  contains_count = 0
  mini_l6_v2_preds = []
  actual = []
  misconceptions = miscon_df['MisconceptionName'].tolist()
  with torch.no_grad():
    misconception_embeddings = model.encode(misconceptions, convert_to_tensor=True, device='cuda')

  for i, row in test_df.iterrows():
    pred_list = bi_encoder_top_25_miscons(row, model, misconception_embeddings)
    mini_l6_v2_preds.append(pred_list)
    actual.append(row['MisconceptionId'])
    if int(row['MisconceptionId']) in pred_list:
      contains_count += 1
  contains_ratio = contains_count / len(test_df)
  print(f'ratio of mini_l6_v2 bi-encoder\'s top 25 containing correct misconception: {contains_ratio}')
  apk_score = mapk(actual, mini_l6_v2_preds)
  print(f'mapk of mini_l6_v2 bi-encoder\'s top 25: {apk_score}')
  return mini_l6_v2_preds

In [31]:
evaluate_bi_encoder(subject_model)

ratio of mini_l6_v2 bi-encoder's top 25 containing correct misconception: 0.5593607305936074
0.0
0.0
0.0
0.0
0.0
0.04
0.16666666666666666
0.0
0.05263157894736842
0.0
1.0
1.0
0.0
0.2
0.0
0.045454545454545456
0.0
0.5
0.0
0.0
0.0
0.0
0.5
0.047619047619047616
0.045454545454545456
0.16666666666666666
0.1
0.0
0.16666666666666666
0.14285714285714285
0.0
0.0
0.0
0.0
0.06666666666666667
0.3333333333333333
0.25
0.3333333333333333
0.043478260869565216
0.3333333333333333
0.0
0.0
1.0
0.16666666666666666
0.0625
0.3333333333333333
0.0
0.1
0.125
0.1
0.0
0.0
0.0
0.0
0.0
0.0
0.5
0.1
0.0
0.1111111111111111
0.0
0.05263157894736842
0.5
0.08333333333333333
0.2
0.0
0.043478260869565216
0.0
0.0
0.0
0.0
0.0
0.07142857142857142
0.0
0.0
0.045454545454545456
0.07142857142857142
0.2
0.5
1.0
1.0
0.25
0.0
0.3333333333333333
0.0
0.0
0.0
0.0
0.0
0.5
0.0
0.0
0.05263157894736842
0.1
0.2
0.0
0.0
0.0
0.0
0.0
0.0
0.3333333333333333
0.08333333333333333
0.0625
0.16666666666666666
0.1
0.0
1.0
0.045454545454545456
0.1666666666

In [34]:
evaluate_bi_encoder(construct_model)

ratio of mini_l6_v2 bi-encoder's top 25 containing correct misconception: 0.6095890410958904
0.0
0.2
0.0
0.5
0.09090909090909091
0.09090909090909091
0.2
0.058823529411764705
0.0
0.0
1.0
1.0
0.2
0.5
0.0
0.5
0.0
0.25
0.1111111111111111
0.0
0.0
0.0
0.125
0.0
0.1
0.25
0.08333333333333333
0.3333333333333333
0.043478260869565216
0.045454545454545456
0.5
0.0
0.1111111111111111
0.125
0.07692307692307693
0.0
0.05
0.0
0.05
0.0
0.0
0.0
0.3333333333333333
1.0
0.07142857142857142
0.125
0.0
0.2
0.2
0.2
0.0
0.0
0.0
0.07692307692307693
0.07692307692307693
0.0
0.06666666666666667
0.14285714285714285
0.0
0.14285714285714285
0.25
0.05555555555555555
0.08333333333333333
0.2
0.16666666666666666
0.0
0.2
0.25
0.0
0.0
0.16666666666666666
0.0
0.125
0.0
0.0
0.0
0.0
0.07142857142857142
1.0
1.0
0.2
1.0
0.0
0.14285714285714285
0.0
0.0
0.0
0.0
0.0
0.25
0.0
0.0
0.14285714285714285
1.0
0.07692307692307693
0.058823529411764705
0.0
0.0
0.0
0.0
0.06666666666666667
1.0
0.3333333333333333
0.3333333333333333
0.04
0.0
0.0
1

In [41]:
evaluate_bi_encoder(question_model)

ratio of mini_l6_v2 bi-encoder's top 25 containing correct misconception: 0.5091324200913242
0.0
1.0
0.2
0.0
0.0
0.0
0.0
0.07142857142857142
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.16666666666666666
1.0
0.0
0.0
0.0
0.0
0.07142857142857142
0.04
0.06666666666666667
1.0
1.0
0.0
0.047619047619047616
0.1111111111111111
0.125
0.0
0.1111111111111111
0.1
0.043478260869565216
0.0
0.1111111111111111
0.0
0.0
0.0
0.0
0.05555555555555555
1.0
0.1111111111111111
0.1111111111111111
0.3333333333333333
0.045454545454545456
0.5
0.5
0.5
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.041666666666666664
0.041666666666666664
1.0
0.125
0.5
0.047619047619047616
0.0
0.0
0.0
0.2
0.0
0.0
0.0
0.0
0.5
0.07692307692307693
0.0
1.0
0.25
0.5
1.0
1.0
1.0
0.5
0.0
0.0
0.0
0.16666666666666666
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.2
0.3333333333333333
0.16666666666666666
0.125
0.14285714285714285
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.05263157894736842
1.0
0.14285714285714285
0.25
0.1111111111111111
0.0
0.0
1.0
0.0
0.25
0.25
0.16666666666666666
0.111111111111

In [117]:
evaluate_bi_encoder(all_model)

ratio of mini_l6_v2 bi-encoder's top 25 containing correct misconception: 0.7123287671232876
mapk of mini_l6_v2 bi-encoder's top 25: 0.24969859320427115


In [123]:
evaluate_bi_encoder(all_model)

ratio of mini_l6_v2 bi-encoder's top 25 containing correct misconception: 0.7100456621004566
mapk of mini_l6_v2 bi-encoder's top 25: 0.2653534574475486


In [127]:
evaluate_bi_encoder(all_model)

ratio of mini_l6_v2 bi-encoder's top 25 containing correct misconception: 0.7420091324200914
mapk of mini_l6_v2 bi-encoder's top 25: 0.2833203871698562


In [132]:
evaluate_bi_encoder(all_model)

ratio of mini_l6_v2 bi-encoder's top 25 containing correct misconception: 0.7397260273972602
mapk of mini_l6_v2 bi-encoder's top 25: 0.28874866376965475


In [183]:
model_preds = evaluate_bi_encoder(all_model)

ratio of mini_l6_v2 bi-encoder's top 25 containing correct misconception: 0.7397260273972602
mapk of mini_l6_v2 bi-encoder's top 25: 0.28874866376965475


In [184]:
print(model_preds)

[array([ 122, 1955, 1759, 1914,  329, 1622,  800, 2510, 1270, 1822, 1786,
       1033, 2361,  292,  658, 1341, 1195,  518,  597, 1643,  847, 2191,
       2206, 1756,  988]), array([ 122, 1759, 1914, 1955, 1822, 1786, 1622,  292, 1270,  329,  800,
       2510, 2361, 1033,  658, 1341, 1195,  518,  597, 1643,  988, 1756,
        847,   67, 2191]), array([ 804, 1955, 1759,  122, 1033, 1914, 2361,  318,  292,  329, 1786,
       1622,  847,  658, 1270, 1822,  800, 2510, 1396, 1341, 2363,  145,
        597,  518, 2191]), array([ 808, 1252, 1333,  653,  590, 1985,  159, 1292, 1325, 2072, 1780,
         71,  881,  828, 1678,   18,  982,  584,  454, 1802, 1295, 1565,
       1996,  709, 2090]), array([ 808, 1252, 1333,  653, 1985,  590,  159, 1292, 1325, 2072, 1780,
         71,  881,  828,   18, 1678,  982,  584,  454, 1802, 1295, 1565,
       1996,  709, 2090]), array([  38, 1829,  811, 2043, 1590,  863, 1064,  742,  981, 1682,   84,
        214, 1771, 1880, 1828, 1754, 1612, 1162,  550, 1660, 

In [128]:
all_model.save('bi-encoder')

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [129]:
import shutil
shutil.make_archive('bi-encoder', 'zip', 'bi-encoder')

'/content/bi-encoder.zip'

In [52]:

bi_encoder_top_25_miscons(test_df.iloc[0], subject_model, misconception_embeddings1)

torch.return_types.topk(
values=tensor([0.9788, 0.9767, 0.9766, 0.9755, 0.9751, 0.9751, 0.9745, 0.9740, 0.9739,
        0.9738, 0.9731, 0.9731, 0.9728, 0.9720, 0.9717, 0.9710, 0.9710, 0.9702,
        0.9691, 0.9691, 0.9687, 0.9686, 0.9685, 0.9678, 0.9661],
       device='cuda:0'),
indices=tensor([1965, 1973,  800,  145, 1341,   67, 1756,   39, 2361, 1270, 1622, 1786,
         122,   61, 2039,  988,  318, 1033,  292, 1759, 1914,  364, 1632, 2138,
        1710], device='cuda:0'))


In [55]:
misconceptions = miscon_df['MisconceptionName'].tolist()
with torch.no_grad():
  misconception_embeddings1 = subject_model.encode(misconceptions, convert_to_tensor=True, device='cuda')
  misconception_embeddings2 = construct_model.encode(misconceptions, convert_to_tensor=True, device='cuda')
  misconception_embeddings3 = question_model.encode(misconceptions, convert_to_tensor=True, device='cuda')
subject_model_scores = []
construct_model_scores = []
question_model_scores = []
actual = []
for i, row in test_df.iterrows():
  subject_model_scores.append(bi_encoder_top_25_miscons(row, subject_model, misconception_embeddings1))
  construct_model_scores.append(bi_encoder_top_25_miscons(row, construct_model, misconception_embeddings2))
  question_model_scores.append(bi_encoder_top_25_miscons(row, question_model, misconception_embeddings3))
  actual.append([int(row['MisconceptionId'])])

In [57]:
print(subject_model_scores[0])

[(0.9788292646408081, 1965), (0.9767197966575623, 1973), (0.9766165018081665, 800), (0.9754557013511658, 145), (0.9751241207122803, 1341), (0.9750621318817139, 67), (0.9744537472724915, 1756), (0.9739596247673035, 39), (0.9739352464675903, 2361), (0.97381991147995, 1270), (0.9731130003929138, 1622), (0.9730510711669922, 1786), (0.9727861285209656, 122), (0.9719735383987427, 61), (0.9717446565628052, 2039), (0.9710230231285095, 988), (0.9710031747817993, 318), (0.970203697681427, 1033), (0.9691189527511597, 292), (0.9691048860549927, 1759), (0.9686893820762634, 1914), (0.9686007499694824, 364), (0.9685181379318237, 1632), (0.9677509069442749, 2138), (0.9660635590553284, 1710)]


In [63]:
print(len(subject_model_scores))

438


In [67]:
actual = []
for i, row in test_df.iterrows():
  actual.append(int(row['MisconceptionId']))

In [68]:
print(actual)

[329, 847, 329, 590, 71, 1214, 811, 1214, 218, 961, 1322, 1322, 779, 990, 969, 132, 11, 1743, 1324, 1214, 2093, 1648, 1876, 1402, 1605, 838, 838, 271, 2392, 2392, 322, 1093, 52, 1631, 2376, 936, 1240, 1743, 1868, 1743, 2126, 1720, 2472, 1522, 650, 1561, 2329, 1990, 1990, 1990, 955, 1708, 955, 373, 373, 220, 1133, 694, 332, 2365, 1707, 725, 893, 1492, 20, 220, 982, 828, 2517, 307, 2316, 2532, 1082, 1510, 1795, 2346, 1452, 2239, 1322, 2355, 2355, 926, 1417, 130, 1214, 181, 110, 2093, 1648, 159, 1319, 1663, 1358, 1825, 606, 1639, 971, 1631, 315, 260, 438, 656, 1026, 1026, 2262, 1452, 888, 1598, 22, 2395, 2375, 2237, 2185, 1706, 843, 843, 843, 1730, 1730, 1815, 545, 255, 1809, 2542, 1705, 328, 1507, 1746, 1746, 1746, 2316, 2481, 800, 2361, 2361, 619, 619, 329, 1582, 1554, 1554, 1554, 19, 2082, 754, 1872, 1872, 894, 2511, 780, 8, 114, 2362, 15, 1507, 1287, 1287, 1073, 946, 1329, 1524, 845, 2027, 379, 212, 1990, 1990, 1990, 2417, 2417, 1976, 1035, 151, 2350, 1825, 1825, 28, 455, 1426, 1214, 

In [83]:
from scipy.optimize import minimize, differential_evolution

model_1_scores = subject_model_scores
model_2_scores = construct_model_scores
model_3_scores = question_model_scores

def objective(weights):
    w1, w2, w3 = weights
    combined_predictions = []

    for row_scores_1, row_scores_2, row_scores_3 in zip(model_1_scores, model_2_scores, model_3_scores):
        weighted_scores = {}

        for score, misconception_id in row_scores_1:
            weighted_scores[misconception_id] = weighted_scores.get(misconception_id, 0) + w1 * score
        for score, misconception_id in row_scores_2:
            weighted_scores[misconception_id] = weighted_scores.get(misconception_id, 0) + w2 * score
        for score, misconception_id in row_scores_3:
            weighted_scores[misconception_id] = weighted_scores.get(misconception_id, 0) + w3 * score

        sorted_items = sorted(weighted_scores, key=weighted_scores.get, reverse=True)
        combined_predictions.append(sorted_items[:25])
    #print(combined_predictions)

    #ret = 0
    #for i in range(len(actual)):
        #ret -= mapk(actual[i], combined_predictions[i], k=25)
    return -mapk(actual, combined_predictions, k=25)

# Initial weights and constraints
initial_weights = [0.1, 0.3, 0.6]  # Start with equal weights
constraints = [{'type': 'eq', 'fun': lambda w: np.sum(w) - 1}]  # Weights should sum to 1
bounds = [(0, 1), (0, 1), (0, 1)]  # Each weight should be between 0 and 1

# Optimize weights
#result = minimize(objective, initial_weights, bounds=bounds, constraints=constraints)
result = minimize(objective, initial_weights, bounds=bounds, constraints=constraints, method='SLSQP')

optimal_weights = result.x

print("Optimal weights:", optimal_weights)
print("Optimal MAP@25 score:", -result.fun)

ValueError: `constraint` of an unknown type is passed.

In [88]:
print(objective((0.3, 0.3, 0.3)))

-0.22627920360305662


In [104]:
model_weights = {
    'model_1': [0.0, 0.0, 0.0],
    'model_2': [0.0, 0.0, 0.0],
    'model_3': [0.3, 0.3, 0.3]
}

print(objective(model_weights))

-0.19071141326802918


In [91]:
model_weights = {
    'model_1': [0.5, 0.3, 0.2],  # Weights for top 5, next 10, and final 10 positions
    'model_2': [0.4, 0.3, 0.3],
    'model_3': [0.2, 0.4, 0.4]
}

model_1_scores = subject_model_scores
model_2_scores = construct_model_scores
model_3_scores = question_model_scores

def objective(model_weights):
    combined_predictions = []

    for row_scores_1, row_scores_2, row_scores_3 in zip(model_1_scores, model_2_scores, model_3_scores):
        weighted_scores = {}
        i = 0
        for score, misconception_id in row_scores_1:
            if i < 5:
                group = 0
            elif i < 15:
                group = 1
            else:
                group = 2
            weighted_scores[misconception_id] = weighted_scores.get(misconception_id, 0) + model_weights['model_1'][group] * score
            i+=1
        i = 0
        for score, misconception_id in row_scores_2:
            if i < 5:
                group = 0
            elif i < 15:
                group = 1
            else:
                group = 2
            weighted_scores[misconception_id] = weighted_scores.get(misconception_id, 0) + model_weights['model_2'][group] * score
            i+=1
        i = 0
        for score, misconception_id in row_scores_3:
            if i < 5:
                group = 0
            elif i < 15:
                group = 1
            else:
                group = 2
            weighted_scores[misconception_id] = weighted_scores.get(misconception_id, 0) + model_weights['model_3'][group] * score
            i+=1

        sorted_items = sorted(weighted_scores, key=weighted_scores.get, reverse=True)
        combined_predictions.append(sorted_items[:25])
    #print(combined_predictions)

    #ret = 0
    #for i in range(len(actual)):
        #ret -= mapk(actual[i], combined_predictions[i], k=25)
    return -mapk(actual, combined_predictions, k=25)

# Initial weights and constraints
initial_weights = model_weights  # Start with equal weights
constraints = [{'type': 'eq', 'fun': lambda w: np.sum(w) - 1}]  # Weights should sum to 1
bounds = [(0, 1), (0, 1), (0, 1)]  # Each weight should be between 0 and 1

# Optimize weights
result = minimize(objective, initial_weights, bounds=bounds, constraints=constraints)
#result = minimize(objective, initial_weights, bounds=bounds, constraints=constraints, method='SLSQP')

optimal_weights = result.x

print("Optimal weights:", optimal_weights)
print("Optimal MAP@25 score:", -result.fun)

ValueError: The number of bounds is not compatible with the length of `x0`.

In [142]:
print(len(train_examples))

43252


In [176]:
ce_train_examples = []
for _, row in train_df.iterrows():
    query_text = f"{row['SubjectName']}. {row['ConstructName']}. {row['QuestionText']} [SEP] {row['AnswerText']}"
    positive_example = row['MisconceptionName']

    ce_train_examples.append(InputExample(texts=[query_text, positive_example], label=1.0))

    #hard negative: from same question but different answer
    #hard_neg = train_df[(train_df['QuestionText'] == row['QuestionText']) & (train_df['MisconceptionName'] != row['MisconceptionName'])]
    #for _, hard_neg_row in hard_neg.iterrows():
        #hard_negative_example = hard_neg_row['MisconceptionName']
        #train_examples.append(InputExample(texts=[query_text, hard_negative_example], label=0.0))
        #print('appended hard neg')
    #random negative: from different questions
    rand_neg = train_df[(train_df['QuestionText'] != row['QuestionText']) & (train_df['MisconceptionName'] != row['MisconceptionName'])].sample(10)
    for _, rand_neg_row in rand_neg.iterrows():
        rand_negative_example = rand_neg_row['MisconceptionName']
        ce_train_examples.append(InputExample(texts=[query_text, rand_negative_example], label=0.0))
        #print('appended rand neg')

In [177]:
#fine tune cross encoder

from sentence_transformers import CrossEncoder

#train_examples = [
    #InputExample(texts=[query, answer], label=score) for query, answer, score in train_data
#]

#ce_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', num_labels=1)

train_dataloader = DataLoader(ce_train_examples, shuffle=True, batch_size=16)

ce_model.fit(
    train_dataloader=train_dataloader,
    epochs=3,
    warmup_steps=int(len(train_dataloader) * 0.1)
)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2704 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2704 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2704 [00:00<?, ?it/s]

In [173]:
ce_hardneg_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', num_labels=1)
train_dataloader = DataLoader(ce_train_examples, shuffle=True, batch_size=16)


ce_hardneg_model.fit(
    train_dataloader=train_dataloader,
    epochs=6,
    warmup_steps=int(len(train_dataloader) * 0.1)
)

Epoch:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/983 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Iteration:   0%|          | 0/983 [00:00<?, ?it/s]

Iteration:   0%|          | 0/983 [00:00<?, ?it/s]

Iteration:   0%|          | 0/983 [00:00<?, ?it/s]

Iteration:   0%|          | 0/983 [00:00<?, ?it/s]

Iteration:   0%|          | 0/983 [00:00<?, ?it/s]

In [178]:
from transformers import AutoModelForSequenceClassification

def cross_encoder_rank_miscons(row, misconceptions, model_name):
    #model_name = "cross-encoder/ms-marco-MiniLM-L-12-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    #model = ce_model#AutoModelForSequenceClassification.from_pretrained(model_name)

    scores = []
    inputs = []
    for misconception_idx in misconceptions:
        misconception_text = miscon_df['MisconceptionName'].tolist()[misconception_idx]
        query = f"[Question] {row['SubjectName'] + '. ' + row['ConstructName'] + '. ' + row['QuestionText']} [SEP] [Wrong Answer] {row['AnswerText']}"
        candidate = f"[Misconception] {misconception_text}"
        #inputs = tokenizer(query, return_tensors="pt")
        inputs.append([query,candidate])

    with torch.no_grad():
        scores = ce_model.predict(inputs)#(**inputs)
        #print(scores)
            #score = outputs.logits.item()
            #scores.append((misconception_idx, score))
    misconception_score_pairs = list(zip(misconceptions, scores))
    top_25_pairs = sorted(misconception_score_pairs, key=lambda x: x[1], reverse=True)[:25]
    top_25_misconceptions = [misconception for misconception, score in top_25_pairs]

    return top_25_misconceptions
    #ranked_misconceptions = sorted(scores, key=lambda x: x[1], reverse=True)
    #return [item[0] for item in ranked_misconceptions]


In [170]:
def evaluate_cross_encoder(model_name, sample_size):
  model_preds = []
  actual = []
  with torch.no_grad():
    misconception_embeddings = all_model.encode(misconceptions, convert_to_tensor=True, device='cuda')
  for i, row in test_df.iterrows():
    pred_list = bi_encoder_top_25_miscons(row, all_model, misconception_embeddings)
    #skip cross encoder if bi encoder res didn't even include correct answer to save run time
    if int(row['MisconceptionId']) in pred_list:
      cross_encoder_ranked = cross_encoder_rank_miscons(row, pred_list, model_name)
      #print(cross_encoder_ranked)
      #print(int(row['MisconceptionId']))
      model_preds.append(cross_encoder_ranked)
    else:
      #print('not in list')
      model_preds.append([-1])
    actual.append(int(row['MisconceptionId']))
  apk_score = mapk(actual, model_preds)
  print(f'mapk of mini_l6_v2 bi-encoder\'s top 25: {apk_score}')

In [175]:
evaluate_cross_encoder('cross-encoder/stsb-roberta-base', 10)

mapk of mini_l6_v2 bi-encoder's top 25: 0.23960732585578615


In [179]:
evaluate_cross_encoder('cross-encoder/stsb-roberta-base', 10)

mapk of mini_l6_v2 bi-encoder's top 25: 0.2754398450247511
