In [10]:
!pip install datasets
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.0-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.7/268.7 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 3.2.1
    Uninstalling sentence-transformers-3.2.1:
      Successfully uninstalled sentence-transformers-3.2.1
Successfully installed sentence-transformers-3.3.0


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#read data
import pandas as pd
import numpy as np
miscon_df = pd.read_csv('misconception_mapping.csv')
train_df = pd.read_csv('train_data.csv')
train_df = train_df.sort_values(by='QuestionId')
test_df = pd.read_csv('test_data.csv')
test_df = test_df.sort_values(by='QuestionId')

In [3]:
#process data to obtain question-answer pairs
def process_df(data_df):
  df = pd.DataFrame()

  misconception_map = pd.Series(miscon_df.MisconceptionName.values, index=miscon_df.MisconceptionId).to_dict()
  rows = []
  for _, row in data_df.iterrows():
      incorrect_answers = {
          'A': (row['AnswerAText'], row['MisconceptionAId']),
          'B': (row['AnswerBText'], row['MisconceptionBId']),
          'C': (row['AnswerCText'], row['MisconceptionCId']),
          'D': (row['AnswerDText'], row['MisconceptionDId'])
      }
      for answer_key in ['A', 'B', 'C', 'D']:
          if answer_key == row['CorrectAnswer']:
              continue

          answer_text, misconception_id = incorrect_answers[answer_key]

          misconception_name = misconception_map.get(misconception_id, "Unknown")

          if not misconception_name == "Unknown":
            rows.append({
                'QuestionId': row['QuestionId'],
                'SubjectName': row['SubjectName'],
                'ConstructName': row['ConstructName'],
                'QuestionText': row['QuestionText'],
                'AnswerText': answer_text,
                'MisconceptionId': misconception_id,
                'MisconceptionName': misconception_name
            })

  df = pd.DataFrame(rows)

  return df

In [4]:
train_df = process_df(train_df)
test_df = process_df(test_df)


In [17]:
print(len(train_df))

3932


In [31]:
#prepare data
from sentence_transformers import InputExample
import pandas as pd


# Create training examples for bi-encoder
train_examples = []
for _, row in train_df.iterrows():
    # Query text: Concatenate question and answer text
    query_text = f"{row['SubjectName']}. {row['ConstructName']}. {row['QuestionText']} [SEP] {row['AnswerText']}"
    positive_example = row['MisconceptionName']

    # Create positive pair
    train_examples.append(InputExample(texts=[query_text, positive_example], label=1.0))

    # Create negative samples by choosing a random misconception that does not match the question
    negative_samples = train_df[train_df['MisconceptionName'] != row['MisconceptionName']].sample(1)
    for _, neg_row in negative_samples.iterrows():
        negative_example = neg_row['MisconceptionName']
        train_examples.append(InputExample(texts=[query_text, negative_example], label=0.0))


In [32]:
#configure model
from sentence_transformers import SentenceTransformer, losses, util
from torch.utils.data import DataLoader
from datasets import Dataset
import logging

# Load the pre-trained bi-encoder model
fine_tune_mini_l6_v2_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Define a DataLoader for training
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# Define the training loss (Contrastive loss works well for retrieval)
train_loss = losses.CosineSimilarityLoss(fine_tune_mini_l6_v2_model)

logging.basicConfig(level=logging.INFO)

In [33]:
#train model
fine_tune_mini_l6_v2_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    warmup_steps=int(len(train_dataloader) * 0.1),
    show_progress_bar=True
)

Step,Training Loss


KeyboardInterrupt: 

In [29]:
fine_tune_mini_l6_v2_model.save('fine_tune_mini_l6_v5_model_30')

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [30]:
import shutil
shutil.make_archive('mini_l6_v5', 'zip', 'fine_tune_mini_l6_v5_model_30')


'/content/mini_l6_v5.zip'

In [27]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = fine_tune_mini_l6_v2_model

def get_miscon_embeddings():
    misconceptions = miscon_df['MisconceptionName'].tolist()

    with torch.no_grad():
        #misconception_embeddings = []
        #for misconception in misconceptions:
            #inputs = tokenizer(misconception, return_tensors="pt", padding=True)
            #embedding = model(**inputs).last_hidden_state.mean(dim=1)  # Mean pooling
            #misconception_embeddings.append(embedding.squeeze().numpy())
        misconception_embeddings = model.encode(misconceptions, convert_to_tensor=True, device='cuda')


    return misconception_embeddings

def bi_encoder_top_25_miscons(row, model, misconception_embeddings):
    query = row['SubjectName'] + '. ' + row['ConstructName'] + '. The question is ' + row['QuestionText'] + ' The student thinks the answer is ' + row['AnswerText']
    with torch.no_grad():
        query_embedding = model.encode(query, convert_to_tensor=True, device='cuda')
    misconception_embeddings = misconception_embeddings.to('cuda')
    #calculate similarities and retrieve top-k misconceptions
    similarities = torch.nn.functional.cosine_similarity(query_embedding, misconception_embeddings)
    top_k_indices = torch.topk(similarities, k=25, largest=True).indices.cpu().numpy()

    actual = row['MisconceptionName']

    return top_k_indices

def apk(actual, predicted, k=25):
    if not actual:
        return 0.0

    actual = [actual]
    #comment below line if predicted is already a list
    #predicted = list(map(int, predicted.split()))

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    print(score / min(len(actual), k))
    return score / min(len(actual), k)

def mapk(actual, predicted, k=25):
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

def evaluate_bi_encoder(model, miscon_embeddings):
  contains_count = 0
  mini_l6_v2_preds = []
  actual = []
  for i, row in test_df.iterrows():
    pred_list = bi_encoder_top_25_miscons(row, model, miscon_embeddings)
    mini_l6_v2_preds.append(pred_list)
    actual.append(row['MisconceptionId'])
    if int(row['MisconceptionId']) in pred_list:
      contains_count += 1
  contains_ratio = contains_count / len(test_df)
  print(f'ratio of mini_l6_v2 bi-encoder\'s top 25 containing correct misconception: {contains_ratio}')
  apk_score = mapk(actual, mini_l6_v2_preds)
  print(f'mapk of mini_l6_v2 bi-encoder\'s top 25: {apk_score}')

In [28]:
miscon_embeddings = get_miscon_embeddings()

evaluate_bi_encoder(model, miscon_embeddings)

ratio of mini_l6_v2 bi-encoder's top 25 containing correct misconception: 0.636986301369863
0.05263157894736842
0.16666666666666666
0.05263157894736842
0.09090909090909091
0.07142857142857142
0.0
0.07142857142857142
0.0
0.0
0.0
0.3333333333333333
1.0
0.125
0.3333333333333333
0.2
0.125
0.5
0.08333333333333333
0.0
0.07142857142857142
0.0
0.0
1.0
0.0
0.06666666666666667
0.3333333333333333
0.3333333333333333
0.3333333333333333
1.0
1.0
0.0
0.0
0.0
0.041666666666666664
0.5
0.0
0.058823529411764705
0.045454545454545456
0.08333333333333333
0.045454545454545456
0.0
0.07692307692307693
0.0
1.0
0.5
0.14285714285714285
0.125
0.16666666666666666
0.3333333333333333
0.3333333333333333
0.0
0.0
0.0
0.5
0.5
0.0
0.25
0.2
0.16666666666666666
0.1111111111111111
0.3333333333333333
0.16666666666666666
0.07692307692307693
0.16666666666666666
0.05
0.0
0.043478260869565216
0.3333333333333333
0.0
0.0
0.0
0.0
0.25
0.0
1.0
0.0
0.08333333333333333
0.5
1.0
1.0
1.0
0.3333333333333333
0.08333333333333333
0.33333333333

Use GPU

In [7]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [5]:
from datasets import Dataset


In [6]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import torch
import pandas as pd

# Check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Create training examples for bi-encoder
train_examples = []
for _, row in train_df.iterrows():
    # Query text: Concatenate question and answer text
    query_text = f"{row['SubjectName']}. {row['ConstructName']}. {row['QuestionText']} [SEP] {row['AnswerText']}"
    positive_example = row['MisconceptionName']

    # Create positive pair
    train_examples.append(InputExample(texts=[query_text, positive_example], label=1.0))

    # Create negative samples by choosing a random misconception that does not match the question
    negative_samples = train_df[train_df['MisconceptionName'] != row['MisconceptionName']].sample(1)
    for _, neg_row in negative_samples.iterrows():
        negative_example = neg_row['MisconceptionName']
        train_examples.append(InputExample(texts=[query_text, negative_example], label=0.0))




Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
500,0.0974
1000,0.0667


In [26]:
# Load the pre-trained bi-encoder model and move it to the GPU
fine_tune_mini_l6_v2_model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L6-cos-v5')
fine_tune_mini_l6_v2_model.to(device)

# Define a DataLoader for training
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# Define the training loss (Contrastive loss works well for retrieval)
train_loss = losses.CosineSimilarityLoss(fine_tune_mini_l6_v2_model)

# Fine-tune the model on the GPU
fine_tune_mini_l6_v2_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=30,
    warmup_steps=int(len(train_dataloader) * 0.1),
    show_progress_bar=True
)

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Step,Training Loss
500,0.1258
1000,0.0812
1500,0.0646
2000,0.0549
2500,0.0477
3000,0.0419
3500,0.0378
4000,0.035
4500,0.032
5000,0.0298


In [18]:
fine_tune_mini_l6_v2_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=10,
    warmup_steps=int(len(train_dataloader) * 0.1),
    show_progress_bar=True
)

Step,Training Loss
500,0.0209
1000,0.0187
1500,0.0174
2000,0.0157
2500,0.0143
3000,0.0133
3500,0.0126
4000,0.0119
4500,0.011


In [22]:
fine_tune_mini_l6_v2_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=10,
    warmup_steps=int(len(train_dataloader) * 0.1),
    show_progress_bar=True,
    optimizer_params={'lr': 1e-5}
)


Step,Training Loss
500,0.0123
1000,0.0115
1500,0.0113
2000,0.0105
2500,0.0099
3000,0.0097
3500,0.0094
4000,0.0089
4500,0.0086
