<a href="https://colab.research.google.com/github/csabi0312/DeepLProject/blob/main/embedding_approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Packages

In [None]:
!pip install faiss-gpu sentence-transformers
!pip install datasets



In [None]:
import random
import numpy as np
import pandas as pd
import faiss
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
import torch
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModel, AutoConfig
from keras.utils import to_categorical
from keras.models import Model, load_model
from keras.layers import Input, Flatten, Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard


# Seed every random number generator used by the libraries in this notebook.
# Seeding only the stdlib `random` module leaves NumPy, PyTorch and
# TensorFlow/Keras (weight initialisation, batch shuffling) unseeded.
seed_value = 42
random.seed(seed_value)          # Python standard library RNG
np.random.seed(seed_value)       # NumPy global RNG
torch.manual_seed(seed_value)    # PyTorch RNG
tf.random.set_seed(seed_value)   # TensorFlow / Keras global RNG

# Data manipulations

In [None]:
# Load the multiple-choice questions.
# NOTE (translated from the original Hungarian comment): this is not the
# dataset produced by our own code; it is only used to test whether the model works.
qna_df = pd.read_csv('https://raw.githubusercontent.com/emmermarcell/DeepLProject/main/train_with_context2.csv')

# Dictionary mapping the answer letters to class indices
mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}

# Encode the 'answer' column. `Series.map` turns every label that is missing
# from `mapping` into NaN, so unexpected labels are reported here instead of
# slipping through unchanged and failing later during one-hot encoding.
answer_idx = qna_df['answer'].map(mapping)
unknown_labels = qna_df.loc[answer_idx.isna(), 'answer'].unique()
if len(unknown_labels):
    raise ValueError(f"Unexpected answer labels: {unknown_labels.tolist()}")
qna_df['answer'] = answer_idx.astype('int64')
qna_df.head()

Unnamed: 0_level_0,prompt,A,B,C,D,E,answer
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,3
1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,0
2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,0
3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,2
4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,3


## Creating a context column from Wikipedia articles using the Faiss library

In [None]:
# Train/validation/test split with a 2:1:1 ratio:
# first halve the data, then halve the held-out part again.
# The shared `seed_value` is used instead of a second hard-coded 42 so that
# the split follows the notebook-wide seed.
train, temp = train_test_split(qna_df, test_size=0.5, random_state=seed_value)
val, test = train_test_split(temp, test_size=0.5, random_state=seed_value)

# Sanity check: the three parts must partition the full dataset
assert len(train) + len(val) + len(test) == len(qna_df)

print(len(train))
print(len(val))
print(len(test))

100
50
50


The following code snippet computes embeddings for the questions, their contexts and the possible answers using SciBERT.

In [None]:
# Name of the SciBERT checkpoint to load
model_name = "allenai/scibert_scivocab_uncased"

# Fetch the configuration, the tokenizer and the encoder for that checkpoint
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)

# Number of position embeddings of the model; used as the truncation length
# when tokenizing in `get_embedding`
max_sequence_length = config.max_position_embeddings
print(f"Maximum Sequence Length: {max_sequence_length}")


def get_embeddings(prompt, context, answers):
    """
    Embed one question together with its context and its candidate answers.

    Every text is passed to `get_embedding`; the resulting vectors are stacked
    in the order prompt, context, then the answers in the order given.
    With five answers and 768-dimensional embeddings the returned tensor has
    shape (7, 768) (1 prompt + 1 context + 5 answers).
    """
    # Order matters: row 0 is the prompt, row 1 the context, rows 2.. the answers
    texts = [prompt, context, *answers]
    return torch.stack([get_embedding(text) for text in texts])


def get_embedding(text):
    """
    Embed a text by mean-pooling the last hidden state of `bert_model`.

    The text is tokenized (truncated to `max_sequence_length` tokens) and
    passed through the model without gradient tracking. The last hidden state
    is then averaged over the token dimension. Note that this is mean pooling
    over all tokens, not a `[CLS]`-token embedding.

    :param text: the text to embed
    :return: tensor with the token-averaged last hidden state; for a single
        input text the leading batch dimension of size 1 is removed
    """
    # Tokenization (with truncation to the model's maximum input length)
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=max_sequence_length)
    with torch.no_grad():
        # Model inference only; no gradients are needed for feature extraction
        outputs = bert_model(**inputs)
    # Average over the token axis (dim=1), then drop only the batch axis.
    # `squeeze(0)` is used instead of a bare `squeeze()` so that no other
    # size-1 dimension can be removed by accident.
    return outputs.last_hidden_state.mean(dim=1).squeeze(0)

Maximum Sequence Length: 512


In [None]:
def convert_dataframe_to_model_input(df: pd.DataFrame):
    """
    Turn a question DataFrame into the arrays consumed by the ranking model.

    :param df: frame with the text columns 'prompt', 'context', 'A'..'E' and
        an 'answer' column holding the class index (0-4) of the correct answer
    :return: tuple `(X_np, y_cat)` where `X_np` is a NumPy array with one
        stack of embeddings per row of `df` (as returned by `get_embeddings`)
        and `y_cat` is the one-hot encoded target with 5 classes
    :raises ValueError: if `df` has no rows
    """
    if len(df) == 0:
        raise ValueError("Cannot build model input from an empty DataFrame")

    text_columns = ['prompt', 'context', 'A', 'B', 'C', 'D', 'E']

    # One stack of embeddings (prompt, context, 5 answers) per question
    per_question = [
        get_embeddings(prompt, context, answers)
        for prompt, context, *answers in df[text_columns].itertuples(index=False, name=None)
    ]

    # Stacking already yields (n_questions, n_texts, embedding_dim). No
    # hard-coded `view(-1, 7, 768)` is applied: it is a no-op for SciBERT and
    # would fail or silently scramble the data for another embedding width.
    X = torch.stack(per_question)

    # Convert PyTorch tensor to NumPy array
    X_np = X.cpu().numpy()

    # One-hot encode the target (5 answer options)
    y_cat = to_categorical(df['answer'], 5)

    return X_np, y_cat

In [None]:
# Embed each split (in the order train, validation, test) and unpack the
# resulting (features, one-hot targets) pairs
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    convert_dataframe_to_model_input(split) for split in (train, val, test)
)

In [None]:
# --- Network: flatten the (7, 768) embedding stack, one hidden layer, softmax over the 5 answers ---
input_layer = Input(shape=(7, 768))
flatten_layer = Flatten()(input_layer)
dense_1 = Dense(units=128, activation='relu')(flatten_layer)
output_layer = Dense(units=5, activation='softmax')(dense_1)

ranker_model = Model(inputs=input_layer, outputs=output_layer)
ranker_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# --- Callbacks ---
# File the best model (lowest validation loss) is written to
path_checkpoint = "scibert_weights.h5"

# Stop training once the validation loss has not improved for 20 epochs
es_callback = EarlyStopping(monitor="val_loss", min_delta=0, patience=20, verbose=1)

# Save the model only when the validation loss improves
modelckpt_callback = ModelCheckpoint(
    filepath=path_checkpoint,
    monitor="val_loss",
    save_best_only=True,
    verbose=1,
)

# Write TensorBoard logs (graph and per-epoch histograms) to ./logs
tensorboard_callback = TensorBoard(log_dir="./logs", histogram_freq=1, write_graph=True)

# --- Training ---
ranker_model.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    epochs=100,
    validation_data=(X_val, y_val),
    callbacks=[es_callback, modelckpt_callback, tensorboard_callback],
)

Epoch 1/100
1/7 [===>..........................] - ETA: 14s - loss: 1.7874 - accuracy: 0.1875
Epoch 1: val_loss improved from inf to 2.57483, saving model to scibert_weights.h5


  saving_api.save_model(


Epoch 2/100
1/7 [===>..........................] - ETA: 0s - loss: 1.7195 - accuracy: 0.4375
Epoch 2: val_loss improved from 2.57483 to 1.80949, saving model to scibert_weights.h5
Epoch 3/100
1/7 [===>..........................] - ETA: 0s - loss: 1.2686 - accuracy: 0.5000
Epoch 3: val_loss did not improve from 1.80949
Epoch 4/100
1/7 [===>..........................] - ETA: 0s - loss: 0.7456 - accuracy: 0.8125
Epoch 4: val_loss did not improve from 1.80949
Epoch 5/100
1/7 [===>..........................] - ETA: 0s - loss: 0.8233 - accuracy: 0.8125
Epoch 5: val_loss did not improve from 1.80949
Epoch 6/100
1/7 [===>..........................] - ETA: 0s - loss: 0.6599 - accuracy: 0.8750
Epoch 6: val_loss did not improve from 1.80949
Epoch 7/100
1/7 [===>..........................] - ETA: 0s - loss: 0.4078 - accuracy: 0.9375
Epoch 7: val_loss did not improve from 1.80949
Epoch 8/100
1/7 [===>..........................] - ETA: 0s - loss: 0.1959 - accuracy: 1.0000
Epoch 8: val_loss did not i

<keras.src.callbacks.History at 0x79639e9556f0>

Calculating the accuracy

In [None]:
# Load the best model saved by the checkpoint callback
ranker_model = load_model(path_checkpoint)

# Predict the class probabilities for the test set and compare the most
# probable class with the true class to obtain the accuracy
preds = ranker_model.predict(X_test)
test_accuracy = accuracy_score(np.argmax(y_test, axis=1), np.argmax(preds, axis=1))
# Backward-compatible alias: the value was previously stored under this
# name, although it is an accuracy and not an error rate
test_err = test_accuracy

print('Accuracy of the best model is ', test_accuracy)

Accuracy of the best model is  0.26


Calculating Mean Average Precision @ 3 (MAP@3), which is the metric used in the Kaggle competition

In [None]:
# Class indices of the three highest-scoring answers per question, best first:
# sort ascending, keep the last three columns, then reverse their order
top3_indices = np.flip(np.argsort(preds, axis=1)[:, -3:], axis=1)

# One-hot encoded targets -> class indices
y_true_indices = y_test.argmax(axis=1)

# Calculate Mean Average Precision @ 3 (MAP@3)
def map3(actual, predicted, k=3):
    """
    Average precision at `k` for a question with a single correct answer.

    With exactly one correct label, the average precision reduces to the
    reciprocal rank of the first correct prediction within the top `k`.

    :param actual: the correct label
    :param predicted: sequence of predicted labels, best guess first
    :param k: number of leading predictions taken into account
    :return: `1 / rank` of the first prediction equal to `actual` among the
        first `k` predictions (rank counted from 1), or 0.0 if there is none
    """
    # Only the first occurrence of the correct label can score, so the search
    # stops at the first hit; repeated predictions cannot add to the score.
    for rank, label in enumerate(predicted[:k], start=1):
        if label == actual:
            return 1.0 / rank

    return 0.0

# Score every test question against its top-3 predictions and average the results
map3_score = np.array([
    map3(true_label, ranked_labels, k=3)
    for true_label, ranked_labels in zip(y_true_indices, top3_indices)
]).mean()
print("MAP@3:", map3_score)

MAP@3: 0.43


# TODO
- Improve the neural network architecture (avoid overfitting)
- Improve the embedding extraction method
- Try PCA

# References

* https://huggingface.co/datasets/graelo/wikipedia/viewer/20230601.en

* https://www.kaggle.com/code/cdeotte/how-to-train-open-book-model-part-1/notebook

* https://www.kaggle.com/code/cdeotte/how-to-train-open-book-model-part-2/input

@inproceedings{beltagy-etal-2019-scibert,
    title = "SciBERT: A Pretrained Language Model for Scientific Text",
    author = "Beltagy, Iz  and Lo, Kyle  and Cohan, Arman",
    booktitle = "EMNLP",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/D19-1371"
}