<a href="https://colab.research.google.com/github/csabi0312/DeepLProject/blob/main/embedding_approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Packages

In [None]:
import random

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import set_random_seed
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel


# Setting the random seeds.
# Every library that draws random numbers in this notebook keeps its own
# generator, so each one is seeded explicitly from the same value.
seed_value = 42
random.seed(seed_value)        # Python's built-in generator
np.random.seed(seed_value)     # NumPy's global generator
torch.manual_seed(seed_value)  # PyTorch
set_random_seed(seed_value)    # Keras / TensorFlow (weight init, batch shuffling)

# Data manipulation

In [None]:
# Load the questions; the first CSV column becomes the row index
data = pd.read_csv("https://raw.githubusercontent.com/csabi0312/DeepLProject/main/train.csv",index_col=0)

# Dictionary that maps each answer letter to an integer class id
mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}

# Translate the 'answer' column with Series.map: unlike replace(), a label that
# is missing from the mapping becomes NaN instead of silently staying a letter,
# so it can be detected and reported right here.
answer_ids = data['answer'].map(mapping)
if answer_ids.isna().any():
    unexpected = sorted(data.loc[answer_ids.isna(), 'answer'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'answer' column: {unexpected}")
data['answer'] = answer_ids.astype(int)
data.head()

Unnamed: 0_level_0,prompt,A,B,C,D,E,answer
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,3
1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,0
2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,0
3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,2
4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,3


In [None]:
# Train-Val-Test split:
# Split the DataFrame into training, validation, and test datasets with a 2:1:1 ratio.
# Half of the rows go to training; the remaining half is split evenly again.
# Both calls reuse seed_value so the partition follows the notebook-wide seed.
train, temp = train_test_split(data, test_size=0.5, random_state=seed_value)
val, test = train_test_split(temp, test_size=0.5, random_state=seed_value)

# Sanity check: the three splits together account for every row
assert len(train) + len(val) + len(test) == len(data)

print(len(train))
print(len(val))
print(len(test))

100
50
50


The following code obtains embeddings for each question and its candidate answers using SciBERT. Every text is embedded separately as the mean of the model's last hidden state over its tokens.

In [None]:
# The SciBERT tokenizer and encoder are both loaded from the same checkpoint
scibert_checkpoint = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(scibert_checkpoint)
model = AutoModel.from_pretrained(scibert_checkpoint)

def get_embeddings(prompt, answers):
    """Embed a question together with its candidate answers.

    Each text is passed to ``get_embedding`` on its own: the prompt first,
    then every entry of ``answers`` in the order given.

    Args:
        prompt: The question text.
        answers: Iterable with the candidate answer texts.

    Returns:
        The individual embeddings combined with ``torch.stack``; the prompt
        embedding sits at index 0, followed by one entry per answer.
    """
    texts = [prompt, *answers]
    return torch.stack([get_embedding(text) for text in texts])

def get_embedding(text):
    """Return one embedding for ``text`` using the module-level SciBERT model.

    The text is tokenized with the global ``tokenizer``, run through the
    global ``model`` with gradient tracking disabled, and the last hidden
    state is averaged over the token dimension (``dim=1``).

    Args:
        text: The text to embed.

    Returns:
        The mean-pooled last hidden state with size-1 dimensions squeezed out.
    """
    # Cut overly long texts at the encoder's position limit; without this,
    # a text that tokenizes to more positions than the model supports fails
    # inside the forward pass. The limit is passed explicitly rather than
    # relying on the tokenizer knowing the model's maximum length.
    inputs = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze()

In [None]:
# Iterate over a copy of the training split
df = train.copy()

# One stacked embedding tensor per question: the prompt followed by answers A-E
answer_columns = ['A', 'B', 'C', 'D', 'E']
X = [
    get_embeddings(row['prompt'], [row[column] for column in answer_columns])
    for _, row in df.iterrows()
]

# Combine the per-question tensors and pin the layout to (questions, 6, 768)
X = torch.stack(X).view(-1, 6, 768)

# Keras is fed NumPy arrays, so convert the PyTorch tensor
X_np = X.cpu().numpy()

# One-hot encode the integer answer labels into 5 classes
y = train['answer']
y_train = to_categorical(y, 5)

In [None]:
# Define a small feed-forward classifier with Keras.
# Each sample is a (6, 768) block of embeddings (prompt + 5 answers) that is
# flattened into one vector before the dense layers.
input_layer = Input(shape=(6, 768))
flatten_layer = Flatten()(input_layer)
dense_1 = Dense(128, activation='relu')(flatten_layer)
# Softmax over the 5 answer options
output_layer = Dense(5, activation='softmax')(dense_1)

# Create the model
ranker_model = Model(inputs=input_layer, outputs=output_layer)
# Categorical cross-entropy matches the one-hot labels; accuracy is tracked as
# well so every epoch of the training log reports it next to the loss.
ranker_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
ranker_model.fit(X_np, y_train, epochs=20, batch_size=16)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x1ff28517090>

In [None]:
# Score every question, then keep the three highest-scoring answer indices
# per row, best one first
predictions = ranker_model.predict(X_np)
ranked_ascending = np.argsort(predictions, axis=1)
top3_indices = ranked_ascending[:, -3:][:, ::-1]

# Recover the integer class ids from the one-hot labels
y_true_indices = np.argmax(y_train, axis=1)



Calculating the accuracy on the training set (number of correct and wrong top-1 predictions)

In [None]:
# True labels and the model's most probable class for every question
real = list(y)
pred = ranker_model.predict(X_np)
l = [np.argmax(p) for p in pred]

# Count the positions where prediction and label agree; the rest are misses
correct = sum(1 for guess, truth in zip(l, real) if guess == truth)
wrong = len(l) - correct
print(correct, wrong)

100 0


Calculating the Mean Average Precision @ 3 (MAP@3) on the training set; this is the metric used in the Kaggle competition

In [None]:
def map3(actual, predicted, k=3):
    """Average precision at ``k`` for one question with a single correct answer.

    Only the first ``k`` entries of ``predicted`` are considered. Because
    there is exactly one relevant label, the score is the reciprocal of the
    1-based position at which ``actual`` first appears; repeated occurrences
    further down do not add anything.

    Args:
        actual: The correct label.
        predicted: Ranked sequence of predicted labels, best guess first.
        k: Number of leading predictions to evaluate.

    Returns:
        ``1.0 / position`` of the first hit within the top ``k``, or ``0.0``
        when ``actual`` is not among them.
    """
    for position, guess in enumerate(predicted[:k], start=1):
        if guess == actual:
            return 1.0 / position
    return 0.0

# Score each question separately, then average the per-question scores
per_question_scores = [
    map3(truth, ranked, k=3)
    for truth, ranked in zip(y_true_indices, top3_indices)
]
map3_score = np.mean(per_question_scores)
print("MAP@3:", map3_score)

MAP@3: 1.0


Evaluating on the test set (Accuracy and MAP@3)

In [None]:
# Iterate over a copy of the test split
df = test.copy()

# Embed every test question together with its answers A-E
X = []
for _, row in df.iterrows():
    options = [row[letter] for letter in ('A', 'B', 'C', 'D', 'E')]
    X.append(get_embeddings(row['prompt'], options))

# Stack to a single tensor, pin the layout to (questions, 6, 768), go to NumPy
X = torch.stack(X).view(-1, 6, 768)
X_np = X.cpu().numpy()

# One-hot encode the integer test labels into 5 classes
y = test['answer']
y_test = to_categorical(y, 5)

In [None]:
# True test labels and the model's most probable class for every question
real = list(y)
pred = ranker_model.predict(X_np)
l = [np.argmax(p) for p in pred]

# Count the positions where prediction and label agree; the rest are misses
correct = sum(1 for guess, truth in zip(l, real) if guess == truth)
wrong = len(l) - correct
print(correct, wrong)

15 35


In [None]:
# Three highest-scoring answer indices per test question, best one first
predictions = ranker_model.predict(X_np)
ranked_ascending = np.argsort(predictions, axis=1)
top3_indices = ranked_ascending[:, -3:][:, ::-1]

# Recover the integer class ids from the one-hot test labels
y_true_indices = np.argmax(y_test, axis=1)

# Score each question separately, then average the per-question scores
per_question_scores = [
    map3(truth, ranked, k=3)
    for truth, ranked in zip(y_true_indices, top3_indices)
]
map3_score = np.mean(per_question_scores)
print("MAP@3:", map3_score)

MAP@3: 0.4233333333333333


# TODO
- Improve the neural network architecture (avoid overfitting)
- Improve the embedding extraction method
- Try PCA