In [1]:
import pandas as pd
# Load the train/test question and answer tables; each CSV is read into its
# own pandas DataFrame, in the order the names appear on the left-hand side.
df_trainQue, df_trainAns, df_testQue, df_testAns = (
    pd.read_csv(csv_path)
    for csv_path in (
        'train_questions.csv',
        'train_answers.csv',
        'test_questions.csv',
        'test_answers.csv',
    )
)

In [2]:
# Inner-join the training questions to their answers on `answer_id`.
# Columns that exist in both tables (other than the join key) receive the
# `_question` / `_answer` suffixes so they stay distinguishable.
merged_df_train = df_trainQue.merge(
    df_trainAns,
    how='inner',
    on='answer_id',
    suffixes=('_question', '_answer'),
)

In [3]:
# Drop rows that are duplicated across every column, keeping the first occurrence.
merged_df_train = merged_df_train.drop_duplicates(subset=None, keep='first')

In [8]:
# Number of distinct question texts in the merged training frame.
merged_df_train['question'].nunique()

396

In [10]:
# Print column names, non-null counts and dtypes of the merged training frame.
merged_df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 397 entries, 0 to 398
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   question_id        397 non-null    int64 
 1   question           397 non-null    object
 2   course_question    397 non-null    object
 3   year_question      397 non-null    int64 
 4   candidate_answers  397 non-null    object
 5   answer_id          397 non-null    int64 
 6   answer             397 non-null    object
 7   course_answer      397 non-null    object
 8   year_answer        397 non-null    int64 
 9   attachments_files  25 non-null     object
dtypes: int64(4), object(6)
memory usage: 34.1+ KB


In [13]:
from sentence_transformers import SentenceTransformer,util
from tqdm.auto import tqdm

# Load the 'all-MiniLM-L6-v2' sentence-transformers model; the cells below use
# it to embed both the question texts and the answer texts.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

In [14]:
# Embed every question and every answer of the merged training frame.
# The loops below rely on embedding i belonging to row i of merged_df_train.
q_emb = model.encode(merged_df_train["question"].to_numpy(), show_progress_bar=True)
ans_emb = model.encode(merged_df_train["answer"].to_numpy(), show_progress_bar=True)

# Lookup table: answer_id (as a string) -> embedding of that answer.
# If an answer_id occurs on several rows, the last row's embedding wins.
train_ans_dict = {
    f"{answer_id}": embedding
    for answer_id, embedding in tqdm(
        zip(merged_df_train["answer_id"], ans_emb),
        total=len(merged_df_train),
    )
}

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/397 [00:00<?, ?it/s]

In [17]:
import numpy as np
from sklearn.metrics import accuracy_score

# Parse the comma-separated candidate ids into lists of id strings.
# Values that are already lists are passed through unchanged, so re-running
# this cell is safe (a second `.str.split(",")` would turn every list into NaN).
# Tokens are stripped so "1, 2" and "1,2" resolve to the same dictionary keys.
merged_df_train["candidate_answers"] = merged_df_train["candidate_answers"].map(
    lambda ids: [tok.strip() for tok in ids.split(",")] if isinstance(ids, str) else ids
)

# For each question, pick the candidate whose answer embedding has the highest
# cosine similarity to the question embedding. q_emb[i] belongs to row i of
# merged_df_train, so the two are walked in lockstep.
preds = []
for question_vec, candidates in tqdm(
    zip(q_emb, merged_df_train["candidate_answers"]),
    total=len(merged_df_train),
):
    # One similarity per candidate, in candidate order.
    sim = [
        util.cos_sim(question_vec, train_ans_dict[f"{ca}"]).item()
        for ca in candidates
    ]
    # np.argmax returns the first maximum, so ties go to the earliest candidate.
    aidx = int(np.argmax(np.array(sim)))
    preds.append(candidates[aidx])

  0%|          | 0/397 [00:00<?, ?it/s]

In [18]:
# Fraction of training questions whose top-ranked candidate equals the true
# answer_id. Predictions are id strings, so they are cast to int first.
preds = np.asarray(preds)
accuracy = accuracy_score(
    y_true=merged_df_train["answer_id"].to_numpy(),
    y_pred=preds.astype(int),
)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.9295


In [19]:
# Display the model's repr (its module pipeline and their settings).
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

# Re-read the two test tables from disk, so the test pipeline below starts
# from the raw CSV contents.
df_testQue, df_testAns = (
    pd.read_csv(csv_path)
    for csv_path in ('test_questions.csv', 'test_answers.csv')
)

In [22]:
import re
def clean_text(text):
    """Return `text` lower-cased with punctuation removed.

    Every character that is neither a word character (letter, digit or
    underscore) nor whitespace is deleted; whitespace is left untouched.
    """
    return re.sub(r'[^\w\s]', '', text).lower()

In [23]:
# Keep one row per question_id. `.copy()` makes the result an independent
# frame, so the column assignments below (and in later cells) write to it
# directly instead of triggering pandas' SettingWithCopyWarning.
df_testQue = df_testQue.drop_duplicates(subset='question_id').copy()

# Normalised text (lower-cased, punctuation removed) used for the test embeddings.
df_testQue['cleaned_question'] = df_testQue['question'].apply(clean_text)
df_testAns['cleaned_answer'] = df_testAns['answer'].apply(clean_text)

In [24]:
# Parse the comma-separated candidate ids into lists of id strings.
# Values that are already lists are passed through unchanged, so re-running
# this cell is safe (a second `.str.split(",")` would turn every list into NaN).
df_testQue["candidate_answers"] = df_testQue["candidate_answers"].map(
    lambda ids: [tok.strip() for tok in ids.split(",")] if isinstance(ids, str) else ids
)

# Embed the cleaned test questions and answers. This rebinds `q_emb` and
# `ans_emb`, replacing the training embeddings held under the same names.
# NOTE(review): the training-set evaluation encoded the raw `question` /
# `answer` text, whereas this cell encodes the cleaned text, so the training
# accuracy was measured under different preprocessing. Confirm which is intended.
q_emb = model.encode(df_testQue["cleaned_question"].values, show_progress_bar=True)
ans_emb = model.encode(df_testAns["cleaned_answer"].values, show_progress_bar=True)

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

In [28]:
# Print column names, non-null counts and dtypes of the test answers frame.
df_testAns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516 entries, 0 to 515
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   answer_id          516 non-null    int64 
 1   answer             516 non-null    object
 2   course             516 non-null    object
 3   year               516 non-null    int64 
 4   attachments_files  8 non-null      object
 5   cleaned_answer     516 non-null    object
dtypes: int64(2), object(4)
memory usage: 24.3+ KB


In [34]:
from sklearn.metrics.pairwise import cosine_similarity

# Lookup table: answer_id (as a string) -> embedding of that test answer.
test_ans_dict = {
    f"{answer_id}": embedding
    for answer_id, embedding in tqdm(
        zip(df_testAns["answer_id"], ans_emb),
        total=len(df_testAns),
    )
}

# For every test question keep the candidate id whose answer embedding is the
# most cosine-similar to the question embedding (first candidate wins ties).
preds = []
for question_vec, candidates in tqdm(
    zip(q_emb, df_testQue["candidate_answers"]),
    total=len(df_testQue),
):
    scores = np.array(
        [util.cos_sim(question_vec, test_ans_dict[f"{cid}"]).item() for cid in candidates]
    )
    preds.append(candidates[np.argmax(scores)])

  0%|          | 0/516 [00:00<?, ?it/s]

  0%|          | 0/514 [00:00<?, ?it/s]

In [39]:
# Attach the predictions as a column, positionally matched to the rows of
# df_testQue, then print the frame summary.
df_testQue["predicted_answer_id"] = pd.Series(preds, index=df_testQue.index)
df_testQue.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 514 entries, 0 to 515
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   question_id          514 non-null    int64 
 1   question             514 non-null    object
 2   course               514 non-null    object
 3   year                 514 non-null    int64 
 4   candidate_answers    514 non-null    object
 5   cleaned_question     514 non-null    object
 6   predicted_answer_id  514 non-null    object
dtypes: int64(2), object(5)
memory usage: 32.1+ KB


In [46]:
# NOTE: this is a sanity check, not an accuracy measurement. Each prediction
# was selected from the row's own candidate list, so the membership test is
# true by construction and the mean is always 1.0. df_testQue carries no
# ground-truth answer column, so real test accuracy cannot be computed here.
correct_predictions = df_testQue.apply(
    lambda row: row["predicted_answer_id"] in row["candidate_answers"],
    axis=1,
)
# Variable names are kept unchanged for any later cell that reads them.
accuracy_test = correct_predictions.mean()

print(f'Share of predictions found in their candidate list (sanity check, not accuracy): {accuracy_test:.4f}')

Test Accuracy: 1.0000


In [50]:
# (rows, columns) of the test questions frame.
df_testQue.shape

(514, 7)

In [51]:
# Write the (question_id, predicted_answer_id) pairs to CSV, omitting the index.
submission_columns = ["question_id", "predicted_answer_id"]
df_testQue.loc[:, submission_columns].to_csv("SentenceTransformer.csv", index=False)