# DTC Zoomcamp Q&A

## Objective

The primary goal is to develop a model that accurately matches a given question to its correct answer using the provided dataset.

In [1]:
# List the working directory to confirm the train/test CSV files are present
!ls

 attachments	      dtc-zoomcamp-q-a-bert.ipynb   train_answers.csv
'BertQ&A.ipynb'       test_answers.csv		    train_questions.csv
 bert-starter.ipynb   test_questions.csv


In [2]:
import pandas as pd

In [3]:
# Read the train/test question and answer tables from the working directory
df_trainQue, df_trainAns, df_testQue, df_testAns = (
    pd.read_csv(csv_path)
    for csv_path in (
        'train_questions.csv',
        'train_answers.csv',
        'test_questions.csv',
        'test_answers.csv',
    )
)

In [4]:
# Retrieve one training question together with the answer it is labelled with
question_index_to_lookup = 0

question_row = df_trainQue.iloc[question_index_to_lookup]
question_text = question_row['question']

# Look the answer up through the shared `answer_id` key rather than by row position:
# the questions and answers live in separate tables, so row i of one is not
# guaranteed to describe row i of the other.
matching_answers = df_trainAns.loc[
    df_trainAns['answer_id'] == question_row['answer_id'], 'answer'
]
# None signals that the labelled answer id is missing from the answers table
corresponding_answer = matching_answers.iloc[0] if not matching_answers.empty else None

print("Question:", question_text)
print("Answer:", corresponding_answer)

Question: For categorical target set, where the distribution is imbalanced (for example, 90/10) what approach should be used?
Answer: Alexey
Should we use something non-standard there or can we just go with the usual things we learned in the course?
Hamed
You just need to test different strategies. Something I noticed – if you have so many parse subclasses in your categorical [inaudible], you should be careful about using one-hot encoding. You might say you can use ordinal encoding, if your data in nature had some order. It will be useful. In my particular data, I couldn't have domain knowledge. I didn't know what the subclasses were, so I couldn't decide which strategy I should choose. But if you have the domain knowledge, that’s the key here, I think.


In [5]:
# Attach each training question to its labelled answer through the shared
# `answer_id` key; columns present in both tables receive the
# `_question` / `_answer` suffixes.
merged_df_train = df_trainQue.merge(
    df_trainAns,
    how='inner',
    on='answer_id',
    suffixes=('_question', '_answer'),
)

In [6]:
# Keep only the first occurrence of every fully identical row
merged_df_train = merged_df_train.loc[~merged_df_train.duplicated()]

In [7]:
# (rows, columns) of the merged training frame after de-duplication
merged_df_train.shape

(397, 10)

In [8]:
# Show the first five rows transposed, so each column of this wide frame reads as one line
merged_df_train.head().T

Unnamed: 0,0,1,2,3,4
question_id,79062,468946,968800,688404,63921
question,"For categorical target set, where the distribu...",Is there anything that we are not allowed to u...,I have been catching up and have been doing ho...,Could you please explain what code we should l...,Is it just me or does the model have really ba...
course_question,Machine Learning Zoomcamp,Machine Learning Zoomcamp,Data Engineering Zoomcamp,Data Engineering Zoomcamp,Machine Learning Zoomcamp
year_question,2021,2021,2022,2022,2021
candidate_answers,156400754877105368643810912439,641330634887912439425941642829,9540161678567591936798838013,1986616298986865773699141765,754877604487912439858915425941
answer_id,156400,634887,954016,3699,858915
answer,Alexey\nShould we use something non-standard t...,"No, I don't think there is anything you cannot...","Alexey\nYes, you will be. You can submit the p...",Alexey\nI think the question refers to the hom...,"Dmitry\nIt's fine, because this is the showcas..."
course_answer,Machine Learning Zoomcamp,Machine Learning Zoomcamp,Data Engineering Zoomcamp,Data Engineering Zoomcamp,Machine Learning Zoomcamp
year_answer,2021,2021,2022,2022,2021
attachments_files,,,,,


In [9]:
# Summary of the merged training frame: per-column non-null counts and dtypes
merged_df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 397 entries, 0 to 398
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   question_id        397 non-null    int64 
 1   question           397 non-null    object
 2   course_question    397 non-null    object
 3   year_question      397 non-null    int64 
 4   candidate_answers  397 non-null    object
 5   answer_id          397 non-null    int64 
 6   answer             397 non-null    object
 7   course_answer      397 non-null    object
 8   year_answer        397 non-null    int64 
 9   attachments_files  25 non-null     object
dtypes: int64(4), object(6)
memory usage: 34.1+ KB


In [10]:
# Load BERT tokenizer and model
from transformers import BertTokenizer, BertModel
import warnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning category
# (not only transformers' messages) stays hidden for the rest of the session.
warnings.filterwarnings("ignore")

# Tokenizer and model are loaded from the same 'bert-base-uncased' checkpoint name
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased') 

In [11]:
# Display the module tree of the loaded BERT model
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [12]:
import torch

def get_bert_embeddings(text):
    """
    Embed a piece of text as one fixed-size vector using BERT.

    The text is tokenized with the module-level ``tokenizer`` (with truncation
    enabled), passed through the module-level ``model`` without gradient
    tracking, and the final-layer token vectors are averaged into one vector.
    No classification head is involved: the result is a mean-pooled hidden
    state, not logits.

    Parameters
    ----------
    text : str
        A single string to embed.

    Returns
    -------
    numpy.ndarray
        1-D array holding the mean of ``last_hidden_state`` over the tokens of
        ``text`` (one value per hidden dimension of ``model``).
    """
    # Tokenize into PyTorch tensors; a single string yields a batch of one sequence
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

    # Inference only: skip autograd bookkeeping
    with torch.no_grad():
        outputs = model(**encoded)

    # [0] selects the only sequence in the batch; mean(dim=0) averages over its tokens
    return outputs['last_hidden_state'][0].mean(dim=0).cpu().numpy()

In [13]:
# Embed every question and every answer of the merged training frame with BERT
train_question_emb, train_answer_emb = (
    merged_df_train[text_column].apply(get_bert_embeddings)
    for text_column in ('question', 'answer')
)

In [14]:
# Standardize the training question and answer embeddings feature-by-feature
from sklearn.preprocessing import StandardScaler

import numpy as np

# The scaler's statistics are learned from the question embeddings only; the
# same fitted scaler is then applied to both the questions and the answers.
scaler = StandardScaler().fit(np.array(train_question_emb.tolist()))
train_question_emb_std, train_answer_emb_std = (
    scaler.transform(np.array(embeddings.tolist()))
    for embeddings in (train_question_emb, train_answer_emb)
)

In [15]:
# func to find the best answer for each question based on their similarity
from sklearn.metrics.pairwise import cosine_similarity

def get_predictions(df_questions, df_answers):
    """
    Pick, for every question, the candidate answer most similar to it.

    Each question and each of its candidate answers is embedded with
    ``get_bert_embeddings``, standardized with the module-level ``scaler``, and
    compared by cosine similarity; the highest-scoring candidate wins.

    Parameters
    ----------
    df_questions : pandas.DataFrame
        Must provide a ``question`` column (text) and a ``candidate_answers``
        column holding comma-separated answer ids as a string.
    df_answers : pandas.DataFrame
        Must provide ``answer_id`` and ``answer`` columns.

    Returns
    -------
    tuple[list, list]
        ``(predicted_answer_ids, predicted_answers)``, both in the row order of
        ``df_questions``.

    Raises
    ------
    ValueError
        If none of a question's candidate ids is present in ``df_answers``.
    """
    predicted_answer_ids = []
    predicted_answers = []

    # The same answer shows up in the candidate lists of several questions, so
    # embed each distinct answer text once per call instead of once per question.
    answer_embedding_cache = {}

    def _embed_answer(answer_text):
        """Return the cached BERT embedding for ``answer_text``, computing it on first use."""
        if answer_text not in answer_embedding_cache:
            answer_embedding_cache[answer_text] = get_bert_embeddings(answer_text)
        return answer_embedding_cache[answer_text]

    for index, row in df_questions.iterrows():
        question_text = row['question']
        # Skip blank fragments (e.g. from a trailing comma) before converting to int
        candidate_answer_ids = [
            int(answer_id)
            for answer_id in row['candidate_answers'].split(",")
            if answer_id.strip()
        ]

        # Rows of df_answers that are candidates for this question
        candidate_answers_df = df_answers[df_answers['answer_id'].isin(candidate_answer_ids)]
        if candidate_answers_df.empty:
            # Fail with context instead of an opaque array-shape error further down
            raise ValueError(
                f"No candidate answers found in df_answers for question at index {index!r}"
            )

        # Question embedding, standardized as a single-row matrix
        question_embedding = get_bert_embeddings(question_text)
        question_embedding_standardized = scaler.transform(question_embedding.reshape(1, -1))

        # Candidate embeddings, one row per candidate, in candidate_answers_df order
        candidate_answer_embeddings = np.array(
            [_embed_answer(answer_text) for answer_text in candidate_answers_df['answer']]
        )
        candidate_answer_embeddings_standardized = scaler.transform(candidate_answer_embeddings)

        # Similarity of the question to every candidate, flattened to 1-D
        similarities = cosine_similarity(
            question_embedding_standardized, candidate_answer_embeddings_standardized
        ).flatten()

        # Position (within candidate_answers_df) of the most similar candidate
        best_candidate = candidate_answers_df.iloc[similarities.argmax()]

        predicted_answer_ids.append(best_candidate['answer_id'])
        predicted_answers.append(best_candidate['answer'])

    return predicted_answer_ids, predicted_answers

In [16]:
# Start the training predictions table from the question columns needed for evaluation
train_predictions_df = df_trainQue[
    ['question_id', 'question', 'candidate_answers', 'answer_id']
].copy()

In [17]:
# Preview the first rows of the training predictions table before predictions are added
train_predictions_df.head()

Unnamed: 0,question_id,question,candidate_answers,answer_id
0,79062,"For categorical target set, where the distribu...",156400754877105368643810912439,156400
1,468946,Is there anything that we are not allowed to u...,641330634887912439425941642829,634887
2,968800,I have been catching up and have been doing ho...,9540161678567591936798838013,954016
3,688404,Could you please explain what code we should l...,1986616298986865773699141765,3699
4,63921,Is it just me or does the model have really ba...,754877604487912439858915425941,858915


In [18]:
# Predict an answer for every training question, then store ids and texts as new columns
train_pred_ids, train_pred_texts = get_predictions(df_trainQue, df_trainAns)
train_predictions_df['predicted_answer_id'] = train_pred_ids
train_predictions_df['predicted_answer'] = train_pred_texts

In [19]:
# Preview the table again, now including the predicted answer id and text
train_predictions_df.head()

Unnamed: 0,question_id,question,candidate_answers,answer_id,predicted_answer_id,predicted_answer
0,79062,"For categorical target set, where the distribu...",156400754877105368643810912439,156400,156400,Alexey\nShould we use something non-standard t...
1,468946,Is there anything that we are not allowed to u...,641330634887912439425941642829,634887,634887,"No, I don't think there is anything you cannot..."
2,968800,I have been catching up and have been doing ho...,9540161678567591936798838013,954016,954016,"Alexey\nYes, you will be. You can submit the p..."
3,688404,Could you please explain what code we should l...,1986616298986865773699141765,3699,3699,Alexey\nI think the question refers to the hom...
4,63921,Is it just me or does the model have really ba...,754877604487912439858915425941,858915,858915,"Dmitry\nIt's fine, because this is the showcas..."


In [20]:
# Training-set accuracy: share of questions whose predicted answer id equals the labelled one
from sklearn.metrics import accuracy_score

true_labels, predicted_labels = (
    df_trainQue['answer_id'],
    train_predictions_df['predicted_answer_id'],
)

# Compare labelled and predicted ids element by element
accuracy = accuracy_score(y_true=true_labels, y_pred=predicted_labels)

print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.7330


In [21]:
# Keep the first row for each question_id, then report (rows, columns)
test_questions_df = df_testQue.loc[~df_testQue.duplicated(subset='question_id')]
test_questions_df.shape

(514, 5)

In [22]:
# Build the test predictions table: one row per de-duplicated test question
test_predictions_df = test_questions_df[['question_id']].copy()

# Predict against the test answers and store ids and texts as new columns
test_pred_ids, test_pred_texts = get_predictions(test_questions_df, df_testAns)
test_predictions_df['predicted_answer_id'] = test_pred_ids
test_predictions_df['predicted_answer'] = test_pred_texts

In [29]:
# Write the submission file (question_id, predicted_answer_id) without the index column,
# then preview the predictions
test_predictions_df.to_csv(
    'BERTQnA_submission.csv',
    columns=['question_id', 'predicted_answer_id'],
    index=False,
)
test_predictions_df.head()

Unnamed: 0,question_id,predicted_answer_id,predicted_answer
0,707,767296,Alexey\nProbably more than you want to put in....
1,534450,231208,"Yes… and no? Sometimes, yeah. I wouldn't say o..."
2,996163,816559,Alexey\nYou can create a Python path variable ...
3,860215,988549,"Again, you’ll probably hate me soon for saying..."
4,980124,384381,Alexey\nThe first thing about the dataset – wh...


zsh:1: command not found: kaggle
