## BERT MODEL for Yahoo QA

In [1]:
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, pipeline, BertForQuestionAnswering
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the raw train and test splits of the Yahoo QA data from the local
# Dataset/ directory (paths are relative to the notebook's working directory).
df_train, df_test = pd.read_csv('Dataset/train.csv'), pd.read_csv('Dataset/test.csv')

In [3]:
# Peek at the first five training rows to check the column layout.
df_train.head(n=5)

Unnamed: 0,class_index,question_title,question_content,best_answer
0,5,why doesn't an optical mouse work on a glass t...,or even on some surfaces?,Optical mice use an LED and a camera to rapidl...
1,6,What is the best off-road motorcycle trail ?,long-distance trail throughout CA,i hear that the mojave road is amazing!<br />\...
2,3,What is Trans Fat? How to reduce that?,I heard that tras fat is bad for the body. Wh...,Trans fats occur in manufactured foods during ...
3,7,How many planes Fedex has?,I heard that it is the largest airline in the ...,according to the www.fedex.com web site:\nAir ...
4,7,"In the san francisco bay area, does it make se...",the prices of rent and the price of buying doe...,renting vs buying depends on your goals. <br /...


In [4]:
# Class names are stored one per line with no header row; re-index the table
# so it runs 1..N instead of 0..N-1.
df_classes = pd.read_csv("Dataset/classes.txt", names=["classes"], header=None)
df_classes.index = np.arange(len(df_classes)) + 1
df_classes

Unnamed: 0,classes
1,Society & Culture
2,Science & Mathematics
3,Health
4,Education & Reference
5,Computers & Internet
6,Sports
7,Business & Finance
8,Entertainment & Music
9,Family & Relationships
10,Politics & Government


In [5]:
# Report (rows, columns) for the train split, then for the test split.
print(df_train.shape, df_test.shape, sep="\n")

(1399999, 4)
(59999, 4)


In [6]:
# Count the missing entries in each column of the training split.
missing_values = df_train.isnull().sum()
missing_values

class_index              0
question_title           0
question_content    631688
best_answer          24596
dtype: int64

In [7]:
# Count the missing entries in each column of the test split.
missing_values = df_test.isnull().sum()
missing_values

class_index             0
question_title          0
question_content    27106
best_answer          1033
dtype: int64

In [8]:
# Drop every training row that has a missing value in any column, then
# renumber the index from 0 so it is contiguous again.
# NOTE(review): most of the missing values are in question_content, a column
# this notebook later removes from df_train; consider
# dropna(subset=[...]) if rows lacking only that column should be kept.
df_train = df_train.dropna().reset_index(drop=True)
df_train.shape

(753637, 4)

In [9]:
# Drop every test row that has a missing value in any column, then renumber
# the index from 0 so it is contiguous again.
df_test = df_test.dropna().reset_index(drop=True)
df_test.shape

(32265, 4)

In [10]:
# Rename the class_index column to label.
# Reassign the result instead of using inplace=True so the cell is a plain
# transformation of its input. Only df_train is renamed here; df_test keeps
# its class_index column.
df_train = df_train.rename(columns={'class_index': 'label'})

# # Renaming labels: replace each numeric label with its class name
# for idx,cls in enumerate(df_classes["classes"],start=1):
#   df_train.loc[df_train["label"] == idx, "label"] = cls

df_train.head()

Unnamed: 0,label,question_title,question_content,best_answer
0,5,why doesn't an optical mouse work on a glass t...,or even on some surfaces?,Optical mice use an LED and a camera to rapidl...
1,6,What is the best off-road motorcycle trail ?,long-distance trail throughout CA,i hear that the mojave road is amazing!<br />\...
2,3,What is Trans Fat? How to reduce that?,I heard that tras fat is bad for the body. Wh...,Trans fats occur in manufactured foods during ...
3,7,How many planes Fedex has?,I heard that it is the largest airline in the ...,according to the www.fedex.com web site:\nAir ...
4,7,"In the san francisco bay area, does it make se...",the prices of rent and the price of buying doe...,renting vs buying depends on your goals. <br /...


In [11]:
# Discard the question_content column from the training frame.
df_train = df_train.drop(columns=["question_content"])
df_train.head()

Unnamed: 0,label,question_title,best_answer
0,5,why doesn't an optical mouse work on a glass t...,Optical mice use an LED and a camera to rapidl...
1,6,What is the best off-road motorcycle trail ?,i hear that the mojave road is amazing!<br />\...
2,3,What is Trans Fat? How to reduce that?,Trans fats occur in manufactured foods during ...
3,7,How many planes Fedex has?,according to the www.fedex.com web site:\nAir ...
4,7,"In the san francisco bay area, does it make se...",renting vs buying depends on your goals. <br /...


In [12]:
# Hold out 20% of the training rows as a validation set (fixed random_state so
# the split is repeatable). No test split is created here; the test data is the
# separate df_test frame loaded from Dataset/test.csv.
# NOTE: df_train is overwritten with the reduced training portion, so
# re-running this cell splits the already-reduced frame again.
df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=42)

In [13]:
# Initialize the BERT tokenizer and question-answering model.
# Both are loaded from one checkpoint id held in a single constant, so the
# tokenizer and the model weights cannot accidentally drift apart.
MODEL_NAME = "deepset/bert-base-cased-squad2"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Wrap the loaded model and tokenizer in a question-answering pipeline; it is
# called below with a question and a context and returns a dict with the answer.
nlp = pipeline(
    task="question-answering",
    model=model,
    tokenizer=tokenizer,
)

In [30]:
# Run extractive QA on a single test example: the question title is the
# question and the best answer is the context the answer is extracted from.
sample_idx = 11
question = df_test['question_title'][sample_idx]
context = df_test['best_answer'][sample_idx]

# Call the pipeline once and reuse the result. The original cell ran the model
# twice on the same input and discarded the first result, because a bare
# expression that is not the last statement of a cell is never displayed.
result = nlp({
    'question': question,
    'context': context
})

print("Question: ", question)
print("Context: ", context)
print("Answer: ", result['answer'])


Question:  What color is water really?
Context:  The absorption minimum for water is at 420 nm, well within the blue range of the visual spectrum. This means that that is the wavelength of light that it reflects the most. It does not reflect it very much, so small amounts appear transparent.
Answer:  transparent.


### The example above uses the pretrained pipeline for simple extractive QA: it returns a short (often one-word) answer span taken from the given context

In [16]:
# create a dataframe with the first 500 rows of the training data

# df = pd.DataFrame(columns=['label','question_title', 'best_answer'])
# df['label'] = df_train['label'].head(500)
# df['question_title'] = df_train['question_title'].head(500)
# df['best_answer'] = df_train['best_answer'].head(500)


In [17]:
# df_train.head()

In [18]:
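# Tokenize and preprocess the dataset
# NOTE(review): two issues to resolve before re-enabling this draft:
#   - `pad_to_max_length=True` is deprecated in recent transformers releases; use `padding="max_length"` instead.
#   - start/end positions are hard-coded to 0 as placeholders, so training on this dataset would not teach real answer spans.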
# def tokenize_dataset(df):
#     input_ids = []
#     attention_masks = []
#     start_positions = []
#     end_positions = []

#     for index, row in df.iterrows():
#         question = row['question_title']
#         context = row['best_answer']

#          # Tokenize question and context
#         encoding = tokenizer.encode_plus(question, context, max_length=512, pad_to_max_length=True, return_tensors="pt", truncation=True)
#         input_ids.append(encoding["input_ids"].squeeze())  # Remove the extra dimension
#         attention_masks.append(encoding["attention_mask"].squeeze())  # Remove the extra dimension

#         # Placeholder for start and end positions
#         start_positions.append(0)
#         end_positions.append(0)

#     input_ids = torch.stack(input_ids)
#     attention_masks = torch.stack(attention_masks)
#     start_positions = torch.tensor(start_positions)
#     end_positions = torch.tensor(end_positions)

#     return TensorDataset(input_ids, attention_masks, start_positions, end_positions)


In [19]:
# import logging
# # logging.disable(logging.WARNING)
# # Create data loaders
# train_dataset = tokenize_dataset(df)
# train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [20]:
# train_dataset[0]

In [21]:
# for batch in train_dataloader:
#     print(batch[0].shape)
#     print(batch[1].shape)
#     print(batch[2].shape)
#     print(batch[3].shape)
#     break

In [22]:
# torch.cuda.device_count()

In [23]:
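# NOTE(review): start_positions/end_positions are passed to the model in the
# loop below, so `outputs.loss` should already hold the span loss; the manual
# CrossEntropyLoss computation duplicates it — confirm before re-enabling.
# # Define loss function (CrossEntropyLoss)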
# loss_fn = torch.nn.CrossEntropyLoss()

# # Define optimizer (AdamW)
# optimizer = AdamW(model.parameters(), lr=2e-5)

# # Training loop
# num_epochs = 5

# for epoch in range(num_epochs):
#     model.train()
#     total_loss = 0

#     for batch in train_dataloader:
#         input_ids_batch, attention_mask_batch, start_positions_batch, end_positions_batch = batch

#         optimizer.zero_grad()
        
#         # Pass the entire batch to the model
#         outputs = model(
#             input_ids=input_ids_batch,
#             attention_mask=attention_mask_batch,
#             start_positions=start_positions_batch,
#             end_positions=end_positions_batch
#         )
        
#         start_logits = outputs.start_logits
#         end_logits = outputs.end_logits

#         # Calculate the loss for both start and end positions
#         start_loss = loss_fn(start_logits, start_positions_batch)
#         end_loss = loss_fn(end_logits, end_positions_batch)
#         loss = start_loss + end_loss

#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()

#     avg_loss = total_loss / len(train_dataloader)
#     print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")



In [24]:
# # Input question
# question = "What is the capital of France?"

# # Tokenize the input
# inputs = tokenizer(question, return_tensors="pt")

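# # Perform inference to predict the answer span
# NOTE(review): `outputs` is never computed from `inputs` in this cell, so it
# would be whatever is left over from the training loop; a forward pass such as
# `outputs = model(**inputs)` is needed first. The tokenizer call also receives
# only the question, with no context passage to extract an answer from.
# start_logits = outputs.start_logits
# end_logits = outputs.end_logits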

# # Get the start and end indices with the highest logits as the answer
# start_index = torch.argmax(start_logits, dim=1).item()
# end_index = torch.argmax(end_logits, dim=1).item()

# # Retrieve the answer span from the question
# answer = tokenizer.decode(inputs["input_ids"][0][start_index:end_index+1])

# print("Predicted Answer:", answer)

In [25]:
# # Tokenize input texts as (question, answer) pairs, truncated to at most 128 tokens
# train_encodings = tokenizer(list(df_train['question_title']), list(df_train['best_answer']), truncation=True, padding=True, max_length=128)
# val_encodings = tokenizer(list(df_val['question_title']), list(df_val['best_answer']), truncation=True, padding=True, max_length=128)
# test_encodings = tokenizer(list(df_test['question_title']), list(df_test['best_answer']), truncation=True, padding=True, max_length=128)

In [26]:
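# # Create DataLoader for efficient batching
# NOTE(review): `label` holds the 1-based class indices (1..10); shift them to 0-based if they are fed to a classification loss — confirm.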
# train_dataset = TensorDataset(
#     torch.tensor(train_encodings['input_ids']),
#     torch.tensor(train_encodings['attention_mask']),
#     torch.tensor(df_train['label'].tolist())
# )
# train_sampler = RandomSampler(train_dataset)
# train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=16)