# Question-Answer: Transformers

In [2]:
import numpy as np
import pandas as pd
import os
import re
import string

In [3]:
# Read data

df1 = pd.read_csv(r'/kaggle/input/questionanswer-dataset/S08_question_answer_pairs.txt',encoding='latin-1',sep='\t')
df2 = pd.read_csv(r'/kaggle/input/questionanswer-dataset/S09_question_answer_pairs.txt',encoding='latin-1',sep='\t')
df3 = pd.read_csv(r'/kaggle/input/questionanswer-dataset/S10_question_answer_pairs.txt',encoding='latin-1',sep='\t')
df = pd.concat([df1,df2,df3],ignore_index=True)
df.head()

Unnamed: 0,ï»¿ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile,ArticleTitle
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,S08_set3_a4,
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.,easy,easy,S08_set3_a4,
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,S08_set3_a4,
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.,easy,easy,S08_set3_a4,
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,S08_set3_a4,


In [4]:
print("Number of records: ",len(df))

Number of records:  3998


In [5]:
# Processing Dataframe to keep relevant columns only

df = df[['Question','Answer','ArticleFile']]
df.head()

Unnamed: 0,Question,Answer,ArticleFile
0,Was Abraham Lincoln the sixteenth President of...,yes,S08_set3_a4
1,Was Abraham Lincoln the sixteenth President of...,Yes.,S08_set3_a4
2,Did Lincoln sign the National Banking Act of 1...,yes,S08_set3_a4
3,Did Lincoln sign the National Banking Act of 1...,Yes.,S08_set3_a4
4,Did his mother die of pneumonia?,no,S08_set3_a4


In [6]:
# Check duplicate questions and remove them

print(f"Number of duplicate questions: {len(df) - df['Question'].nunique()}")

df.drop_duplicates(subset=['Question'],inplace=True)
print("\nNumber of records after removing duplicates: ",len(df))

Number of duplicate questions: 1542

Number of records after removing duplicates:  2457


In [7]:
# Check null values and remove them
df.isnull().sum()

Question         1
Answer         272
ArticleFile      2
dtype: int64

In [8]:
df.dropna(subset=['Question'],inplace=True)
df.dropna(subset=['Answer'],inplace=True)
df.dropna(subset=['ArticleFile'],inplace=True)
df.reset_index(drop=True,inplace=True)
print("Number of records after removing blanks: ",len(df))

Number of records after removing blanks:  2183


In [9]:
# Format the answer column

def format_column(text):
    text = text.lower()
    text = text.translate(str.maketrans('','',string.punctuation)) # remove punctuations
    return text

df['Answer'] = df.loc[:,'Answer'].apply(lambda x:format_column(x))
df.head()

Unnamed: 0,Question,Answer,ArticleFile
0,Was Abraham Lincoln the sixteenth President of...,yes,S08_set3_a4
1,Did Lincoln sign the National Banking Act of 1...,yes,S08_set3_a4
2,Did his mother die of pneumonia?,no,S08_set3_a4
3,How many long was Lincoln's formal education?,18 months,S08_set3_a4
4,When did Lincoln begin his political career?,1832,S08_set3_a4


In [10]:
# Prepare context

folder_path = '/kaggle/input/questionanswer-dataset/text_data/text_data'
context_articles = []
for i in range(len(df)):
    article_path = str(df.loc[i,'ArticleFile']) + '.txt.clean'
    with open(os.path.join(folder_path,article_path),'r',encoding='latin-1') as f:
        text = f.read().replace('\n','')
        context_articles.append(text)
        f.close()

# Transformers: Pretrained Pipeline

In [10]:
from transformers import pipeline
qa = pipeline('question-answering',model='deepset/roberta-base-squad2')

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [11]:
def ques_ans_context(num):
    """The function will return the answer
    by reading the context and the question
    using the transformers pipeline model"""
    question = df['Question'][num]
    context = context_articles[num]
    print("Question: ",question)
    print("Original Answer: ",df['Answer'][num])
    return qa(context=context,question=question)

In [12]:
ques_ans_context(140)

Question:  Do all ducks "quack"?
Original Answer:  no


{'score': 0.3358309864997864,
 'start': 4847,
 'end': 4917,
 'answer': 'most ducks other than female Mallards and domestic ducks do not "quack'}

In [13]:
ques_ans_context(190)

Question:  What did Aristotle say about elephants?
Original Answer:  the beast which pass08seth all others in wit and mind


{'score': 0.19535286724567413,
 'start': 1638,
 'end': 1691,
 'answer': '"the beast which passeth all others in wit and mind."'}

In [14]:
ques_ans_context(1000)

Question:  Could Blaise Pascal move without crutches?
Original Answer:  he could move without crutches until a paralytic attack in 1647


{'score': 0.3690100610256195,
 'start': 14879,
 'end': 14913,
 'answer': 'he could not move without crutches'}

In [15]:
# Running the Transformers model on a sample of data and checking scores and similarity

df_copy = df.copy()
df_copy['Context'] = context_articles

df_sample = df_copy.sample(50)
df_sample.reset_index(drop=True,inplace=True)

scores = []
answers = []
for i in range(len(df_sample)):
    question = df_sample.loc[i,'Question']
    context = df_sample.loc[i,'Context']
    answers.append(qa(context = context,question=question)['answer'])
    scores.append(qa(context=context,question=question)['score'])
    
df_sample['Model_answer'] = answers
df_sample['Model_score'] = scores

df_sample.head()

Unnamed: 0,Question,Answer,ArticleFile,Context,Model_answer,Model_score
0,Is it true that Volta married the daughter of ...,yes,S09_set4_a10,Alessandro_VoltaCount Alessandro Giuseppe Anto...,Teresa,0.786386
1,Could Malay have originated from Sumatra island?,yes,S10_set5_a2,Malay_languageMalay is a group of languages cl...,Many roots have come virtually unchanged from ...,0.211025
2,How long was Alessandro Volta a professor at t...,alessandro volta was a professor at the univer...,S10_set4_a10,Alessandro_VoltaCount Alessandro Giuseppe Anto...,almost 25 years,0.495419
3,Is santiago Spanish for St. James?,yes,S09_set3_a6,Santiago Santiago is Spanish for St. James (fr...,Santiago Santiago,0.343682
4,Have coleopterists formed organisations to fac...,yes,S08_set1_a8,beetleBeetles are a group of insects which hav...,Coleopterists have formed organisations,0.478224


In [24]:
# Computing similarity

# Filter out the Answers with "yes" and "no"
df_sim = df_sample[(df_sample['Answer'] != 'yes')& (df_sample['Answer']!= 'no')]
df_sim.loc[:,'Model_answer'] = df_sim['Model_answer'].apply(lambda x:x.lower())
df_sim.reset_index(drop=True,inplace=True)
df_sim.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sim.loc[:,'Model_answer'] = df_sim['Model_answer'].apply(lambda x:x.lower())


Unnamed: 0,Question,Answer,ArticleFile,Context,Model_answer,Model_score
0,How long was Alessandro Volta a professor at t...,alessandro volta was a professor at the univer...,S10_set4_a10,Alessandro_VoltaCount Alessandro Giuseppe Anto...,almost 25 years,0.495419
1,Did he not determine the dependence of the boi...,yes he did,S08_set4_a5,Anders_CelsiusAnders CelsiusThe observatory of...,he determined the dependence of the boiling of...,1e-06
2,What was the Dead or Alive 4 fighting arena mo...,a magnetic accelerator cannon station from halo 2,S09_set3_a2,Nassau __NOTOC__Nassau may mean the following:...,magnetic accelerator cannon station from halo 2,0.332576
3,What language do they speak in Kuala Lumpur?,bahasa melayu,S10_set3_a3,Kuala_LumpurKuala Lumpur (Jawi: ÙÙØ§ÙØ§ ÙÙ...,malay,0.861275
4,Who achieved international fame as the leading...,grant,S08_set3_a5,"Ulysses S. Grant Ulysses S. Grant, See milita...",ulysses s. grant,0.430665


In [27]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=cbb27c2d6848ad7101b8ffb80dc909bb2142394df9983cce35315f9a7c4b2301
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2


In [29]:
from sentence_transformers import CrossEncoder
model = CrossEncoder('cross-encoder/qnli-distilroberta-base', max_length=512)

cos_sim = []
for i in range(len(df_sim)):
    scores = model.predict([(df_sim.loc[i,'Answer'], df_sim.loc[i,'Model_answer'])])
    cos_sim.append(scores)
    
df_sim['Cosine_similarity'] = cos_sim
df_sim.head()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sim['Cosine_similarity'] = cos_sim


Unnamed: 0,Question,Answer,ArticleFile,Context,Model_answer,Model_score,Cosine_similarity
0,How long was Alessandro Volta a professor at t...,alessandro volta was a professor at the univer...,S10_set4_a10,Alessandro_VoltaCount Alessandro Giuseppe Anto...,almost 25 years,0.495419,[0.9190614]
1,Did he not determine the dependence of the boi...,yes he did,S08_set4_a5,Anders_CelsiusAnders CelsiusThe observatory of...,he determined the dependence of the boiling of...,1e-06,[0.21494205]
2,What was the Dead or Alive 4 fighting arena mo...,a magnetic accelerator cannon station from halo 2,S09_set3_a2,Nassau __NOTOC__Nassau may mean the following:...,magnetic accelerator cannon station from halo 2,0.332576,[0.60717815]
3,What language do they speak in Kuala Lumpur?,bahasa melayu,S10_set3_a3,Kuala_LumpurKuala Lumpur (Jawi: ÙÙØ§ÙØ§ ÙÙ...,malay,0.861275,[0.57108784]
4,Who achieved international fame as the leading...,grant,S08_set3_a5,"Ulysses S. Grant Ulysses S. Grant, See milita...",ulysses s. grant,0.430665,[0.586542]


In [30]:
print('Average cosine similarity: ',np.mean(cos_sim))

Average cosine similarity:  0.53491795
