In [1]:
import os
import math
import nltk
nltk.download('stopwords')
import re
from nltk import WordPunctTokenizer
from nltk.stem.porter import *
from nltk.corpus import stopwords
import pandas as pd
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/giwe7005/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# preprocess textual input --> tokens

def tokenize(string_input):
    """Turn raw text into a list of stemmed content tokens.

    Runs of non-word characters are collapsed to a single space, the text is
    lower-cased and split with ``WordPunctTokenizer``. Tokens that are not
    purely alphabetic, or that are English stop words, are dropped; the
    remaining tokens are reduced with the Porter stemmer.

    Returns:
        list[str]: stemmed tokens in their original order.
    """
    normalised_text = re.sub(r'\W+', ' ', string_input).lower()
    raw_tokens = WordPunctTokenizer().tokenize(normalised_text)
    english_stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()
    return [
        porter.stem(word)
        for word in raw_tokens
        if word.isalpha() and word not in english_stop_words
    ]

# Sanity check of tokenize on a sample question (punctuation and repeated spaces included).
tokenize("When did Alessandro Volta improve  and popularize the electrophorus???")

['alessandro', 'volta', 'improv', 'popular', 'electrophoru']

In [None]:
# CREATE CORPUS - extract files and create a dictionary of each file and their contents 
# {key = document_id: value = tokenised contents of document}

import json

def create_corpus(data_dir="group_project/text_data", tokenizer=None):
    """Build ``{document_id: tokens}`` from every ``*.clean`` file in a directory.

    Args:
        data_dir: directory holding the article files. Defaults to the
            project's text data folder.
        tokenizer: callable mapping a string to a list of tokens. Defaults to
            the notebook's ``tokenize`` function (looked up at call time).

    Returns:
        dict: document id -> tokenised file contents. The id is the file name
        with its trailing ``.clean`` and, if present, ``.txt`` removed.
    """
    if tokenizer is None:
        tokenizer = tokenize
    corpus = dict()
    for file in os.listdir(path=data_dir):
        # take only files with .clean suffix
        if not file.endswith(".clean"):
            continue
        # Remove the suffixes explicitly: str.strip(".txt.clean") would strip
        # any of those *characters* from both ends of the name, mangling ids
        # that begin or end with one of them.
        file_name = file[:-len(".clean")]
        if file_name.endswith(".txt"):
            file_name = file_name[:-len(".txt")]
        # the context manager closes the handle even if reading fails
        with open(os.path.join(data_dir, file), encoding="latin-1") as f:
            file_contents = f.read()
        corpus[file_name] = tokenizer(file_contents)
    return corpus

# Build the corpus once; later cells read this module-level dict.
corpus = create_corpus()

# NOTE(review): only the last bare expression of a cell is rendered, so this
# length and the first two look-ups below produce no visible output.
len(corpus)

# print example of corpus entry
corpus["S08_set4_a8"]
corpus["S09_set4_a8"]
corpus["S08_set1_a1"]




In [4]:
# create vocabulary, i.e. unique set of tokens that occur across all documents in corpus

def create_vocabulary(corpus):
    """Return the sorted list of distinct tokens found across all documents.

    Args:
        corpus: mapping of document id -> iterable of tokens.

    Returns:
        list: unique tokens in ascending order, so positions are stable.
    """
    unique_tokens = {token for doc_id in corpus for token in corpus[doc_id]}
    ordered_tokens = list(unique_tokens)
    ordered_tokens.sort()  # must be in a particular order
    return ordered_tokens

def is_clean_token(token):
    """Return True when ``token`` consists only of ASCII letters and hyphens."""
    matched = re.match(r"^[a-zA-Z\-]+$", token)
    return bool(matched)

# Build the raw vocabulary from the corpus, then keep only the entries that
# is_clean_token accepts.

create_vocab = create_vocabulary(corpus) # create unique set of terms that occur across all documents, n = 42,729

vocabulary = [token for token in create_vocab if is_clean_token(token)] # clean out strange vocabulary items

len(vocabulary)

27374

In [None]:
# create document frequency dictionary - number of documents a given token appears in
# Each document is reduced to the set of its distinct tokens once, so this is a
# single pass over the corpus rather than one scan of every token list per
# vocabulary entry.

document_frequency = {token: 0 for token in vocabulary}

for document in corpus:
    for token in set(corpus[document]):
        # tokens filtered out of the vocabulary are not counted
        if token in document_frequency:
            document_frequency[token] += 1


document_frequency


In [6]:
# output document frequency to json file

import json

# Write the token -> document count mapping to document_frequency.json in the
# current working directory.
with open('document_frequency.json', 'w') as json_file:
    json.dump(document_frequency, json_file)


In [None]:
# alternative code (to speed things up): load document frequency from json file
# NOTE(review): this rebinds document_frequency, replacing whatever value was
# computed earlier in the session with the contents of the file.

with open('document_frequency.json', 'r') as json_file:
    document_frequency = json.load(json_file)

#document_freq_df = pd.DataFrame(list(data.items()), columns=['word', 'count'])

document_frequency

In [8]:
# pre-compute idf scores using document frequency dictionary

def calculate_idf(term, corpus_dictionary, doc_freq=None):
    """Calculate the inverse document frequency of ``term``.

    Computes ``log(N / (df + 1))`` where ``N`` is the number of entries in
    ``corpus_dictionary`` and ``df`` is the document frequency of ``term``.
    The ``+ 1`` keeps the denominator non-zero for unseen terms.

    Args:
        term: vocabulary item to score.
        corpus_dictionary: the corpus; only its length is used.
        doc_freq: mapping of term -> number of documents containing it.
            Defaults to the notebook-level ``document_frequency`` dict, so
            existing two-argument calls behave as before.

    Returns:
        float: the idf score.

    Raises:
        KeyError: if ``term`` is missing from the document-frequency mapping.
    """
    if doc_freq is None:
        doc_freq = document_frequency
    idf = math.log(len(corpus_dictionary) / (doc_freq[term] + 1))
    return idf


# idf score for every vocabulary item, keyed by term
idf_scores = {item: calculate_idf(item, corpus) for item in vocabulary}


# turn the idf scores into a one-column pandas dataframe indexed by term
idf_df_scores = pd.DataFrame.from_dict(idf_scores, orient='index', columns=['IDF Score'])

idf_df_scores



Unnamed: 0,IDF Score
aaa,4.317488
aafc,4.317488
aag,4.317488
aalto,3.912023
aaltonen,4.317488
...,...
zwischen,3.912023
zwoll,4.317488
zygomat,3.912023
zygoptera,4.317488


In [9]:
from collections import Counter
# TERM FREQUENCY MATRIX - PER DOCUMENT
# create term-document-matrix - every document converted to a vector corresponding to frequency of each vocab item appearing in that document

def term_document_matrix(corpus, vocab):
    """Build a term x document matrix of raw token counts.

    Args:
        corpus: mapping of document id -> list of tokens.
        vocab: iterable of terms to keep; tokens outside it are ignored.

    Returns:
        pandas.DataFrame: one row per vocabulary term that occurs in the
        corpus and one column per document containing at least one such term.
        Cells hold integer counts, 0 where the term is absent.
    """
    # A set gives constant-time membership tests; `vocab` is normally a long
    # list, which would otherwise be scanned once per distinct token.
    vocab_lookup = set(vocab)
    data = {}
    for doc_id, tokens in corpus.items():
        for term, count in Counter(tokens).items():
            if term in vocab_lookup:
                data.setdefault(term, {})[doc_id] = count
    # missing (term, document) pairs come out as NaN; turn them into 0 counts
    df = pd.DataFrame.from_dict(data, orient='index').fillna(0).astype(int)
    return df


# NOTE(review): this rebinds the name `term_document_matrix` from the function
# defined above to the DataFrame it returns. After this line the function can
# no longer be called by that name; re-running this line without re-running
# the definition raises TypeError.
term_document_matrix = term_document_matrix(corpus, vocabulary)

term_document_matrix

Unnamed: 0,S08_set4_a8,S09_set4_a8,S10_set4_a8,S10_set5_a5,S08_set3_a7,S10_set6_a3,S10_set6_a2,S09_set3_a4,S09_set1_a5,S08_set2_a7,...,S10_set2_a4,S08_set1_a9,S08_set1_a8,S09_set2_a4,S08_set2_a3,S09_set1_a1,S09_set3_a6,S09_set3_a2,S09_set4_a5,S08_set4_a5
amedeo,5,5,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
avogadro,33,33,30,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
caricatur,1,1,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
lorenzo,1,1,1,0,0,9,16,2,1,0,...,0,0,0,0,0,0,0,0,0,0
romano,1,1,1,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
uniondal,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
cve,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
lha,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
peanut,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [None]:
# calculate tf values

def calculate_tf(term, document, term_document_matrix, doc_length=None):
    """Calculate the relative term frequency of ``term`` in ``document``.

    Args:
        term: row label of the term in ``term_document_matrix``.
        document: column label (document id) in ``term_document_matrix``.
        term_document_matrix: DataFrame of counts with terms as rows and
            document ids as columns.
        doc_length: total number of tokens in the document. When omitted, the
            column sum is used, i.e. only tokens present in the matrix are
            counted. Pass ``len(corpus[document])`` to normalise by the full
            token count instead.

    Returns:
        float: count of ``term`` in ``document`` divided by the document
        length, or 0.0 for a document of length zero.
    """
    # Terms are rows and documents are columns, so address the cell with .loc.
    count = term_document_matrix.loc[term, document]
    if doc_length is None:
        # len(document) would be the length of the id string, not of the text
        doc_length = term_document_matrix[document].sum()
    if doc_length == 0:
        return 0.0
    tf = count / doc_length
    return tf



In [None]:
# gather doc ids
# Take the ids straight from the corpus keys so they always match the corpus
# entries. Deriving them again with str.strip(".txt.clean") strips a set of
# characters from both ends of the file name, not the suffix, and repeats the
# directory listing.
doc_ids = list(corpus)

# initialise pandas dataframe
# NOTE(review): tfidf_df is not read by any other code visible in this notebook.
tfidf_df = pd.DataFrame(0.0, index=vocabulary, columns=doc_ids)

# create tf idf calculation vectors for each token in vocabulary, for each doc in corpus
def doc_vectorise(doc_ids, corpus, vocabulary, term_counts=None, idf=None):
    """Compute tf-idf weights for every in-vocabulary term of every document.

    The weight of a term in a document is
    ``(count / number of tokens in the document) * idf[term]``.

    Args:
        doc_ids: document ids to process; each must be a key of ``corpus``.
        corpus: mapping of document id -> list of tokens.
        vocabulary: terms to weight; other tokens are skipped.
        term_counts: term x document count DataFrame used as the template for
            the result. Defaults to the notebook-level ``term_document_matrix``
            DataFrame.
        idf: mapping of term -> idf score. Defaults to the notebook-level
            ``idf_scores`` dict.

    Returns:
        pandas.DataFrame: float matrix with the same rows and columns as
        ``term_counts``. Weighted cells hold tf-idf values; cells the loop
        does not touch keep the value from ``term_counts`` cast to float.
    """
    if term_counts is None:
        term_counts = term_document_matrix
    if idf is None:
        idf = idf_scores
    vocab_lookup = set(vocabulary)  # constant-time membership tests
    # Work on a float copy: the count matrix stays intact and no float is
    # written into an integer column.
    tfidf = term_counts.astype(float)
    for doc in doc_ids:
        tokens = corpus[doc]
        doc_length = len(tokens)
        for term, freq in Counter(tokens).items():
            if term in vocab_lookup:
                # calculate tf_idf
                tfidf.loc[term, doc] = (freq / doc_length) * idf[term]  # or use another tf scheme
    return tfidf

# tf-idf weights for every document id; used below as the matrix that queries are compared against
tf_idf_docs_vector = doc_vectorise(doc_ids, corpus, vocabulary)


In [13]:
tf_idf_docs_vector

Unnamed: 0,S08_set4_a8,S09_set4_a8,S10_set4_a8,S10_set5_a5,S08_set3_a7,S10_set6_a3,S10_set6_a2,S09_set3_a4,S09_set1_a5,S08_set2_a7,...,S10_set2_a4,S08_set1_a9,S08_set1_a8,S09_set2_a4,S08_set2_a3,S09_set1_a1,S09_set3_a6,S09_set3_a2,S09_set4_a5,S08_set4_a5
amedeo,0.023293,0.023293,0.015667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
avogadro,0.153732,0.153732,0.156672,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
caricatur,0.004137,0.004137,0.000000,0.000725,0.000458,0.000743,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
lorenzo,0.003768,0.003768,0.004224,0.000000,0.000000,0.006088,0.018618,0.001482,0.001619,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
romano,0.004372,0.004372,0.004901,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.001165,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
uniondal,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020807,0.0,0.0
cve,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010404,0.0,0.0
lha,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010404,0.0,0.0
peanut,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010404,0.0,0.0


In [None]:
# PROCESS USER QUERY

# Example query used to exercise the query-vectorisation step.
test_query = "Was Abraham Lincoln the sixteenth President of the United States?"

# NOTE(review): not the last expression in the cell, so the result is discarded.
tokenize(test_query)

def vectorise_query(user_query, corpus_vector): # corpus vector = tf_idf dataframe
    """Turn a query string into a tf-idf vector over the corpus vocabulary.

    The query is tokenised with ``tokenize``. Each token that is a row label
    of ``corpus_vector`` gets the weight
    ``count / number of query tokens * idf_scores[token]``; every other row
    gets 0.0.

    Side effect: the vector is stored in (or overwrites) the
    ``"query_vector"`` column of ``corpus_vector``.

    Args:
        user_query: raw query text.
        corpus_vector: tf-idf DataFrame with terms as the index.

    Returns:
        pandas.Series: the ``"query_vector"`` column of ``corpus_vector``.
    """
    tokenized_query = tokenize(user_query)
    query_length = len(tokenized_query)
    # Fill a standalone Series and attach it in one assignment. Writing through
    # corpus_vector["query_vector"][term] is chained assignment, which pandas
    # does not guarantee to propagate back to the DataFrame.
    weights = pd.Series(0.0, index=corpus_vector.index)
    for term, freq in Counter(tokenized_query).items():
        if term in corpus_vector.index:
            # calculate tf_idf
            weights.loc[term] = freq / query_length * idf_scores[term]  # or use another tf scheme
    corpus_vector["query_vector"] = weights
    return corpus_vector["query_vector"]


# NOTE(review): bare expression that is not last in the cell; it displays nothing.
tf_idf_docs_vector.index

# Vectorise the example query. vectorise_query also stores the result in the
# 'query_vector' column of tf_idf_docs_vector, which the similarity step reads.
query_vector = vectorise_query(test_query, corpus_vector=tf_idf_docs_vector)

In [15]:
# COSINE SIMILARITY - method to compare the query with each document in the corpus

def calculate_cosine_similiarity(vocab_index, doc_index, query_scores):
    """Calculate cosine similarity between the query vector and each document vector.

    Args:
        vocab_index: DataFrame whose columns hold the document vectors and the
            query vector.
        doc_index: iterable of column labels identifying the documents.
        query_scores: column label of the query vector in ``vocab_index``.

    Returns:
        dict: document label -> cosine similarity (float), ordered from the
        highest score to the lowest. A document or query whose vector has zero
        length scores 0.0 rather than NaN, so the ordering stays well defined.
    """
    query_vec = vocab_index[query_scores].to_numpy(dtype=float)
    query_scalar = np.sqrt(np.dot(query_vec, query_vec))
    cosine_scores = {}
    for doc in doc_index:
        doc_vec = vocab_index[doc].to_numpy(dtype=float)
        doc_scalar = np.sqrt(np.dot(doc_vec, doc_vec))
        denominator = query_scalar * doc_scalar
        if denominator > 0:
            cosine_scores[doc] = float(np.dot(doc_vec, query_vec) / denominator)
        else:
            # an all-zero vector shares no terms with anything
            cosine_scores[doc] = 0.0
    # sorted cosine scores by cosine score value
    sorted_cosine_scores = dict(sorted(cosine_scores.items(), key=lambda item: item[1], reverse=True))
    return sorted_cosine_scores


# Rank every document against the 'query_vector' column of the tf-idf matrix.
cosine_scores = calculate_cosine_similiarity(tf_idf_docs_vector, doc_ids, 'query_vector')
cosine_scores

{'S08_set3_a4': 0.569014664937743,
 'S08_set3_a5': 0.08433460585753316,
 'S08_set3_a3': 0.05324577786464507,
 'S08_set3_a7': 0.04099654074145948,
 'S08_set3_a10': 0.03345029378342844,
 'S08_set3_a8': 0.03189705635185302,
 'S08_set3_a6': 0.027862153037531385,
 'S08_set3_a1': 0.02670772686750019,
 'S08_set3_a2': 0.023130640278709966,
 'S08_set3_a9': 0.01645035150762555,
 'S08_set2_a10': 0.01319870788669934,
 'S10_set3_a8': 0.01227799562181076,
 'S08_set2_a5': 0.01180678012546664,
 'S08_set2_a4': 0.011155391704735167,
 'S08_set4_a10': 0.010617516245987136,
 'S09_set4_a10': 0.010617516245987136,
 'S10_set4_a10': 0.008411108543030494,
 'S08_set2_a6': 0.007904285606592181,
 'S10_set3_a4': 0.006860133570075575,
 'S08_set2_a8': 0.005859058108080097,
 'S10_set3_a10': 0.005283473584768396,
 'S08_set4_a4': 0.005011270994477851,
 'S09_set4_a4': 0.005011270994477851,
 'S10_set4_a4': 0.004828706725192817,
 'S08_set2_a9': 0.00465042561506945,
 'S09_set1_a5': 0.004505955074164989,
 'S10_set3_a6': 0.00

In [29]:
# calculate cosine similarity values across all documents given query vector
# and keep the ids of the two highest-scoring documents.
# NOTE(review): this reads whatever the 'query_vector' column currently holds,
# so the result depends on which query was vectorised most recently.
top_cosine = list(calculate_cosine_similiarity(tf_idf_docs_vector, doc_ids, 'query_vector'))[0:2]

top_cosine

['S09_set2_a1', 'S10_set2_a1']

In [None]:
# create subset of data for testing certain topics

import csv

# bring from different data subsets (tab-separated, latin-1 encoded)
question_files = (
    "group_project/S08_question_answer_pairs.txt",
    "group_project/S09_question_answer_pairs.txt",
    "group_project/S10_question_answer_pairs.txt",
)
s08, s09, s10 = [
    pd.read_csv(path, delimiter='\t', encoding="latin-1") for path in question_files
]

chosen_cols = ['Topic', 'Question', 'Answer', 'DifficultyFromQuestioner', 'DifficultyFromAnswerer', 'CorrectArticleFile']

# give all three frames the same column labels before stacking them
pd_lst = [s08, s09, s10]
for frame in pd_lst:
    frame.columns = chosen_cols

# merge into one questions list
questions_lst = pd.concat(pd_lst)

# take subset of questions for our sampling
subset = questions_lst[['Topic', 'Question', 'CorrectArticleFile']]

topics = ['kangaroo', 'Liechtenstein', 'John_Adams', 'Blaise_Pascal', 'Piano', 'London', 'English_Language', 'Pablo_Picasso']

# keep only the rows whose topic is in the evaluation sample
selected_questions = subset[subset['Topic'].isin(topics)]

selected_questions


Unnamed: 0,Topic,Question,CorrectArticleFile
859,John_Adams,Did John Adams represent the Continental Congr...,S08_set3_a1
860,John_Adams,Did John Adams represent the Continental Congr...,S08_set3_a1
861,John_Adams,Was Adams raised Congregationalist?,S08_set3_a1
862,John_Adams,Was Adams raised Congregationalist?,S08_set3_a1
863,John_Adams,Was Adams an opponent of the Stamp Act?,S08_set3_a1
...,...,...,...
1108,Piano,Why are upright pianos more compact?,S10_set2_a1
1109,Piano,Do older pianos have more keys than modern pia...,S10_set2_a1
1110,Piano,Do older pianos have more keys than modern pia...,S10_set2_a1
1111,Piano,What are the names of a piano's pedals?,S10_set2_a1


In [17]:
# get list of questions and their corresponding document ids

convert_to_dict = selected_questions.to_dict(orient='records')

# One entry per distinct question text: when a question occurs in several rows,
# the article file of the last such row is kept.
selected_question_doc_pairs = {}
for entry in convert_to_dict:
    selected_question_doc_pairs[entry['Question']] = entry['CorrectArticleFile']

selected_question_doc_pairs

{'Did John Adams represent the Continental Congress in Europe?': 'S08_set3_a1',
 'Was Adams raised Congregationalist?': 'S08_set3_a1',
 'Was Adams an opponent of the Stamp Act?': 'S08_set3_a1',
 'When did Adams graduate from college?': 'S08_set3_a1',
 'Who was on the committee with Adams to draft  a Declaration of Independence?': 'S08_set3_a1',
 'What did Jefferson call John Adams?': 'S08_set3_a1',
 "What was Adams' political party?": 'S08_set3_a1',
 'Was Adams the first to introduce a bicameral legislature?': 'S08_set3_a1',
 'Did John Adams get along with Alexander Hamilton?': 'S08_set3_a1',
 'Did John Adams go to Harvard? ': 'S08_set3_a1',
 'Did John Adams support the Stamp Act of 1765?': 'S08_set3_a1',
 "Is Adams' birthplace part of a national park?": 'S08_set3_a1',
 'When did John Adams serve as Vice President?': 'S08_set3_a1',
 'With what party did Adams run for presidency?': 'S08_set3_a1',
 'Where is Adams buried?': 'S08_set3_a1',
 'Who were the midnight judges?': 'S08_set3_a1',


In [18]:
# compare output of tf_idf baseline with gold standard

queries = list(selected_question_doc_pairs)

documents = [*selected_question_doc_pairs.values()]

top_cosine_1 = dict()
top_cosines_2 = dict()

for query, document in selected_question_doc_pairs.items():
    # vectorise_query stores the query in the 'query_vector' column, which the
    # similarity function then reads back by name
    query_vector = vectorise_query(query, corpus_vector=tf_idf_docs_vector)
    cosine_scores = calculate_cosine_similiarity(tf_idf_docs_vector, doc_ids, 'query_vector')
    ranked_doc_ids = list(cosine_scores)  # keys are ordered best match first
    top_1, top_2 = ranked_doc_ids[0], ranked_doc_ids[0:2]
    top_cosine_1[query] = top_1
    top_cosines_2[query] = top_2


# retrieve results@1
top_cosine_1_df = pd.DataFrame(top_cosine_1.items(), columns=["Question", "File Name"]).rename(
    columns={"File Name": 'DocumentRetrieved'}
)

# retrieve results@2
top_cosines_2_df = pd.DataFrame(top_cosines_2.items(), columns=["Question", "File Name"]).rename(
    columns={"File Name": 'DocumentRetrieved'}
)

# write both result tables to csv
for csv_name, results_frame in (('top_1.csv', top_cosine_1_df), ('top_2.csv', top_cosines_2_df)):
    with open(csv_name, 'w') as f:
        results_frame.to_csv(f)

In [19]:

#with open('documents_retrieved.csv', 'w') as f:
    #top_cosines_df.to_csv(f)

# attach the retrieved document(s) to every selected question
# (pd.merge joins on the columns the two frames have in common)
final_lst_1 = pd.merge(selected_questions, top_cosine_1_df)
final_lst_2 = pd.merge(selected_questions, top_cosines_2_df)

# Match = 1 when the single retrieved document is the correct article
final_lst_1['Match'] = [
    1 if correct == retrieved else 0
    for correct, retrieved in zip(final_lst_1['CorrectArticleFile'], final_lst_1['DocumentRetrieved'])
]

# Match = 1 when the correct article is among the two retrieved documents
final_lst_2['Match'] = [
    1 if correct in retrieved else 0
    for correct, retrieved in zip(final_lst_2['CorrectArticleFile'], final_lst_2['DocumentRetrieved'])
]

for csv_name, results_frame in (('final_results_1.csv', final_lst_1), ('final_results_2.csv', final_lst_2)):
    with open(csv_name, 'w') as f:
        results_frame.to_csv(f)


In [26]:
# alt code: load evaluation matrix with manually updated values (since duplicate files are in the data)
import pandas as pd

# Results for the top-1 retrieval; the first CSV column is used as the index.
with open('final_results_1.csv') as f:
    retrieval_1 = pd.read_csv(f, index_col=0)

# NOTE(review): not the last expression in the cell, so nothing is displayed here.
retrieval_1

# Results for the top-2 retrieval, loaded the same way.
with open('final_results_2.csv') as f:
    retrieval_2 = pd.read_csv(f, index_col=0)

retrieval_2

Unnamed: 0,Topic,Question,CorrectArticleFile,DocumentRetrieved,Match
0,John_Adams,Did John Adams represent the Continental Congr...,S08_set3_a1,"['S08_set3_a1', 'S08_set3_a2']",1
1,John_Adams,Did John Adams represent the Continental Congr...,S08_set3_a1,"['S08_set3_a1', 'S08_set3_a2']",1
2,John_Adams,Was Adams raised Congregationalist?,S08_set3_a1,"['S08_set3_a1', 'S08_set3_a2']",1
3,John_Adams,Was Adams raised Congregationalist?,S08_set3_a1,"['S08_set3_a1', 'S08_set3_a2']",1
4,John_Adams,Was Adams an opponent of the Stamp Act?,S08_set3_a1,"['S08_set3_a1', 'S08_set3_a2']",1
...,...,...,...,...,...
254,Piano,Why are upright pianos more compact?,S10_set2_a1,"['S09_set2_a1', 'S10_set2_a1']",1
255,Piano,Do older pianos have more keys than modern pia...,S10_set2_a1,"['S09_set2_a1', 'S10_set2_a1']",1
256,Piano,Do older pianos have more keys than modern pia...,S10_set2_a1,"['S09_set2_a1', 'S10_set2_a1']",1
257,Piano,What are the names of a piano's pedals?,S10_set2_a1,"['S09_set2_a1', 'S10_set2_a1']",1


In [None]:
# measure precision using Match column in results df
# NOTE(review): for k=2 the score is the share of questions whose Match flag is
# 1, i.e. a hit rate over the questions - confirm this is the intended metric.


def _match_counts(results):
    """Return ``(matches, failures)`` for a results frame with a 0/1 'Match' column.

    Counting with boolean masks yields 0 for an absent value, whereas
    ``value_counts()[label]`` raises KeyError when no row has that value.
    """
    matches = int((results['Match'] == 1).sum())
    failures = int((results['Match'] == 0).sum())
    return matches, failures


def _precision(matches, failures):
    """Return matches / (matches + failures), or 0.0 when there are no rows."""
    total = matches + failures
    return matches / total if total else 0.0


#k@1
count_match_1, count_fail_1 = _match_counts(retrieval_1)

precision_1 = _precision(count_match_1, count_fail_1)

#k@2
count_match_2, count_fail_2 = _match_counts(retrieval_2)

precision_2 = _precision(count_match_2, count_fail_2)


print(f"Precision score k@1 is: {precision_1}")
print(f"Precision score k@2 is: {precision_2}")


Precision score k@1 is: 0.833976833976834
Precision score k@2 is: 0.8725868725868726
