In [2]:
import os
import math
import nltk
nltk.download('stopwords')
import re
from nltk import WordPunctTokenizer
from nltk.stem.porter import *
from nltk.corpus import stopwords
import pandas as pd
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/giwe7005/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# preprocess textual input --> tokens

def tokenize(string_input):
     string_input = re.sub(r'\W+', ' ', string_input)
     string_input = string_input.lower()
     tk = WordPunctTokenizer()
     tokens = tk.tokenize(string_input)
     stop_words = set(stopwords.words('english'))
     stemmer = PorterStemmer()
     clean_tokens = []
     for token in tokens:
          if token.isalpha() and token not in stop_words:
               clean_tokens.append(stemmer.stem(token))
     return clean_tokens

tokenize("When did Alessandro Volta improve  and popularize the electrophorus???")

['alessandro', 'volta', 'improv', 'popular', 'electrophoru']

In [4]:
# CREATE CORPUS - extract files and create a dictionary of each file and their contents 
# {key = document_id: value = tokenised contents of document}

import json

def create_corpus():
    corpus = dict()
    for file in os.listdir(path="group_project/text_data"):
            # take only files with .clean suffix
            if file.endswith(".clean"):
                f = open(os.path.join("group_project/text_data", file), encoding="latin-1")
                file_name = file
                # gives the contents of each document just in string form
                file_contents = f.read()
                # tokenize contents of files
                tokens = tokenize(file_contents)
                # assign tokenize contents to file name entry in corpus
                corpus[file_name] = tokens
    return corpus

# create corpus 
corpus = create_corpus()

len(corpus)

# print example of corpus entry
corpus["S08_set4_a8.txt.clean"]
corpus["S09_set4_a8.txt.clean"]
corpus["S08_set1_a1.txt.clean"]




['kangaroo',
 'kangaroo',
 'marsupi',
 'famili',
 'macropodida',
 'macropod',
 'mean',
 'larg',
 'foot',
 'common',
 'use',
 'term',
 'use',
 'describ',
 'largest',
 'speci',
 'famili',
 'red',
 'kangaroo',
 'antilopin',
 'kangaroo',
 'eastern',
 'western',
 'grey',
 'kangaroo',
 'macropu',
 'genu',
 'famili',
 'also',
 'includ',
 'mani',
 'smaller',
 'speci',
 'includ',
 'wallabi',
 'tree',
 'kangaroo',
 'wallaroo',
 'pademelon',
 'quokka',
 'live',
 'speci',
 'kangaroo',
 'endem',
 'contin',
 'australia',
 'smaller',
 'macropod',
 'found',
 'australia',
 'new',
 'guinea',
 'gener',
 'larger',
 'kangaroo',
 'adapt',
 'much',
 'better',
 'chang',
 'wrought',
 'australian',
 'landscap',
 'human',
 'though',
 'mani',
 'smaller',
 'cousin',
 'endang',
 'plenti',
 'farm',
 'extent',
 'wild',
 'kangaroo',
 'shot',
 'meat',
 'controversi',
 'steve',
 'dow',
 'industri',
 'gun',
 'sydney',
 'morn',
 'herald',
 'onlin',
 'septemb',
 'kangaroo',
 'australian',
 'icon',
 'featur',
 'australian',

In [5]:
# create vocabulary, i.e. unique set of tokens that occur across all documents in corpus

def create_vocabulary(corpus):
     vocabulary = set()
     for file in corpus:
          for token in corpus[file]:
               # add token to vocabulary set 
               vocabulary.add(token)
     return sorted(vocabulary) # must be in a particular order 

def is_clean_token(token):
    return re.match(r"^[a-zA-Z\-]+$", token) is not None

# preprocess textual input --> tokens

create_vocab = create_vocabulary(corpus) # create unique set of terms that occur across all documents, n = 42,729

vocabulary = [token for token in create_vocab if is_clean_token(token)] # clean out strange vocabulary items

len(vocabulary)

27374

In [6]:
# create document frequency dictionary - number of documents a given token appears in
# currently takes 3m 35s to run

document_frequency = {token: 0 for token in vocabulary}

for token in vocabulary:
    for document in corpus:
     if token in corpus[document]:
          document_frequency[token] += 1

document_frequency


{'aaa': 1,
 'aafc': 1,
 'aag': 1,
 'aalto': 2,
 'aaltonen': 1,
 'aamulehti': 1,
 'aamuna': 1,
 'aan': 1,
 'aanwerp': 1,
 'aapa': 1,
 'aardappelet': 1,
 'aardvark': 1,
 'aaron': 1,
 'aato': 1,
 'ab': 3,
 'aba': 1,
 'ababa': 1,
 'abacaxi': 1,
 'abahani': 1,
 'abalon': 4,
 'abandon': 32,
 'abang': 1,
 'abangan': 1,
 'abaya': 1,
 'abba': 1,
 'abbado': 2,
 'abbandon': 1,
 'abbasid': 2,
 'abbevil': 1,
 'abbey': 11,
 'abbot': 2,
 'abbott': 4,
 'abbrevi': 14,
 'abc': 8,
 'abckiria': 1,
 'abcnew': 1,
 'abdel': 2,
 'abdic': 4,
 'abdomen': 8,
 'abdomin': 5,
 'abdopu': 1,
 'abdu': 1,
 'abduct': 1,
 'abduh': 1,
 'abdul': 3,
 'abdullah': 1,
 'abdur': 1,
 'abe': 1,
 'abecedario': 1,
 'abel': 2,
 'abelson': 1,
 'aber': 1,
 'aberdar': 1,
 'aberdeen': 1,
 'aberr': 2,
 'aberrantli': 1,
 'abgeordnetenhau': 1,
 'abhor': 1,
 'abidjan': 1,
 'abigail': 3,
 'abil': 41,
 'abisara': 1,
 'abitibibowat': 1,
 'abl': 68,
 'ablaz': 1,
 'ablest': 1,
 'abnorm': 4,
 'abnudi': 1,
 'aboard': 4,
 'abolish': 9,
 'abolit': 5

In [7]:
# pre-compute idf scores using document frequency dictionary

def calculate_idf(term, corpus_dictionary):
     """calculate inverted document frequency"""
     idf = math.log(len(corpus_dictionary)/((document_frequency[term])+1)) # can replace w/frequency count 
     return idf


idf_scores = {}

for item in vocabulary:
     if item not in idf_scores:
          idf_scores[item] = calculate_idf(item, corpus)
          #print(idf_scores[item])

idf_scores


{'aaa': 4.31748811353631,
 'aafc': 4.31748811353631,
 'aag': 4.31748811353631,
 'aalto': 3.912023005428146,
 'aaltonen': 4.31748811353631,
 'aamulehti': 4.31748811353631,
 'aamuna': 4.31748811353631,
 'aan': 4.31748811353631,
 'aanwerp': 4.31748811353631,
 'aapa': 4.31748811353631,
 'aardappelet': 4.31748811353631,
 'aardvark': 4.31748811353631,
 'aaron': 4.31748811353631,
 'aato': 4.31748811353631,
 'ab': 3.624340932976365,
 'aba': 4.31748811353631,
 'ababa': 4.31748811353631,
 'abacaxi': 4.31748811353631,
 'abahani': 4.31748811353631,
 'abalon': 3.4011973816621555,
 'abandon': 1.5141277326297755,
 'abang': 4.31748811353631,
 'abangan': 4.31748811353631,
 'abaya': 4.31748811353631,
 'abba': 4.31748811353631,
 'abbado': 3.912023005428146,
 'abbandon': 4.31748811353631,
 'abbasid': 3.912023005428146,
 'abbevil': 4.31748811353631,
 'abbey': 2.5257286443082556,
 'abbot': 3.912023005428146,
 'abbott': 3.4011973816621555,
 'abbrevi': 2.302585092994046,
 'abc': 2.8134107167600364,
 'abckiria

In [8]:
# turn corpus into pandas dataframe
idf_df_scores = pd.DataFrame.from_dict(idf_scores, orient='index', columns=['IDF Score'])

idf_df_scores

# accessing an idf score: idf_df_scores[term]

Unnamed: 0,IDF Score
aaa,4.317488
aafc,4.317488
aag,4.317488
aalto,3.912023
aaltonen,4.317488
...,...
zwischen,3.912023
zwoll,4.317488
zygomat,3.912023
zygoptera,4.317488


In [9]:
from collections import Counter
# TERM FREQUENCY MATRIX - PER DOCUMENT
# create term-document-matrix - every document converted to a vector corresponding to frequency of each vocab item appearing in that document

def term_document_matrix(corpus, vocab):
       data = {}
       for doc_id, tokens in corpus.items():
              freqs = Counter(tokens)
              for term, count in freqs.items():
                     if term in vocab:
                            if term not in data:
                                   data[term] = {}
                            data[term][doc_id] = count
       df = pd.DataFrame.from_dict(data, orient='index').fillna(0).astype(int)  
       return df


term_document_matrix = term_document_matrix(corpus, vocabulary)

term_document_matrix

Unnamed: 0,S08_set4_a8.txt.clean,S09_set4_a8.txt.clean,S10_set4_a8.txt.clean,S10_set5_a5.txt.clean,S08_set3_a7.txt.clean,S10_set6_a3.txt.clean,S10_set6_a2.txt.clean,S09_set3_a4.txt.clean,S09_set1_a5.txt.clean,S08_set2_a7.txt.clean,...,S10_set2_a4.txt.clean,S08_set1_a9.txt.clean,S08_set1_a8.txt.clean,S09_set2_a4.txt.clean,S08_set2_a3.txt.clean,S09_set1_a1.txt.clean,S09_set3_a6.txt.clean,S09_set3_a2.txt.clean,S09_set4_a5.txt.clean,S08_set4_a5.txt.clean
amedeo,5,5,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
avogadro,33,33,30,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
caricatur,1,1,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
lorenzo,1,1,1,0,0,9,16,2,1,0,...,0,0,0,0,0,0,0,0,0,0
romano,1,1,1,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
uniondal,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
cve,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
lha,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
peanut,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
# calculate tf values

def calculate_tf(term, document, term_document_matrix):
     """calculate term frequency"""
     tf = term_document_matrix[term][document]/len(document) # tf = vocab_index[term, document] / len(document)
     return tf


term_document_matrix['S09_set2_a10.txt.clean']["amedeo"]


0

In [11]:
doc_ids = [file for file in os.listdir(path="group_project/text_data") if file.endswith('.clean')]

#sparse_dtype = pd.SparseDtype("float64", fill_value=0.0)
tfidf_df = pd.DataFrame(0.0, index=vocabulary, columns=doc_ids) #dtype=sparse_dtype)

def doc_vectorise(doc_ids, corpus, vocabulary):
    for doc in doc_ids:
        counts = Counter(corpus[doc])
        for term, freq in counts.items():
            if term in vocabulary:
                # calculate tf_idf
                term_document_matrix.loc[term, doc] = (freq / len(corpus[doc])) * idf_scores[term] # or use another tf scheme
    return term_document_matrix

tf_idf_docs_vector = doc_vectorise(doc_ids, corpus, vocabulary)
tf_idf_docs_vector.to_csv("tfidf_output_FINAL.csv")


In [12]:
# process query

def process_query(query):
    query = re.sub("\W", " ", query)
    query = query.strip().lower()
    query = " ".join([word for word in query.split() if word not in stopwords.words('english')])
    return query

test_query = "Was Alessandro Volta a professor of chemistry?"
print(process_query(test_query))

split_query = np.unique(test_query.split())

split_query



#def query_score(vocab_index, query):
  #  for word in np.unique(query.)

alessandro volta professor chemistry


array(['Alessandro', 'Volta', 'Was', 'a', 'chemistry?', 'of', 'professor'],
      dtype='<U10')

In [13]:
# PROCESS USER QUERY

def process_query(user_query):
     """process user query - tokenize"""
     tokens = tokenize(user_query) # apply same tokeizer used on corpus data
     return tokens



def vectorise_query(user_query, corpus_vector): # corpus vector = tf_idf dataframe
     corpus_vector["query_vector"] = 0
     print(corpus_vector["query_vector"])
     tokenized_query = process_query(user_query)
     counts = Counter(tokenized_query)
     print(counts)
     query_length = len(tokenized_query)
     for term, freq in counts.items():
          if term in corpus_vector.index:
               # calculate tf_idf
               corpus_vector["query_vector"][term] = freq/query_length * idf_scores[term] # or use another tf scheme
     return corpus_vector["query_vector"]


tf_idf_docs_vector.index

query = "Was Alessandro Volta a professor of chemistry?"
query_vector = vectorise_query(query, corpus_vector=tf_idf_docs_vector)

query_vector.to_csv("query_vector.csv")

amedeo       0
avogadro     0
caricatur    0
lorenzo      0
romano       0
            ..
uniondal     0
cve          0
lha          0
peanut       0
pepperidg    0
Name: query_vector, Length: 27374, dtype: int64
Counter({'alessandro': 1, 'volta': 1, 'professor': 1, 'chemistri': 1})


  corpus_vector["query_vector"] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corpus_vector["query_vector"][term] = freq/query_length * idf_scores[term] # or use another tf scheme


In [14]:
# COSINE SIMILARITY - method to compare the query with each document in the corpus

def calculate_cosine_similiarity(vocab_index, doc_index, query_scores):
     """calculate cosine similarity in order to compare query vector with document vector"""
     cosine_scores = {}
     query_scalar = np.sqrt(sum(vocab_index[query_scores] ** 2))
     for doc in doc_index:
          doc_scalar = np.sqrt(sum(vocab_index[doc] ** 2))
          dot_prod = sum(vocab_index[doc] * vocab_index[query_scores])
          cosine = (dot_prod / (query_scalar * doc_scalar))
          cosine_scores[doc] = cosine
     # sorted cosine scores by cosine score value
     sorted_cosine_scores = {key: value for key, value in sorted(cosine_scores.items(), key=lambda item: item[1], reverse=True)}
     return sorted_cosine_scores


cosine_scores = calculate_cosine_similiarity(tf_idf_docs_vector, doc_ids, 'query_vector')
cosine_scores

{'S10_set4_a10.txt.clean': 0.5677234595983246,
 'S08_set4_a10.txt.clean': 0.5455429551645148,
 'S09_set4_a10.txt.clean': 0.5455429551645148,
 'S08_set4_a7.txt.clean': 0.02686988435481536,
 'S09_set4_a7.txt.clean': 0.02686988435481536,
 'S08_set2_a5.txt.clean': 0.025452287575430735,
 'S10_set4_a8.txt.clean': 0.025089337233543143,
 'S10_set4_a7.txt.clean': 0.018575027807556984,
 'S08_set4_a8.txt.clean': 0.016458278877261878,
 'S09_set4_a8.txt.clean': 0.016458278877261878,
 'S10_set4_a5.txt.clean': 0.011609818416918388,
 'S09_set4_a5.txt.clean': 0.008874672950072826,
 'S08_set4_a5.txt.clean': 0.008874672950072826,
 'S08_set4_a2.txt.clean': 0.007245708060958675,
 'S09_set4_a2.txt.clean': 0.007245708060958675,
 'S09_set5_a9.txt.clean': 0.007208521059359583,
 'S09_set5_a1.txt.clean': 0.006591812980034473,
 'S10_set4_a2.txt.clean': 0.005294197797100052,
 'S09_set5_a2.txt.clean': 0.004195820766603817,
 'S10_set3_a4.txt.clean': 0.003654173037542788,
 'S10_set6_a8.txt.clean': 0.00324421751114851

In [15]:
# calculate cosine similarity values across all documents given query vector 
top_10_cosines = list(calculate_cosine_similiarity(tf_idf_docs_vector, doc_ids, 'query_vector'))[:10]

top_10_cosines

['S10_set4_a10.txt.clean',
 'S08_set4_a10.txt.clean',
 'S09_set4_a10.txt.clean',
 'S08_set4_a7.txt.clean',
 'S09_set4_a7.txt.clean',
 'S08_set2_a5.txt.clean',
 'S10_set4_a8.txt.clean',
 'S10_set4_a7.txt.clean',
 'S08_set4_a8.txt.clean',
 'S09_set4_a8.txt.clean']

In [68]:
# load datasets into pandas dfs

s08 = pd.read_csv("group_project/S08_question_answer_pairs.txt", delimiter='\t', encoding = "latin-1")

s09 = pd.read_csv("group_project/S09_question_answer_pairs.txt", delimiter='\t', encoding = "latin-1")

s10 = pd.read_csv("group_project/S10_question_answer_pairs.txt", delimiter='\t', encoding = "latin-1")

set_1 = s08[["Question", "ArticleFile"]]
set_2 = s09[["Question", "ArticleFile"]]
set_3 = s10[["Question", "ArticleFile"]]

pd_lst = [set_1, set_2, set_3]

questions_lst = pd.concat(pd_lst)

questions_lst



Unnamed: 0,Question,ArticleFile
0,Was Abraham Lincoln the sixteenth President of...,S08_set3_a4
1,Was Abraham Lincoln the sixteenth President of...,S08_set3_a4
2,Did Lincoln sign the National Banking Act of 1...,S08_set3_a4
3,Did Lincoln sign the National Banking Act of 1...,S08_set3_a4
4,Did his mother die of pneumonia?,S08_set3_a4
...,...,...
1453,What areas do the Grevy's Zebras inhabit?,S10_set1_a9
1454,Which species of zebra is known as the common ...,S10_set1_a9
1455,Which species of zebra is known as the common ...,S10_set1_a9
1456,At what age can a zebra breed?,S10_set1_a9


In [None]:
# get list of questions and their corresponding document ids


convert_to_dict = questions_lst.to_dict(orient='records')

question_document_pairs = {entry['Question']: entry['ArticleFile'] for entry in convert_to_dict}

question_document_pairs

{'Was Abraham Lincoln the sixteenth President of the United States?': 'S08_set3_a4',
 'Did Lincoln sign the National Banking Act of 1863?': 'S08_set3_a4',
 'Did his mother die of pneumonia?': 'S08_set3_a4',
 "How many long was Lincoln's formal education?": 'S08_set3_a4',
 'When did Lincoln begin his political career?': 'S08_set3_a4',
 'What did The Legal Tender Act of 1862 establish?': 'S08_set3_a4',
 'Who suggested Lincoln grow a beard?': 'S08_set3_a4',
 'When did the Gettysburg address argue that America was born?': 'S08_set3_a4',
 'Did Lincoln beat John C. Breckinridge in the 1860 election?': 'S08_set3_a4',
 'Was Abraham Lincoln the first President of the United States?': 'S08_set3_a4',
 'Did Lincoln start his political career in 1832?': 'S08_set3_a4',
 'Did Lincoln ever represent Alton & Sangamon Railroad?': 'S08_set3_a4',
 'Which county was Lincoln born in?': 'S08_set3_a4',
 'When did Lincoln first serve as President?': 'S08_set3_a4',
 'Who assassinated Lincoln?': 'S08_set3_a4',
 