In [1]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec

[nltk_data] Downloading package punkt to /Users/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Read the WikiQA train and test splits; both files are tab-separated.
train_data = pd.read_csv('WikiQA-train.tsv', sep='\t')
test_data = pd.read_csv('WikiQA-test.tsv', sep='\t')

Extract the unique questions from the train and test data frames, together with their QuestionID, DocumentID and DocumentTitle.

In [3]:
def get_questions_documenttag(data):
    """Return the distinct question/document rows of a WikiQA frame.

    Only the 'Question', 'QuestionID', 'DocumentID' and 'DocumentTitle'
    columns are kept, and rows duplicated across all four of them are
    dropped. Surviving rows keep their original index labels.
    """
    wanted_columns = ['Question', 'QuestionID', 'DocumentID', 'DocumentTitle']
    return data.loc[:, wanted_columns].drop_duplicates()
# One row per distinct (Question, QuestionID, DocumentID, DocumentTitle) in each split.
train_question_doctag = get_questions_documenttag(train_data)
test_question_doctag = get_questions_documenttag(test_data)

In [4]:
# Question text of every distinct question/document row.
# Rows are distinct on all four selected columns, so a question text may still repeat.
train_questions = train_question_doctag['Question']
test_questions = test_question_doctag['Question']

In [5]:
# Document ID of every distinct question/document row (row-aligned with the
# question Series taken from the same frames; IDs themselves may repeat).
train_docid = train_question_doctag['DocumentID']
test_docid = test_question_doctag['DocumentID']

Extract the answers to those questions.

In [140]:
def get_answers(data, questions, documentids):
    """Pair every question with its document ID and first correct sentence.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'Question', 'Sentence' and 'Label' columns.
    questions : pandas.Series
        Question texts to look up; read by position.
    documentids : pandas.Series
        Document IDs, positionally aligned with ``questions``.

    Returns
    -------
    list of [question, doc_id, answer]
        ``answer`` is the 'Sentence' of the first row (in ``data`` order)
        whose 'Question' equals the question and whose 'Label' equals 1,
        or the string 'No answer' when no such row exists.
    """
    # Build the question -> first positive sentence lookup once instead of
    # filtering the whole frame again for every question. Rows with a missing
    # question are left out so they can never match a lookup.
    is_positive = (data['Label'] == 1) & data['Question'].notna()
    positives = data.loc[is_positive, ['Question', 'Sentence']]
    positives = positives.drop_duplicates(subset='Question', keep='first')
    first_answer = dict(zip(positives['Question'], positives['Sentence']))

    answers = []  # one [question, doc_id, answer] triple per question
    for q in range(len(questions)):
        question = questions.iloc[q]
        doc_id = documentids.iloc[q]
        answers.append([question, doc_id, first_answer.get(question, 'No answer')])
    return answers

# One row per question: [question, document id, answer sentence or 'No answer'].
train_answers = pd.DataFrame(get_answers(train_data, train_questions, train_docid))
test_answers = pd.DataFrame(get_answers(test_data, test_questions, test_docid))

The `get_answers` function above produces `train_answers` and `test_answers`, which have the following columns:
- Question
- Related Document ID
- Answer (the string 'No answer' if the question has no correct answer sentence)

In [141]:
def get_documents(data, questions, documentids): # (done by Finn, tweaked by Dan)
    """Rebuild one document string per question from its candidate sentences.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'Question' and 'Sentence' columns.
    questions : pandas.Series
        Question texts to look up; read by position.
    documentids : pandas.Series
        Document IDs, positionally aligned with ``questions``.

    Returns
    -------
    list of [doc_id, document]
        ``document`` is every 'Sentence' whose 'Question' equals the question,
        in ``data`` order, joined by single spaces (an empty string when the
        question has no rows).
    """
    # Group the sentences once rather than filtering the frame per question.
    sentences_by_question = {
        question: group.tolist()
        for question, group in data.groupby('Question', sort=False)['Sentence']
    }

    documents = []
    for q in range(len(questions)):
        sentences = sentences_by_question.get(questions.iloc[q], [])
        documents.append([documentids.iloc[q], ' '.join(sentences)])
    return documents

# One row per question: [document id, document text rebuilt from that question's sentences].
train_documents = pd.DataFrame(get_documents(train_data, train_questions, train_docid)) # one document per question row
test_documents = pd.DataFrame(get_documents(test_data, test_questions, test_docid)) # one document per question row

The `get_documents` function above produces `train_documents` and `test_documents`, which have the following columns:
- Document ID
- Full Document

In [142]:
# Give the positional columns (0, 1, 2) descriptive names so later cells can
# access them by name.
train_answers.columns = ['Question','DocumentID','Answer']
test_answers.columns = ['Question','DocumentID','Answer']
train_documents.columns = ['DocumentID','Document']
test_documents.columns = ['DocumentID','Document']

In [143]:
# result is 2117, 2117, 630, 630

len(train_answers),len(train_documents), len(test_answers),len(test_documents)

(2117, 2117, 630, 630)

**Prior to tagging, we should maybe clean the document and answers first:** (stopped here)

Maybe? 
- lowercase (might lose context, but we can use on questions)
- removing any punctuation or weird symbols (do)
- removal of stop words? (probably not)

Make sure that the pre-processing is standardised to be the same throughout doc and ans.

In [144]:
def preprocess_lower(text):
    """Lower-case ``text`` and blank out every character that is not kept.

    After lower-casing, each character that is not an ASCII letter, an ASCII
    digit or whitespace is replaced by one space. Replacement is one-for-one,
    so runs of spaces are not collapsed. Non-ASCII letters are blanked too,
    which splits words such as 'océanic' into 'oc anic'.
    """
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z0-9\s]', ' ', lowered)

# Clean questions, answers and documents with the same function so that an
# answer can still be located inside its cleaned document afterwards.
# Each column is cleaned with Series.apply; DataFrame.applymap is deprecated in
# recent pandas releases.
train_answers['Question'] = train_answers['Question'].apply(preprocess_lower)
train_answers['Answer'] = train_answers['Answer'].apply(preprocess_lower)
train_documents['Document'] = train_documents['Document'].apply(preprocess_lower)
test_answers['Question'] = test_answers['Question'].apply(preprocess_lower)
test_answers['Answer'] = test_answers['Answer'].apply(preprocess_lower)
test_documents['Document'] = test_documents['Document'].apply(preprocess_lower)

In [156]:
def labelling(documents, answers):
    """Tag every document token as outside, start, inside or end of the answer.

    Parameters
    ----------
    documents : pandas.DataFrame
        Frame with 'DocumentID' and 'Document' columns. The first row whose
        'DocumentID' matches is used as the document text.
    answers : pandas.DataFrame
        Frame with 'DocumentID' and 'Answer' columns whose rows are read with
        the labels 0 .. len(answers) - 1.

    Returns
    -------
    list of list of str
        One tag list per row of ``answers``: 'N' for tokens outside the answer,
        then 'S', 'I'..., 'E' over the first occurrence of the answer text.
        A one-token answer is tagged with a single 'S'. Rows whose answer is
        'no answer', empty, or absent from the document are tagged all 'N'.
        Each list therefore holds exactly one tag per token of the three
        partitioned pieces.
    """
    tagged_documents = []
    for q in range(len(answers)):
        doc_id = answers['DocumentID'].loc[q]
        content = documents.loc[documents['DocumentID'] == doc_id,'Document'].values[0]
        answer = answers['Answer'].loc[q]

        if answer == 'no answer' or not answer:
            # No span to mark (an empty separator would make partition() raise).
            before, found, after = content, '', ''
        else:
            # partition() yields (content, '', '') when the answer is absent,
            # which falls through to the zero-token case below.
            before, found, after = content.partition(answer)

        answer_len = len(word_tokenize(found))
        tagged_document = ['N'] * len(word_tokenize(before))
        if answer_len == 1:
            # Previously a single token received both 'S' and 'E', shifting
            # every later tag by one position.
            tagged_document.append('S')
        elif answer_len > 1:
            tagged_document.append('S') # start of answer
            tagged_document.extend(['I'] * (answer_len - 2)) # inside of answer
            tagged_document.append('E') # end of answer
        tagged_document.extend(['N'] * len(word_tokenize(after)))
        tagged_documents.append(tagged_document)
    return tagged_documents

# One tag list per question row, covering the tokens of that question's document.
train_doc_ans_labels = labelling(train_documents, train_answers)
test_doc_ans_labels = labelling(test_documents, test_answers)

In [164]:
# check if tags are good
def testing_tokens(ind, labels, documents, answers):
    """Print each [tag, token] pair of entry ``ind``, then its answer text.

    Tags come from ``labels[ind]`` and tokens from tokenizing
    ``documents['Document'][ind]``; pairing stops at the shorter of the two.
    """
    doc_tokens = word_tokenize(documents['Document'][ind])
    for pair in zip(labels[ind], doc_tokens):
        print(list(pair))
    print(answers['Answer'][ind])
testing_tokens(144, train_doc_ans_labels, train_documents, train_answers)

['N', 'the']
['N', 'pineapple']
['N', 'ananas']
['N', 'comosus']
['N', 'is']
['N', 'a']
['N', 'tropical']
['N', 'plant']
['N', 'with']
['N', 'edible']
['N', 'multiple']
['N', 'fruit']
['N', 'consisting']
['N', 'of']
['N', 'coalesced']
['N', 'berries']
['N', 'and']
['N', 'the']
['N', 'most']
['N', 'economically']
['N', 'significant']
['N', 'plant']
['N', 'in']
['N', 'the']
['N', 'bromeliaceae']
['N', 'family']
['S', 'pineapples']
['I', 'may']
['I', 'be']
['I', 'cultivated']
['I', 'from']
['I', 'a']
['I', 'crown']
['I', 'cutting']
['I', 'of']
['I', 'the']
['I', 'fruit']
['I', 'possibly']
['I', 'flowering']
['I', 'in']
['I', '20']
['I', '24']
['I', 'months']
['I', 'and']
['I', 'fruiting']
['I', 'in']
['I', 'the']
['I', 'following']
['I', 'six']
['E', 'months']
['N', 'pineapple']
['N', 'does']
['N', 'not']
['N', 'ripen']
['N', 'significantly']
['N', 'post']
['N', 'harvest']
['N', 'pineapples']
['N', 'are']
['N', 'consumed']
['N', 'fresh']
['N', 'cooked']
['N', 'juiced']
['N', 'and']
['N', 

Cleaned Documents: train and test

train_answers - contains the ['Question','DocumentID','Answer'] 

train_documents - contains the ['DocumentID','Document']

train_doc_ans_labels - contains a list of lists of answer tags, one list per document.

In [165]:
# Put each cleaned document next to its cleaned question, ready for word embeddings.
# The two Series are combined on their index labels.
train_doc_ques = pd.DataFrame({'Document': train_documents['Document'],
                               'Question': train_answers['Question']})
test_doc_ques = pd.DataFrame({'Document': test_documents['Document'],
                               'Question': test_answers['Question']})

### Word Embeddings

To use the CBOW model, we need the data as tokenised sentences. Extract these from the dataset prepared above; don't use `sent_tokenize`, because it mishandles some of the full stops and we want to keep the structure built above.

In [166]:
def word_tokens(data):
    """Tokenize every text in ``data`` with ``word_tokenize``.

    ``data`` is iterated in positional order, so a Series whose index is not
    0 .. n-1 (for example after filtering) is handled as well as a plain list.

    Returns a list holding one token list per input text.
    """
    return [word_tokenize(text) for text in data]
# Token lists for every document and question in both splits.
train_doc_list = word_tokens(train_doc_ques['Document'])
train_ques_list = word_tokens(train_doc_ques['Question'])
test_doc_list = word_tokens(test_doc_ques['Document'])
test_ques_list = word_tokens(test_doc_ques['Question'])

In [167]:
# Training corpus for the embeddings: all train and test documents and questions.
combined_text = train_doc_list + train_ques_list + test_doc_list + test_ques_list

In [168]:
# Train 100-dimensional word vectors on the combined corpus and save the model.
# min_count=1 keeps every token that occurs at least once.
# NOTE(review): no `sg` argument is passed, so this relies on gensim's default
# training algorithm being CBOW -- confirm.
# NOTE(review): no seed is passed, so vectors may differ between runs.
wc_cbow_model = Word2Vec(sentences=combined_text, vector_size=100, window=5, min_count=1, workers=2, epochs=30)
wc_cbow_model.save("cbow.model")

To implement QA

1. Word Embeddings, using CBOW
2. Feature Extraction 1 - POS tags
3. Feature Extraction 2 - TF-IDF 
4. Feature Extraction 3 - NER

In [169]:
def get_word_embeddings(doc):
    """Return one ``wc_cbow_model`` vector per token of ``doc``.

    ``doc`` is tokenized with ``word_tokenize`` and each token is looked up in
    ``wc_cbow_model.wv``; the vectors are returned in token order.
    """
    vectors = []
    for token in word_tokenize(doc):
        vectors.append(wc_cbow_model.wv[token])
    return vectors

# Per-token embedding lists for every document and question in both splits.
train_doc_ques['Doc_Embeddings'] = train_doc_ques['Document'].apply(get_word_embeddings)
train_doc_ques['Q_Embeddings'] = train_doc_ques['Question'].apply(get_word_embeddings)
test_doc_ques['Doc_Embeddings'] = test_doc_ques['Document'].apply(get_word_embeddings)
test_doc_ques['Q_Embeddings'] = test_doc_ques['Question'].apply(get_word_embeddings)

In [170]:
# Keep the tokens themselves next to their embeddings (same tokenizer as get_word_embeddings).
train_doc_ques['Doc_Tokens'] = train_doc_ques['Document'].apply(word_tokenize)
train_doc_ques['Q_Tokens'] =  train_doc_ques['Question'].apply(word_tokenize)
test_doc_ques['Doc_Tokens'] = test_doc_ques['Document'].apply(word_tokenize)
test_doc_ques['Q_Tokens'] = test_doc_ques['Question'].apply(word_tokenize)

In [171]:
def check_count(doc):
    """Count rows whose embedding and token lists differ in length.

    A row counts once when its 'Doc_Embeddings'/'Doc_Tokens' lengths differ
    or, failing that, when its 'Q_Embeddings'/'Q_Tokens' lengths differ.
    Rows are read with the labels 0 .. len(doc) - 1.
    """
    mismatched = 0
    for row in range(len(doc)):
        doc_ok = len(doc['Doc_Embeddings'][row]) == len(doc['Doc_Tokens'][row])
        if doc_ok and len(doc['Q_Embeddings'][row]) == len(doc['Q_Tokens'][row]):
            continue
        mismatched += 1
    return mismatched
        
check_count(train_doc_ques) # looks good

0

Note, need to convert the POS tags, NER tags into embeddings. After this, pad the questions and answers to the max question/document length in the combined training and test set.

### PoS Tagging

In [172]:
# Tag every token with its part of speech using nltk's pos_tag.
from nltk.tag import pos_tag
# Download the tagger resource that pos_tag needs.
nltk.download('averaged_perceptron_tagger')

# Each cell of the new columns is a list of (token, POS tag) pairs.
train_doc_ques['Doc_POS'] = train_doc_ques['Doc_Tokens'].apply(pos_tag)
train_doc_ques['Q_POS'] =  train_doc_ques['Q_Tokens'].apply(pos_tag)
test_doc_ques['Doc_POS'] = test_doc_ques['Doc_Tokens'].apply(pos_tag)
test_doc_ques['Q_POS'] = test_doc_ques['Q_Tokens'].apply(pos_tag)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/daniel/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [173]:
# checking the POS tags: # looks ok
test_doc_ques['Q_POS'][0]

[('how', 'WRB'),
 ('african', 'JJ'),
 ('americans', 'NNS'),
 ('were', 'VBD'),
 ('immigrated', 'VBN'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('us', 'PRP')]

In [174]:
# Gather every POS-tagged sequence (documents and questions, train and test)
# so the tag vocabulary covers all of them.
all_pos_tags = train_doc_ques['Doc_POS'].tolist() + test_doc_ques['Doc_POS'].tolist() + train_doc_ques['Q_POS'].tolist() + test_doc_ques['Q_POS'].tolist()

def get_unique_pos(data):
    """Map every distinct tag in ``data`` to its rank in sorted order.

    ``data`` is an iterable of sequences of (token, tag) pairs. The result is
    a dict from tag to a 0-based index assigned in sorted tag order.
    """
    seen_tags = {tag for tagged_sequence in data for _, tag in tagged_sequence}
    return {tag: position for position, tag in enumerate(sorted(seen_tags))}

pos_iden = get_unique_pos(all_pos_tags) # list of tags
pos_iden

{'$': 0,
 'CC': 1,
 'CD': 2,
 'DT': 3,
 'EX': 4,
 'FW': 5,
 'IN': 6,
 'JJ': 7,
 'JJR': 8,
 'JJS': 9,
 'MD': 10,
 'NN': 11,
 'NNP': 12,
 'NNPS': 13,
 'NNS': 14,
 'PDT': 15,
 'POS': 16,
 'PRP': 17,
 'PRP$': 18,
 'RB': 19,
 'RBR': 20,
 'RBS': 21,
 'RP': 22,
 'SYM': 23,
 'TO': 24,
 'UH': 25,
 'VB': 26,
 'VBD': 27,
 'VBG': 28,
 'VBN': 29,
 'VBP': 30,
 'VBZ': 31,
 'WDT': 32,
 'WP': 33,
 'WP$': 34,
 'WRB': 35}

### NER Tagging

### Steps to run this:

- pip install spacy 
- python -m spacy download en_core_web_sm

If loaded for the first time, restart kernel

In [175]:
# NER with spaCy. One-time setup:
# pip install -U spacy
# python -m spacy download en_core_web_sm
import spacy
import en_core_web_sm

# Load the pre-trained pipeline whose NER component is used below.
nlp = en_core_web_sm.load()

In [176]:
def ner_tagging(texts):
    """Tag pre-tokenized texts with the NER component of ``nlp``.

    Each item of ``texts`` is a sequence of tokens. A spaCy ``Doc`` is built
    directly from those tokens and only the "ner" pipe is run on it.

    Returns one list of (token text, entity type) pairs per input text.
    """
    def _tag(words):
        doc = spacy.tokens.Doc(nlp.vocab, words=words)
        nlp.get_pipe("ner")(doc)
        return [(token.text, token.ent_type_) for token in doc]

    return [_tag(words) for words in texts]

In [177]:
# Will take a while: the NER pipe is run on every document and question.
train_doc_ques['Doc_NER'] = ner_tagging(train_doc_ques['Doc_Tokens'])
train_doc_ques['Q_NER'] = ner_tagging(train_doc_ques['Q_Tokens'])
test_doc_ques['Doc_NER'] = ner_tagging(test_doc_ques['Doc_Tokens'])
test_doc_ques['Q_NER'] = ner_tagging(test_doc_ques['Q_Tokens'])

In [178]:
# Similar approach to the POS

# Gather every NER-tagged sequence (documents and questions, train and test)
all_ner_tags = train_doc_ques['Doc_NER'].tolist() + test_doc_ques['Doc_NER'].tolist() + train_doc_ques['Q_NER'].tolist() + test_doc_ques['Q_NER'].tolist()

def get_unique_ner(data):
    """Map every distinct entity label in ``data`` to a 0-based index.

    ``data`` is an iterable of sequences of (token, label) pairs. Indices are
    assigned in sorted label order.
    """
    labels = set()
    for tagged_sequence in data:
        labels.update(label for _, label in tagged_sequence)
    return dict(zip(sorted(labels), range(len(labels))))

# Build the NER tag -> index mapping with the NER helper defined in this cell
# (get_unique_pos is the POS counterpart).
ner_iden = get_unique_ner(all_ner_tags) # dict: NER tag -> index
ner_iden

{'': 0,
 'CARDINAL': 1,
 'DATE': 2,
 'EVENT': 3,
 'FAC': 4,
 'GPE': 5,
 'LANGUAGE': 6,
 'LAW': 7,
 'LOC': 8,
 'MONEY': 9,
 'NORP': 10,
 'ORDINAL': 11,
 'ORG': 12,
 'PERCENT': 13,
 'PERSON': 14,
 'PRODUCT': 15,
 'QUANTITY': 16,
 'TIME': 17,
 'WORK_OF_ART': 18}

In [179]:
# Sanity check: an identity matrix sized from the largest NER index has one
# row per index value.
ner_idx = ner_iden.values()
aa = np.eye(max(ner_idx) + 1)
len(aa)

19

In [180]:
def _one_hot_sequences(tag_index, one_hot_rows, tagged_sequences):
    """Replace each (token, tag) pair by the one-hot row of its tag."""
    return [
        [one_hot_rows[tag_index[pair[1]]] for pair in sequence]
        for sequence in tagged_sequences
    ]


def one_hot_vectorize(pos_tagger, ner_tagger, data): # pass in the unique dict for ner or pos
    """One-hot encode the POS and NER tags of every document and question.

    Parameters
    ----------
    pos_tagger : dict
        Maps each POS tag to its integer index.
    ner_tagger : dict
        Maps each NER tag to its integer index.
    data : pandas.DataFrame
        Frame with 'Doc_POS', 'Q_POS', 'Doc_NER' and 'Q_NER' columns whose
        cells are sequences of (token, tag) pairs.

    Returns
    -------
    tuple
        ``(doc_pos, q_pos, doc_ner, q_ner)``. Each is a list with one entry
        per row of ``data``; an entry is a list of one-hot vectors, one per
        token. Vector length is the largest index of the mapping plus one.

    Raises
    ------
    KeyError
        If a tag is missing from the corresponding mapping.
    """
    # One identity matrix per tag set; row i is the one-hot vector of index i.
    pos_ohv = np.eye(max(pos_tagger.values()) + 1)
    ner_ohv = np.eye(max(ner_tagger.values()) + 1)

    dpos_full_ohv = _one_hot_sequences(pos_tagger, pos_ohv, data['Doc_POS'])
    qpos_full_ohv = _one_hot_sequences(pos_tagger, pos_ohv, data['Q_POS'])
    dner_full_ohv = _one_hot_sequences(ner_tagger, ner_ohv, data['Doc_NER'])
    qner_full_ohv = _one_hot_sequences(ner_tagger, ner_ohv, data['Q_NER'])

    return(dpos_full_ohv, qpos_full_ohv, dner_full_ohv, qner_full_ohv)


In [181]:
# One-hot POS and NER vectors for the documents and questions of each split.
train_doc_pos_ohv, train_q_pos_ohv, train_doc_ner_ohv, train_q_ner_ohv = one_hot_vectorize(pos_iden, ner_iden, train_doc_ques)
test_doc_pos_ohv, test_q_pos_ohv, test_doc_ner_ohv, test_q_ner_ohv = one_hot_vectorize(pos_iden, ner_iden, test_doc_ques)

In [189]:
train_doc_ques[:300]['Doc_Tokens'][290]

['sidney',
 'patrick',
 'crosby',
 'ons',
 'born',
 'august',
 '7',
 '1987',
 'is',
 'a',
 'canadian',
 'professional',
 'ice',
 'hockey',
 'player',
 'who',
 'is',
 'captain',
 'of',
 'the',
 'pittsburgh',
 'penguins',
 'of',
 'the',
 'national',
 'hockey',
 'league',
 'nhl',
 'crosby',
 'was',
 'drafted',
 'first',
 'overall',
 'by',
 'the',
 'penguins',
 'out',
 'of',
 'the',
 'quebec',
 'major',
 'junior',
 'hockey',
 'league',
 'qmjhl',
 'during',
 'his',
 'two',
 'year',
 'major',
 'junior',
 'career',
 'with',
 'the',
 'rimouski',
 'oc',
 'anic',
 'he',
 'earned',
 'back',
 'to',
 'back',
 'chl',
 'player',
 'of',
 'the',
 'year',
 'awards',
 'and',
 'led',
 'his',
 'club',
 'to',
 'the',
 '2005',
 'memorial',
 'cup',
 'final',
 'nicknamed',
 'the',
 'next',
 'one',
 'he',
 'was',
 'one',
 'of',
 'the',
 'most',
 'highly',
 'regarded',
 'draft',
 'picks',
 'in',
 'hockey',
 'history',
 'leading',
 'many',
 'to',
 'refer',
 'to',
 'the',
 '2005',
 'draft',
 'lottery',
 'as',
 'th

In [183]:
# Reduce the training frame to tokens and embeddings, then attach the one-hot
# POS and NER vectors as extra columns.
# pd.concat(axis=1) matches rows by index label; doc_pos_ner is built from plain
# lists, so this assumes train_doc_ques has a default 0..n-1 index -- TODO confirm.
doc_emb_train = train_doc_ques[['Doc_Tokens','Doc_Embeddings']]
doc_pos_ner = pd.DataFrame({'Doc_POS':train_doc_pos_ohv,
              'Doc_NER':train_doc_ner_ohv})
doc_emb_train = pd.concat([doc_emb_train, doc_pos_ner], axis=1)

# Same for the training questions.
q_emb_train = train_doc_ques[['Q_Tokens','Q_Embeddings']]
q_pos_ner = pd.DataFrame({'Q_POS':train_q_pos_ohv,
              'Q_NER':train_q_ner_ohv})
q_emb_train = pd.concat([q_emb_train, q_pos_ner], axis=1)

In [184]:
# Test-split counterpart: tokens, embeddings and one-hot POS/NER per document.
# Note that doc_pos_ner and q_pos_ner are reassigned here to the test-split values.
doc_emb_test = test_doc_ques[['Doc_Tokens','Doc_Embeddings']]
doc_pos_ner = pd.DataFrame({'Doc_POS':test_doc_pos_ohv,
              'Doc_NER':test_doc_ner_ohv})
doc_emb_test = pd.concat([doc_emb_test, doc_pos_ner], axis=1)

# Same for the test questions.
q_emb_test = test_doc_ques[['Q_Tokens','Q_Embeddings']]
q_pos_ner = pd.DataFrame({'Q_POS':test_q_pos_ohv,
              'Q_NER':test_q_ner_ohv})
q_emb_test = pd.concat([q_emb_test, q_pos_ner], axis=1)

### Word Embeddings (Doc and Qn)
- Still have to add TF-IDF.

The embeddings of the documents and questions of the train and test sets can be found here:

- Train Document - doc_emb_train
- Train Q - q_emb_train
- Test Document - doc_emb_test
- Test Q - q_emb_test

In [190]:
q_emb_test

Unnamed: 0,Q_Tokens,Q_Embeddings,Q_POS,Q_NER
0,"[how, african, americans, were, immigrated, to...","[[-1.2972423, 0.61705947, -2.289424, 1.9959564...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,"[how, large, were, early, jails]","[[-1.2972423, 0.61705947, -2.289424, 1.9959564...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,"[how, a, water, pump, works]","[[-1.2972423, 0.61705947, -2.289424, 1.9959564...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,"[how, old, was, sue, lyon, when, she, made, lo...","[[-1.2972423, 0.61705947, -2.289424, 1.9959564...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,"[how, are, antibodies, used, in]","[[-1.2972423, 0.61705947, -2.289424, 1.9959564...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
...,...,...,...,...
625,"[where, is, the, brisket, from]","[[-0.59588027, -1.4281421, 0.5773438, 1.183183...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
626,"[what, is, arm, chipset]","[[-1.1253945, 0.032915913, -1.9591076, -2.8268...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
627,"[what, is, the, life, span, of, june, bugs]","[[-1.1253945, 0.032915913, -1.9591076, -2.8268...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
628,"[who, is, the, youngest, female, to, give, bir...","[[-1.9893861, 2.442811, 0.9638027, 0.842218, -...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [191]:
# Longest document, in tokens, across the train and test splits (0 if there are none).
max_len_doc = max(
    (len(tokens)
     for split in (doc_emb_train, doc_emb_test)
     for tokens in split['Doc_Tokens']),
    default=0,
)
max_len_doc

1675

In [192]:
# Longest question, in tokens, across the train and test splits (0 if there are none).
max_len_qn = max(
    (len(tokens)
     for split in (q_emb_train, q_emb_test)
     for tokens in split['Q_Tokens']),
    default=0,
)
max_len_qn

23

In [291]:
# Token columns only: the input for the term counts computed below.
training_corpus = train_doc_ques[['Doc_Tokens', 'Q_Tokens']]
test_corpus = test_doc_ques[['Doc_Tokens', 'Q_Tokens']]

# Flatten the lists in each row and concatenate them
def get_squeeze(corpus):
    """Flatten a frame of token lists into one long token list.

    For every row, in order, the row's 'Doc_Tokens' are appended first and
    its 'Q_Tokens' second.
    """
    flattened = []
    for doc_tokens, question_tokens in zip(corpus['Doc_Tokens'], corpus['Q_Tokens']):
        flattened += doc_tokens
        flattened += question_tokens
    return flattened
df_training = get_squeeze(training_corpus)
df_test = get_squeeze(test_corpus)

In [294]:
# Count how often each token occurs
def diction(df):
    """Return a plain dict mapping each token in ``df`` to its occurrence count."""
    counts = {}
    for token in df:
        counts[token] = counts.get(token, 0) + 1
    return counts
d_train, d_test = diction(df_training), diction(df_test)

In [299]:
len(d_test)

16299

In [None]:
## TF-IDF-oriented function:
def tfidf_wikisearch(documents):
  """Find the term with the highest TF-IDF weight and fetch its Wikipedia page.

  Each document has punctuation removed, is tokenized, lower-cased and
  stripped of stop words. Every (document, term) pair is then weighted with
  ``tf * idf``, where ``tf`` is the term's share of the document's tokens and
  ``idf = log(N / (df + 1)) + 1`` for ``N`` documents and document frequency
  ``df``.

  Parameters
  ----------
  documents : sequence of str
      Page texts, read as ``documents[0] .. documents[N - 1]``.

  Returns
  -------
  The ``content`` of the Wikipedia page looked up for the top-weighted term.

  Raises
  ------
  IndexError
      If no term survives the cleaning, so nothing can be ranked.

  NOTE(review): ``sw`` and ``wikipedia`` are not imported anywhere in the
  visible notebook. ``sw`` is presumably nltk's stop-word corpus reader and
  ``wikipedia`` the third-party package of that name -- confirm and import
  them before running this cell.
  """
  # Imported locally because the notebook's import cell provides neither name.
  import math
  from collections import Counter

  N = len(documents)
  print("Total docs in documents: {}".format(N))

  # Stop words plus three extra words; a set keeps the per-token membership
  # test cheap.
  stop_words = set(sw.words())
  stop_words.update(['brisbane', 'melbourne', 'sydney'])

  # Clean and tokenize every document.
  docs = []
  for item in range(N):
    cleaned_page = re.sub(r'[^\w\s]','', documents[item])
    lower_tokens = [t.lower() for t in word_tokenize(cleaned_page)]
    docs.append([w for w in lower_tokens if w not in stop_words])

  # Document frequency: the number of documents each term appears in.
  DF = {}
  for page in docs:
    for term in np.unique(page):
      DF[term] = DF.get(term, 0) + 1

  # TF-IDF weight for every (document index, term) pair.
  tf_idf = {}
  for doc_id, tk_doc in enumerate(docs):
    count = Counter(tk_doc)
    total_words = len(tk_doc)
    for word in np.unique(tk_doc):
      tf = count[word]/total_words # share of this document's tokens
      df = DF[word] # number of documents containing the term
      idf = math.log(N/(df+1))+1
      tf_idf[doc_id, word] = tf*idf

  # Rank by weight; the stable sort keeps the earliest pair among ties.
  sorted_dict = sorted(tf_idf.items(), key=lambda x: x[1], reverse=True)
  top = sorted_dict[0]

  # Look up the Wikipedia page for the top term and return its content.
  top_word = top[0][1]
  top_page = wikipedia.page(top_word).content
  print("The word with the highest TF-IDF score is : '{}' - {}. \n".format(top_word,top[1]))

  return(top_page)