### Requirements: simplified_wiki and populated_samples json files generated by Data_Preprocessing.ipynb

In [400]:
import pandas as pd
import numpy as np
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import linear_kernel

In [401]:
data = pd.read_json("simplified_wiki.jsonl", lines = True)
df = pd.read_json("populated_samples.jsonl", lines = True)

In [402]:
df

Unnamed: 0,label,claim,evidences
0,SUPPORTS,Nikolaj Coster-Waldau worked with the Fox Broa...,"[[Nikolaj_Coster-Waldau, 7, He then played Det..."
1,SUPPORTS,Roman Atwood is a content creator.,"[[Roman_Atwood, 1, He is best known for his vl..."
2,SUPPORTS,"History of art includes architecture, dance, s...","[[History_of_art, 2, The subsequent expansion ..."
3,REFUTES,Adrienne Bailon is an accountant.,"[[Adrienne_Bailon, 0, Adrienne Eliza Houghton ..."
4,NOT ENOUGH INFO,System of a Down briefly disbanded in limbo.,[]
...,...,...,...
144725,REFUTES,Led Zeppelin released an eponymous debut album...,"[[Led_Zeppelin, 6, Although the group was init..."
144726,SUPPORTS,Taal was romantic.,"[[Taal_-LRB-film-RRB-, 0, Taal LRB English Rhy..."
144727,SUPPORTS,Her stars American actress Rooney Mara.,"[[Her_-LRB-film-RRB-, 3, The film also stars A..."
144728,SUPPORTS,J. R. R. Tolkien created Gimli.,"[[Gimli_-LRB-Middle-earth-RRB-, 0, Gimli is a ..."


In [403]:
data

Unnamed: 0,id,sentences,text
0,19_Kids_and_Counting,[19 Kids and Counting LRB formerly 17 Kids and...,19 Kids and Counting LRB formerly 17 Kids and ...
1,16th_IIFA_Awards,[The 2015 IIFA Awards officially known as the ...,The 2015 IIFA Awards officially known as the ...
2,12_Play,[12 Play is the debut studio album by American...,12 Play is the debut studio album by American ...
3,1998_NFL_season,[The 1998 NFL season was the 79th regular seas...,The 1998 NFL season was the 79th regular seaso...
4,12-hour_clock,[The 12 hour clock is a time convention in whi...,The 12hour clock is a time convention in which...
...,...,...,...
19903,Xochitlán_Todos_Santos,[Xochitl n Todos Santos LRB municipality RRB i...,Xochitln Todos Santos LRB municipality RRB is ...
19904,Øystein_Skar,[ystein Skar LRB born 15 October 1985 in Vinst...,ystein Skar LRB born 15 October 1985 in Vinstr...
19905,Överskottsbolaget,[oB LRB short for verskottsbolaget RRB is a Sw...,oB LRB short for verskottsbolaget RRB is a Swe...
19906,X_Marks_the_Spot_-LRB-1942_film-RRB-,[X Marks the Spot is a 1942 film noir crime fi...,X Marks the Spot is a 1942 film noir crime fil...


## Change the data split or use the entire dataset

In [404]:
t, test = train_test_split(df, test_size=0.05, random_state=1234, shuffle=True)
test.reset_index(inplace = True)

In [405]:
test["label1"] = test.label.apply(label2int)
test.drop(["label", "index"], axis="columns", inplace=True)
test.rename(columns = {"label1": "label"}, inplace=True)

In [406]:
test

Unnamed: 0,claim,evidences,label
0,Liv Tyler performed in The Leftovers.,"[[Liv_Tyler, 16, In 2014 Tyler made her televi...",0
1,The Beatles played music in Hamburg.,[],2
2,Reuters transmits news in Hindi.,[],2
3,The French Revolution led to the Reign of Terr...,"[[French_Revolution, 21, The dictatorship impo...",1
4,Mrigayaa won at the 24th National Film Awards ...,[],2
...,...,...,...
7232,Will Ferrell starred in Anchorman: The Legend ...,"[[Will_Ferrell, 1, He first established himsel...",0
7233,Ryan Dusick is a record producer who is American.,"[[Ryan_Dusick, 0, Ryan Michael Dusick LRB born...",0
7234,Ringo Starr made the official website for the ...,[],2
7235,"In 1921, Douglas Aircraft Company was founded.","[[Douglas_Aircraft_Company, 1, It was founded ...",0


In [407]:
tf_idf = TfidfVectorizer()

In [408]:
vector = tf_idf.fit_transform(data.text)
test_vector = tf_idf.transform(test.claim.values.tolist())

In [409]:
def get_top(data, samples, top=5):
    similarity = linear_kernel(samples, data)
    return np.argsort(similarity)[:, -top:][::]

In [410]:
def getDocsID(sample):
    ret = set()
    for s in sample:
        ret.add(s[0])
    return list(ret)

def getEviID(sample):
    ret = []
    for s in sample:
        ret.append(s[0] + "_" + str(s[1]))
    return ret

In [411]:
test["docsID"] = test.evidences.apply(getDocsID)
test["eviID"] = test.evidences.apply(getEviID)

In [412]:
test

Unnamed: 0,claim,evidences,label,docsID,eviID
0,Liv Tyler performed in The Leftovers.,"[[Liv_Tyler, 16, In 2014 Tyler made her televi...",0,[Liv_Tyler],[Liv_Tyler_16]
1,The Beatles played music in Hamburg.,[],2,[],[]
2,Reuters transmits news in Hindi.,[],2,[],[]
3,The French Revolution led to the Reign of Terr...,"[[French_Revolution, 21, The dictatorship impo...",1,"[French_Revolution, Reign_of_Terror]","[French_Revolution_21, Reign_of_Terror_0, Reig..."
4,Mrigayaa won at the 24th National Film Awards ...,[],2,[],[]
...,...,...,...,...,...
7232,Will Ferrell starred in Anchorman: The Legend ...,"[[Will_Ferrell, 1, He first established himsel...",0,[Will_Ferrell],[Will_Ferrell_1]
7233,Ryan Dusick is a record producer who is American.,"[[Ryan_Dusick, 0, Ryan Michael Dusick LRB born...",0,[Ryan_Dusick],"[Ryan_Dusick_0, Ryan_Dusick_0, Ryan_Dusick_0]"
7234,Ringo Starr made the official website for the ...,[],2,[],[]
7235,"In 1921, Douglas Aircraft Company was founded.","[[Douglas_Aircraft_Company, 1, It was founded ...",0,[Douglas_Aircraft_Company],[Douglas_Aircraft_Company_1]


### NOTE:

### This algo is a reduced version of the getRetrievalData method used in tf-iDF based Document Retrieval and Sentence Retrieval. It removes the creation of the dataset and adds the calculation of correctly retrieved Documents and Sentences for faster calculation.

In [413]:
def getRetrievalAccuracies(samples, sample_vec, top_num = [5]):
    ret_accuracy = {}
    evi_accuracy = {}
    
    for t in top_num:
        
        top = get_top(vector, sample_vec, t)
        retrieval = 0
        total_ret = 0
        evidence = 0
        total_evi = 0
        for i, row in samples.iterrows():
            docs = data.iloc[top[i]]
            sent = []
            sentid = []
            for j, d in docs.iterrows():
                for k, s in enumerate(d.sentences):
                    sent.append(s)
                    sentid.append(d.id + "_" + str(k))

            vec = tf_idf.transform(sent)
            top_evi = get_top(vec, test_vector[i])        

            true_evi = []
            if row.label != 2:
                total_ret += len(row.docsID)
                retrieval += len(set(docs.id.values.tolist()).intersection(row.docsID))

                p_id = set()
                for j in top_evi[0]:
                    p_id.add(sentid[j])
                
                true_evi = list(p_id.intersection(row.eviID))
                total_evi += len(row.eviID)
                evidence += len(true_evi)
        
        ret_accuracy[f'top_{t}'] = retrieval/total_ret
        evi_accuracy[f'top_{t}'] = evidence/total_evi
    return ret_accuracy, evi_accuracy
    
    


In [414]:
document_acc, sentence_acc = getRetrievalAccuracies(test, test_vector, [5, 10, 20, 50, 100])

In [415]:
retrieval_accuracy_df = pd.DataFrame([document_acc, sentence_acc])
keys = pd.Index(["tf-iDF Document Retrieval", "tf-iDF Sentence Retrieval"])
retrieval_accuracy_df.set_index(keys, inplace=True)
retrieval_accuracy_df

Unnamed: 0,top_5,top_10,top_20,top_50,top_100
tf-iDF Document Retrieval,0.8465,0.897345,0.929364,0.953821,0.96605
tf-iDF Sentence Retrieval,0.36034,0.351756,0.342559,0.333888,0.328896
