In [52]:
import json
import pandas as pd
import spacy
from ast import literal_eval


In [53]:
# Load the spaCy pipeline named "en_core_web_sm"; find_matches_from_text uses it
# to split each text into sentences (`doc.sents`).
nlp = spacy.load("en_core_web_sm")


In [54]:
# Load the logged sentences. The file is decoded explicitly as UTF-8 instead of
# the platform's default encoding, so non-ASCII characters in the sentences
# (e.g. curly quotes) are read the same way on every machine.
with open("nonstatistical.json", "r", encoding="utf-8") as f:
    logs = json.load(f)

In [55]:
# Show the parsed log entries; each one is a dict with an '_id' and a 'sentence'.
# NOTE(review): this displays the entire list — consider a slice such as logs[:5].
logs

[{'_id': {'$oid': '5e6e50751233a83c16e01f40'}, 'sentence': 'This is a test'},
 {'_id': {'$oid': '5e6e53138f6ae9f5e5e01f40'},
  'sentence': 'These students have fixed the check'},
 {'_id': {'$oid': '5e6e531c1233a83c16e01f43'},
  'sentence': 'These students have fixed the check'},
 {'_id': {'$oid': '5e6e53251233a83c16e01f45'},
  'sentence': 'These students have fixed the check'},
 {'_id': {'$oid': '5e6e53321233a83c16e01f46'},
  'sentence': 'pets can pass the virus'},
 {'_id': {'$oid': '5e6e53371233a83c16e01f47'},
  'sentence': 'stanford hospital'},
 {'_id': {'$oid': '5e6e53531233a83c16e01f48'},
  'sentence': 'These students have fixed the check'},
 {'_id': {'$oid': '5e6e536f1233a83c16e01f49'},
  'sentence': 'it will first infect the throat'},
 {'_id': {'$oid': '5e6e53731233a83c16e01f4a'},
  'sentence': 'it will first infect the throat'},
 {'_id': {'$oid': '5e6e538a4a7b365c36e01f40'}, 'sentence': 'This is a test'},
 {'_id': {'$oid': '5e6e55691233a83c16e01f4c'},
  'sentence': 'France had i

In [56]:
# Number of log entries loaded from the JSON file.
len(logs)

3439

In [57]:
# Location of the AKB triples export.
# NOTE(review): absolute path on one machine — point this at your own copy of the file.
AKB_TRIPLES_CSV = "/Users/georgekaragiannis/Desktop/Cornell/research/akb_demo/data/ak_03312020_triples.csv"

# The subj / pred / obj columns are stored as Python literals in the CSV, so each
# cell is parsed back into a Python object with ast.literal_eval while loading.
akb_df = pd.read_csv(
    AKB_TRIPLES_CSV,
    converters=dict.fromkeys(("subj", "pred", "obj"), literal_eval),
)

In [63]:
def ngrams(split_sent, n):
    """
    Return every contiguous window of `n` items from `split_sent`, in order,
    each window as a tuple. The result is an empty list when `split_sent`
    has fewer than `n` items.
    """
    last_start = len(split_sent) - n
    return [tuple(split_sent[start:start + n]) for start in range(last_start + 1)]

def get_ngrams_list(sent, n):
    """
    Build the n-grams of `sent` for every size from 1 up to `n`.

    The sentence is tokenized on runs of whitespace, so repeated, leading or
    trailing spaces (and tabs / newlines) never produce empty-string tokens.

    Parameters
    ----------
    sent : str
        The sentence to tokenize.
    n : int
        Largest n-gram size wanted. It is capped at the number of tokens when
        the sentence is too short.

    Returns
    -------
    list of list of tuple
        One inner list per size, smallest first. Example for n=3:
        [unigrams, bigrams, trigrams]. Empty when `sent` has no tokens.
    """
    tokens = sent.split()
    # if we can't extract ngrams of size n because the sentence is not big enough
    max_size = min(n, len(tokens))
    return [ngrams(tokens, size) for size in range(1, max_size + 1)]

In [64]:
def ak_row_in_text(ak_df, sent, max_n=4):
    """
    Find the AKB rows whose subject, predicate and object all occur in `sent`.

    A row matches when its `subj` value equals some n-gram of the sentence,
    its `pred` value equals some n-gram, and its `obj` value equals some
    n-gram (n-grams are tuples of 1 to `max_n` whitespace-separated tokens).

    Parameters
    ----------
    ak_df : pandas.DataFrame
        The AKB table; must have `subj`, `pred` and `obj` columns whose values
        are comparable to n-gram tuples.
    sent : str
        The sentence to search. No case folding is done here.
    max_n : int, default 4
        Largest n-gram size extracted from the sentence.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `ak_df`, in their original order, or an empty
        frame with the same columns when nothing matches.
    """
    ngrams_list = get_ngrams_list(sent, max_n)
    # One boolean flag per AKB row and per column: has any n-gram matched it yet?
    hits = {
        col: pd.Series(False, index=ak_df.index)
        for col in ("subj", "pred", "obj")
    }
    for same_size_ngrams in ngrams_list:
        for col in hits:
            hits[col] = hits[col] | ak_df[col].isin(same_size_ngrams)

    # A row is kept only if all three of its parts were seen in the sentence.
    matched = hits["subj"] & hits["pred"] & hits["obj"]
    if not matched.any():
        return pd.DataFrame(columns=list(ak_df.columns))
    # Select by boolean mask rather than feeding index labels to the positional
    # `.iloc`, which picks the wrong rows whenever the index is not 0..n-1.
    return ak_df.loc[matched.to_numpy()]

In [65]:
def find_matches_from_text(text):
    """
    Look up every sentence of `text` in the AKB.

    The text is split into sentences with the module-level spaCy pipeline
    `nlp`; each sentence is lower-cased and passed to `ak_row_in_text`
    together with the module-level `akb_df`.
    Note: for now no triples are extracted from the text itself; we only check
    whether the parts of an AKB triple occur in the sentence.

    Returns a list of (sentence_text, matches_df) tuples, one per sentence that
    produced at least one AKB row. The sentence text keeps its original casing.
    The list is empty when nothing matched.
    """
    found = []
    for sentence in nlp(text).sents:
        sentence_text = sentence.text
        hits_df = ak_row_in_text(akb_df, sentence_text.lower())
        if len(hits_df.index) == 0:
            continue
        found.append((sentence_text, hits_df))
    return found


In [66]:
# Run the AKB matcher over every logged sentence. `all_matches` receives exactly
# one entry per log (an empty list when nothing matched), in the order of `logs`.
all_matches = []
for i, log in enumerate(logs):
    # Progress indicator: print the running index every 100 logs.
    if i % 100 == 0:
        print(i)
    sentence = log["sentence"]
    matches_list = find_matches_from_text(sentence)
    all_matches.append(matches_list)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400


In [67]:
# Drop the logs that produced no match at all (their entry is an empty list).
# After this step `all_matches` no longer lines up index-for-index with `logs`.
all_matches = [found for found in all_matches if found]

In [68]:
# Number of logs with at least one matching sentence.
len(all_matches)

15

In [69]:
# Show every match: a list (per log) of (sentence_text, matching AKB rows) tuples.
all_matches

[[('coronavirus is chinese virus',
                                                     text  \
   962  The new coronavirus is the “least deadly virus...   
   
                                                      url  \
   962  https://www.factcheck.org/2020/03/infographic-...   
   
                                                url_title rating            subj  \
   962  Infographic on Facebook Distorts Comparative F...  False  (coronavirus,)   
   
         pred       obj  
   962  (is,)  (virus,)  )],
 [('over a million people have died due to the coronavirus',
                                                     text  \
   106  People died on the streets in China due to the...   
   
                                                      url  \
   106  https://factcheck.afp.com/2014-photo-people-pa...   
   
                                                url_title rating       subj  \
   106  This is a 2014 photo of people participating i...  False  (people,)   
   
           

In [23]:
# Sentence text of the first match of the first entry in `all_matches`.
# NOTE(review): this cell's execution count is lower than the cells above and its
# saved output differs from the first entry shown in the full `all_matches`
# display — it looks like it was run against an earlier state; re-run to confirm.
all_matches[0][0][0]

'over a million people have died due to the coronavirus'

In [29]:
# `text` column of the AKB rows paired with that first matched sentence.
# NOTE(review): out-of-order execution count — the saved output may be stale.
all_matches[0][0][1].text

0    People died on the streets in China due to the...
Name: text, dtype: object

In [45]:
"virus" in "Coronavirus is milder than flu"

True