In [1]:
import pandas as pd

In [2]:
# Relative paths of the three global time-series CSVs: confirmed cases, deaths, recoveries.
cases_path, deaths_path, recovered_path = (
    "csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
    "csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
    "csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv",
)

In [3]:
# Read each time-series CSV into its own DataFrame, in the order
# confirmed cases, deaths, recoveries.
cases_df, deaths_df, recovered_df = (
    pd.read_csv(csv_path)
    for csv_path in (cases_path, deaths_path, recovered_path)
)

In [4]:
cases_df

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/16/20,3/17/20,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20
0,,Afghanistan,33.000000,65.000000,0,0,0,0,0,0,...,21,22,22,22,24,24,40,40,74,84
1,,Albania,41.153300,20.168300,0,0,0,0,0,0,...,51,55,59,64,70,76,89,104,123,146
2,,Algeria,28.033900,1.659600,0,0,0,0,0,0,...,54,60,74,87,90,139,201,230,264,302
3,,Andorra,42.506300,1.521800,0,0,0,0,0,0,...,2,39,39,53,75,88,113,133,164,188
4,,Angola,-11.202700,17.873900,0,0,0,0,0,0,...,0,0,0,0,1,2,2,3,3,3
5,,Antigua and Barbuda,17.060800,-61.796400,0,0,0,0,0,0,...,1,1,1,1,1,1,1,3,3,3
6,,Argentina,-38.416100,-63.616700,0,0,0,0,0,0,...,56,68,79,97,128,158,266,301,387,387
7,,Armenia,40.069100,45.038200,0,0,0,0,0,0,...,52,78,84,115,136,160,194,235,249,265
8,Australian Capital Territory,Australia,-35.473500,149.012400,0,0,0,0,0,0,...,2,2,3,4,6,9,19,32,39,39
9,New South Wales,Australia,-33.868800,151.209300,0,0,0,0,3,4,...,171,210,267,307,353,436,669,669,818,1029


In [10]:
# Preview, for the first row only, the list of counts that would be turned into entries.
for i, row in cases_df.iterrows():
    # Take the last column by position with .iloc: plain row[-1] on a Series whose
    # labels are strings relies on a positional fallback that newer pandas deprecates.
    latest_count = row.iloc[-1]
    print(latest_count)
    max_num = int(latest_count)
    # Every count strictly below the latest value (the latest value itself is excluded).
    row_values = range(0, max_num)
    print(list(row_values))
    break  # one row is enough for the preview

84
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83]


In [35]:
def construct_akb_entry_from_row(state, country, row_values, topic):
    """
    Build one AKB entry (a dict) per value in row_values for a single region.

    state      -- province/state name; used only when it is a str (so a NaN
                  placeholder is ignored and the subject is just the country)
    country    -- country/region name
    row_values -- iterable of values; each one is rendered with str() into a claim
    topic      -- noun placed after the value, e.g. "cases", "deaths", "recoveries"

    Returns a list of dicts with the keys text, url, url_title, rating, subj,
    pred, obj (in that order). Every entry gets the rating string "False" and
    the same fixed url / url_title.
    """
    def _entry_for(count):
        # Subject is "<state> <country>" when a state name is present, else the country.
        subject = " ".join([state, country]) if isinstance(state, str) else country
        verb = "has"
        claim_obj = " ".join([str(count), topic])
        return {
            "text": " ".join([subject, verb, claim_obj]),
            "url": "https://coronavirus.jhu.edu/",
            "url_title": "JHU Coronavirus Data",
            "rating": "False",
            "subj": subject,
            "pred": verb,
            "obj": claim_obj,
        }

    return [_entry_for(count) for count in row_values]

In [36]:
def construct_akb_df(df, topic):
    """
    Build a DataFrame of AKB entries from a time-series DataFrame.

    For every row of df, the value in the LAST column is read and one entry is
    generated for each integer from 0 up to (but not including) that value,
    via construct_akb_entry_from_row. Only the last column is used; the
    earlier date columns are not read.

    df    -- DataFrame with "Province/State" and "Country/Region" columns and
             the count to use in its last column
    topic -- noun passed through to the entries, e.g. "cases", "deaths"

    Returns a DataFrame with one row per generated entry. Each source row's
    entries keep their own 0-based index, so index labels repeat across
    regions. An empty DataFrame is returned when nothing was generated.
    Rows whose last value raises ValueError on int() (e.g. NaN) are skipped.
    """
    per_row_frames = []
    for _, row in df.iterrows():
        try:
            # .iloc selects by position; row[-1] on a string-labelled Series
            # depends on a positional fallback that newer pandas deprecates.
            row_max_num = int(row.iloc[-1])
        except ValueError:
            continue
        row_values = list(range(0, row_max_num))
        akb_entries = construct_akb_entry_from_row(row["Province/State"], row["Country/Region"], row_values, topic)
        if len(akb_entries) > 0:
            per_row_frames.append(pd.DataFrame(akb_entries))
    if not per_row_frames:
        # pd.concat raises on an empty list; keep returning an empty frame instead.
        return pd.DataFrame()
    # Concatenate once: DataFrame.append no longer exists in pandas >= 2.0, and
    # growing a frame inside the loop copies all accumulated rows every time.
    return pd.concat(per_row_frames)

In [37]:
# Generate the entry tables for the three topics, in the order cases, deaths, recoveries.
akb_cases_df, akb_deaths_df, akb_recovered_df = (
    construct_akb_df(source_df, topic_label)
    for source_df, topic_label in (
        (cases_df, "cases"),
        (deaths_df, "deaths"),
        (recovered_df, "recoveries"),
    )
)

In [38]:
akb_cases_df.sample(10)

Unnamed: 0,obj,pred,rating,subj,text,url,url_title
69177,69177 cases,has,False,Italy,Italy has 69177 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data
24158,24158 cases,has,False,Spain,Spain has 24158 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data
9538,9538 cases,has,False,France,France has 9538 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data
44035,44035 cases,has,False,US,US has 44035 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data
849,849 cases,has,False,Italy,Italy has 849 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data
23100,23100 cases,has,False,Hubei China,Hubei China has 23100 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data
55002,55002 cases,has,False,Hubei China,Hubei China has 55002 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data
20266,20266 cases,has,False,Germany,Germany has 20266 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data
163,163 cases,has,False,Croatia,Croatia has 163 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data
845,845 cases,has,False,Saudi Arabia,Saudi Arabia has 845 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data


In [39]:
len(akb_cases_df)

467594

In [40]:
akb_deaths_df.sample(10)

Unnamed: 0,obj,pred,rating,subj,text,url,url_title
1098,1098 deaths,has,False,Hubei China,Hubei China has 1098 deaths,https://coronavirus.jhu.edu/,JHU Coronavirus Data
558,558 deaths,has,False,Hubei China,Hubei China has 558 deaths,https://coronavirus.jhu.edu/,JHU Coronavirus Data
2048,2048 deaths,has,False,Spain,Spain has 2048 deaths,https://coronavirus.jhu.edu/,JHU Coronavirus Data
347,347 deaths,has,False,Hubei China,Hubei China has 347 deaths,https://coronavirus.jhu.edu/,JHU Coronavirus Data
3164,3164 deaths,has,False,Italy,Italy has 3164 deaths,https://coronavirus.jhu.edu/,JHU Coronavirus Data
920,920 deaths,has,False,Iran,Iran has 920 deaths,https://coronavirus.jhu.edu/,JHU Coronavirus Data
1975,1975 deaths,has,False,Italy,Italy has 1975 deaths,https://coronavirus.jhu.edu/,JHU Coronavirus Data
1864,1864 deaths,has,False,Hubei China,Hubei China has 1864 deaths,https://coronavirus.jhu.edu/,JHU Coronavirus Data
3278,3278 deaths,has,False,Italy,Italy has 3278 deaths,https://coronavirus.jhu.edu/,JHU Coronavirus Data
324,324 deaths,has,False,US,US has 324 deaths,https://coronavirus.jhu.edu/,JHU Coronavirus Data


In [41]:
akb_recovered_df.sample(10)

Unnamed: 0,obj,pred,rating,subj,text,url,url_title
290,290 recoveries,has,False,Fujian China,Fujian China has 290 recoveries,https://coronavirus.jhu.edu/,JHU Coronavirus Data
50029,50029 recoveries,has,False,Hubei China,Hubei China has 50029 recoveries,https://coronavirus.jhu.edu/,JHU Coronavirus Data
7314,7314 recoveries,has,False,Hubei China,Hubei China has 7314 recoveries,https://coronavirus.jhu.edu/,JHU Coronavirus Data
11,11 recoveries,has,False,Canada,Canada has 11 recoveries,https://coronavirus.jhu.edu/,JHU Coronavirus Data
6880,6880 recoveries,has,False,Italy,Italy has 6880 recoveries,https://coronavirus.jhu.edu/,JHU Coronavirus Data
2374,2374 recoveries,has,False,France,France has 2374 recoveries,https://coronavirus.jhu.edu/,JHU Coronavirus Data
45164,45164 recoveries,has,False,Hubei China,Hubei China has 45164 recoveries,https://coronavirus.jhu.edu/,JHU Coronavirus Data
14158,14158 recoveries,has,False,Hubei China,Hubei China has 14158 recoveries,https://coronavirus.jhu.edu/,JHU Coronavirus Data
1060,1060 recoveries,has,False,"Korea, South","Korea, South has 1060 recoveries",https://coronavirus.jhu.edu/,JHU Coronavirus Data
4930,4930 recoveries,has,False,Iran,Iran has 4930 recoveries,https://coronavirus.jhu.edu/,JHU Coronavirus Data


In [42]:
len(akb_cases_df)

467594

In [43]:
all_df = pd.concat([akb_cases_df, akb_deaths_df, akb_recovered_df])

In [44]:
all_df = all_df.drop_duplicates()

In [45]:
all_df

Unnamed: 0,obj,pred,rating,subj,text,url,url_title
0,0 cases,has,False,Afghanistan,Afghanistan has 0 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data
1,1 cases,has,False,Afghanistan,Afghanistan has 1 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data
2,2 cases,has,False,Afghanistan,Afghanistan has 2 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data
3,3 cases,has,False,Afghanistan,Afghanistan has 3 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data
4,4 cases,has,False,Afghanistan,Afghanistan has 4 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data
5,5 cases,has,False,Afghanistan,Afghanistan has 5 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data
6,6 cases,has,False,Afghanistan,Afghanistan has 6 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data
7,7 cases,has,False,Afghanistan,Afghanistan has 7 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data
8,8 cases,has,False,Afghanistan,Afghanistan has 8 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data
9,9 cases,has,False,Afghanistan,Afghanistan has 9 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data


In [46]:
# Write the combined, de-duplicated entries to CSV without the index column.
# NOTE(review): the destination is an absolute, user-specific path, so this cell fails
# on any machine without that directory; consider a configurable data directory.
all_df.to_csv("/Users/georgekaragiannis/Desktop/Cornell/research/akb_demo/data/jhu_timeseries_akb_03282020.csv", index=False)

In [27]:
# Now merge the data built above with the data obtained from the Google API.

In [47]:
# Location of the existing triples CSV that the generated entries are merged with.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
INIT_AK_DF_PATH = "/Users/georgekaragiannis/Desktop/Cornell/research/akb_demo/data/ak_03242020_triples.csv"

In [48]:
ak_df = pd.read_csv(INIT_AK_DF_PATH)

In [49]:
ak_df

Unnamed: 0,text,url,url_title,rating,subj,pred,obj
0,“China is surrounded by American military labs...,https://www.polygraph.info/a/russia-coronaviru...,Russian Media Spew US Coronavirus Conspiracies...,Misleading,U.S. consulate,is in,Wuhan
1,“The new fad disease called the ‘coronavirus’ ...,https://www.factcheck.org/2020/01/social-media...,Social Media Posts Spread Bogus Coronavirus Co...,False,patent,was,filed
2,“The new fad disease called the ‘coronavirus’ ...,https://www.factcheck.org/2020/01/social-media...,Social Media Posts Spread Bogus Coronavirus Co...,False,coronavirus,is,headlines
3,The CDC created and patented the coronavirus f...,https://factcheck.afp.com/coronavirus-plaguing...,,False,CDC,created,coronavirus
4,The Coronavirus is patented by the US,https://www.boomlive.in/health/coronavirus-pat...,Coronavirus Patented? Why Social Media Posts A...,Misleading,Coronavirus,is patented by,US
5,"“CORONAVIRUS: Reports of 10,000 DEAD in Wuhan,...",https://www.factcheck.org/2020/01/misinformati...,Misinformation on Coronavirus Death Toll,False,"10,000 DEAD",is in,Wuhan
6,“[A]uthorities have identified that the intern...,https://www.factcheck.org/2020/01/comedian-sam...,Comedian Sam Hyde Not 'Behind' Spread of Coron...,False,international chemical-warfare terrorist Samue...,is behind,deadly China coronavirus
7,A post seeks to link a Saudi Arabian MERS vacc...,https://factcheck.aap.com.au/social-media-clai...,Saudi MERS vaccine study is not related to 202...,"False - Based on the evidence, AAP FactCheck f...",post,link,Saudi Arabian MERS vaccine study
8,“A husband and wife Chinese spy team were rece...,https://www.factcheck.org/2020/01/coronavirus-...,Coronavirus Wasn't Sent by 'Spy' From Canada,False,husband spy team,were,recently removed
9,Video Shows Effects Of Coronavirus,https://www.boomlive.in/health/false-this-vide...,False: This Video Does Not Show Effects Of Cor...,False,Video,Shows,Effects Of Coronavirus


In [54]:
set(ak_df.columns) == set(all_df.columns)

True

In [57]:
new_akb_df = pd.concat([ak_df, all_df], ignore_index=True, sort=False)

In [58]:
new_akb_df

Unnamed: 0,text,url,url_title,rating,subj,pred,obj
0,“China is surrounded by American military labs...,https://www.polygraph.info/a/russia-coronaviru...,Russian Media Spew US Coronavirus Conspiracies...,Misleading,U.S. consulate,is in,Wuhan
1,“The new fad disease called the ‘coronavirus’ ...,https://www.factcheck.org/2020/01/social-media...,Social Media Posts Spread Bogus Coronavirus Co...,False,patent,was,filed
2,“The new fad disease called the ‘coronavirus’ ...,https://www.factcheck.org/2020/01/social-media...,Social Media Posts Spread Bogus Coronavirus Co...,False,coronavirus,is,headlines
3,The CDC created and patented the coronavirus f...,https://factcheck.afp.com/coronavirus-plaguing...,,False,CDC,created,coronavirus
4,The Coronavirus is patented by the US,https://www.boomlive.in/health/coronavirus-pat...,Coronavirus Patented? Why Social Media Posts A...,Misleading,Coronavirus,is patented by,US
5,"“CORONAVIRUS: Reports of 10,000 DEAD in Wuhan,...",https://www.factcheck.org/2020/01/misinformati...,Misinformation on Coronavirus Death Toll,False,"10,000 DEAD",is in,Wuhan
6,“[A]uthorities have identified that the intern...,https://www.factcheck.org/2020/01/comedian-sam...,Comedian Sam Hyde Not 'Behind' Spread of Coron...,False,international chemical-warfare terrorist Samue...,is behind,deadly China coronavirus
7,A post seeks to link a Saudi Arabian MERS vacc...,https://factcheck.aap.com.au/social-media-clai...,Saudi MERS vaccine study is not related to 202...,"False - Based on the evidence, AAP FactCheck f...",post,link,Saudi Arabian MERS vaccine study
8,“A husband and wife Chinese spy team were rece...,https://www.factcheck.org/2020/01/coronavirus-...,Coronavirus Wasn't Sent by 'Spy' From Canada,False,husband spy team,were,recently removed
9,Video Shows Effects Of Coronavirus,https://www.boomlive.in/health/false-this-vide...,False: This Video Does Not Show Effects Of Cor...,False,Video,Shows,Effects Of Coronavirus


In [59]:
# Write the merged table to CSV without the index column.
# NOTE(review): the destination is an absolute, user-specific path, so this cell fails
# on any machine without that directory; consider a configurable data directory.
new_akb_df.to_csv("/Users/georgekaragiannis/Desktop/Cornell/research/akb_demo/data/ak_jhu_03242020_triples.csv", index=False)

In [130]:
s = "Greece has 10 cases"

In [131]:
list_s = s.split(" ")

In [105]:
set(s.split(" "))

{'China', 'coronavirus', 'created', 'in', 'was'}

In [106]:
set_s = [set(s.split(" "))]

In [107]:
triple = set(["Greece", "has", "10 cases"])

In [138]:
# Look up every row of new_akb_df whose subj, pred or obj equals one of the query's
# n-grams. Each n-gram is joined back into a space-separated phrase so that
# multi-word fields (e.g. an obj such as "10 cases") can match, not only single tokens.
# NOTE: needs `ngrams_l` and `new_akb_df` to exist in the kernel already; `ngrams_l`
# is built in a cell that appears further down in this notebook.
query_phrases = sorted(
    {" ".join(tokens) for same_n_group in ngrams_l for tokens in same_n_group}
)
# A single boolean filter returns each matching row once and keeps its original
# index. Looping over ngrams_l without using the loop variable would append the
# same matches once per n-gram size, and DataFrame.append no longer exists in
# pandas >= 2.0.
ret_df = new_akb_df[
    new_akb_df.subj.isin(query_phrases)
    | new_akb_df.pred.isin(query_phrases)
    | new_akb_df.obj.isin(query_phrases)
]

In [139]:
ret_df

Unnamed: 0,text,url,url_title,rating,subj,pred,obj
30,Broiler Chicken has Coronavirus,https://www.boomlive.in/health/false-coronavir...,False: Coronavirus Found In Broiler Chickens,False,Broiler Chicken,has,Coronavirus
79,"""[T]hey don't even have a test kit [for COVID-...",https://www.factcheck.org/2020/03/the-facts-on...,The Facts on Coronavirus Testing,False,someone,has,disease
80,"""To keep new cases from entering our shores, w...",https://www.factcheck.org/2020/03/factchecking...,FactChecking Trump's Coronavirus Address,False. Not All Travel.,we,keep,cases
81,A novel coronavirus patient has been identifie...,https://factcheck.afp.com/indian-health-author...,Indian health authorities dismiss hoax report ...,False,coronavirus patient,has,has identified in state of Bihar 's Purnea dis...
191,Coronavirus Vaccine has been created in Israel,https://thelogicalindian.com/fact-check/corona...,"Fact Check: No, Vaccine For Novel Coronavirus ...",Misleading,Coronavirus Vaccine,has,has created in Israel
294,Afghanistan has 0 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data,False,Afghanistan,has,0 cases
295,Afghanistan has 1 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data,False,Afghanistan,has,1 cases
296,Afghanistan has 2 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data,False,Afghanistan,has,2 cases
297,Afghanistan has 3 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data,False,Afghanistan,has,3 cases
298,Afghanistan has 4 cases,https://coronavirus.jhu.edu/,JHU Coronavirus Data,False,Afghanistan,has,4 cases


In [135]:
def ngrams(sent, n):
    """
    Return every run of n consecutive tokens of sent, in order of appearance.

    sent is split on single spaces; each n-gram is returned as a list of
    tokens. The result is an empty list when n exceeds the number of tokens.
    """
    tokens = sent.split(' ')
    window_count = len(tokens) - n + 1
    return [tokens[start:start + n] for start in range(window_count)]

In [136]:
# One element per n in 1..4; each element is the full list of n-grams of `s` for that n.
ngrams_l = [ngrams(s, size) for size in [1, 2, 3, 4]]

In [137]:
for ngram in ngrams_l:
    print(ngram)

[['Greece'], ['has'], ['10'], ['cases']]
[['Greece', 'has'], ['has', '10'], ['10', 'cases']]
[['Greece', 'has', '10'], ['has', '10', 'cases']]
[['Greece', 'has', '10', 'cases']]


In [126]:
ngrams(s, 1)

[['coronavirus'], ['was'], ['created'], ['in'], ['China']]