# Import packages

In [8]:
import pandas as pd
from tqdm import tqdm
import os
import nltk
import numpy as np


# Get Treatment PMIDs

In [9]:
def get_csv(file):
    path = "../litcovid_evaluation_datasets/topic_assignment/"
    return pd.read_csv(path + file + ".csv", header=None)

#get dfs and format
topics_orig = pd.concat([get_csv(f) for f in ["test", "train", "valid"]])
topics_orig.columns = ["pmid", "topic"]

#make a copy before dropping nans
topics = topics_orig.copy()
topics.dropna(inplace=True)
print("There are {} unlabelled files.".format(topics_orig.shape[0] - topics.shape[0]))

#get "treatment" pmids
treatment = topics[topics.topic.str.contains("Treat")]
print("There are {} \"Treatment\" files.".format(treatment.shape[0]))

There are 7137 unlabelled files.
There are 9402 "Treatment" files.


In [10]:
treatment.to_csv("treatment_pmids.csv")

# Select JSON files with Treatment PMIDs

In [11]:
#read in json
test_json = '../litcovid2BioCJSON.json'
df = pd.read_json(test_json)

In [12]:
#format df
df.drop(df.head(1).index, inplace=True)
df = df.T

In [13]:
#get pmids
df.columns= ["data"]
df["pmid"] = [df.iloc[i].values[0].get("pmid") if type(df.iloc[i].values[0]) == dict else df.iloc[i].values[0] for i in range(df.shape[0])]
df.head()

Unnamed: 0,data,pmid
0,"{'_id': '32911311|None', 'id': '32911311', 'in...",32911311
1,"{'_id': '32525830|None', 'id': '32525830', 'in...",32525830
2,"{'_id': '32736412|None', 'id': '32736412', 'in...",32736412
3,"{'_id': '32807526|None', 'id': '32807526', 'in...",32807526
4,"{'_id': '32292259|PMC7118592', 'id': '7118592'...",32292259


In [16]:
#get treatment pmids
treatment_pmids = df[df.pmid.isin(treatment.pmid.values)]

# Extract abstracts and write to separate .txts

In [80]:
#save txt files to folder
folder = "../LitCovid_Treatment/"

#iterate through pmids 
for i in range(treatment_pmids.shape[0]):
    name = folder + str(treatment_pmids.pmid.iloc[i]) + ".txt"
    
    #get abstract text
    passages = treatment_pmids.data.iloc[i].get("passages")

    #no headers:
    #text = [passages[i].get("text") for i in range(1, len(passages)) if (passages[i].get("infons").get("type")) in ["abstract", "Abstract", "ABSTRACT"]]
    
    #headers: 
    text = [passages[i].get("text") for i in range(1, len(passages)) if (passages[i].get("infons").get("section")) in ["abstract", "Abstract", "ABSTRACT"]]

    #write to file only if data is valid
    if len(text) > 0 and text != ['']:
        print(name)
        f = open(name, "w")
        f.write(" ".join(text))
        f.close()

../LitCovid_Treatment/32361738.txt
../LitCovid_Treatment/32611911.txt
../LitCovid_Treatment/32376397.txt
../LitCovid_Treatment/32694201.txt
../LitCovid_Treatment/32762561.txt
../LitCovid_Treatment/32761993.txt
../LitCovid_Treatment/32483314.txt
../LitCovid_Treatment/32072569.txt
../LitCovid_Treatment/32665040.txt
../LitCovid_Treatment/32681716.txt
../LitCovid_Treatment/32775071.txt
../LitCovid_Treatment/32473418.txt
../LitCovid_Treatment/32632359.txt
../LitCovid_Treatment/32239125.txt
../LitCovid_Treatment/32667578.txt
../LitCovid_Treatment/32611457.txt
../LitCovid_Treatment/32726800.txt
../LitCovid_Treatment/32613004.txt
../LitCovid_Treatment/32568620.txt
../LitCovid_Treatment/32674732.txt
../LitCovid_Treatment/32696443.txt
../LitCovid_Treatment/32401670.txt
../LitCovid_Treatment/32769822.txt
../LitCovid_Treatment/32674450.txt
../LitCovid_Treatment/32757470.txt
../LitCovid_Treatment/32722950.txt
../LitCovid_Treatment/32458459.txt
../LitCovid_Treatment/32497323.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32569491.txt
../LitCovid_Treatment/32400362.txt
../LitCovid_Treatment/32534337.txt
../LitCovid_Treatment/32653420.txt
../LitCovid_Treatment/32717116.txt
../LitCovid_Treatment/32761396.txt
../LitCovid_Treatment/32415260.txt
../LitCovid_Treatment/32714336.txt
../LitCovid_Treatment/32586399.txt
../LitCovid_Treatment/32542212.txt
../LitCovid_Treatment/32763298.txt
../LitCovid_Treatment/32493734.txt
../LitCovid_Treatment/32720486.txt
../LitCovid_Treatment/32360608.txt
../LitCovid_Treatment/32707096.txt
../LitCovid_Treatment/32516797.txt
../LitCovid_Treatment/32450761.txt
../LitCovid_Treatment/32645660.txt
../LitCovid_Treatment/32378805.txt
../LitCovid_Treatment/32251794.txt
../LitCovid_Treatment/32387470.txt
../LitCovid_Treatment/32558193.txt
../LitCovid_Treatment/32376583.txt
../LitCovid_Treatment/32648973.txt
../LitCovid_Treatment/32709055.txt
../LitCovid_Treatment/32750366.txt
../LitCovid_Treatment/32610587.txt
../LitCovid_Treatment/32512383.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32339250.txt
../LitCovid_Treatment/32640331.txt
../LitCovid_Treatment/32353740.txt
../LitCovid_Treatment/32526093.txt
../LitCovid_Treatment/32409498.txt
../LitCovid_Treatment/32362504.txt
../LitCovid_Treatment/32506122.txt
../LitCovid_Treatment/32438454.txt
../LitCovid_Treatment/32668443.txt
../LitCovid_Treatment/32392332.txt
../LitCovid_Treatment/32381264.txt
../LitCovid_Treatment/32434337.txt
../LitCovid_Treatment/32504751.txt
../LitCovid_Treatment/32763661.txt
../LitCovid_Treatment/32768580.txt
../LitCovid_Treatment/32444482.txt
../LitCovid_Treatment/32761894.txt
../LitCovid_Treatment/32710269.txt
../LitCovid_Treatment/32349115.txt
../LitCovid_Treatment/32749705.txt
../LitCovid_Treatment/32437939.txt
../LitCovid_Treatment/32344202.txt
../LitCovid_Treatment/32639183.txt
../LitCovid_Treatment/32614392.txt
../LitCovid_Treatment/32388486.txt
../LitCovid_Treatment/32367852.txt
../LitCovid_Treatment/32576386.txt
../LitCovid_Treatment/32737124.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32484930.txt
../LitCovid_Treatment/32587093.txt
../LitCovid_Treatment/32705981.txt
../LitCovid_Treatment/32767920.txt
../LitCovid_Treatment/32638507.txt
../LitCovid_Treatment/32335336.txt
../LitCovid_Treatment/32171866.txt
../LitCovid_Treatment/32574336.txt
../LitCovid_Treatment/32534129.txt
../LitCovid_Treatment/32303590.txt
../LitCovid_Treatment/32398343.txt
../LitCovid_Treatment/32090448.txt
../LitCovid_Treatment/32344011.txt
../LitCovid_Treatment/32770797.txt
../LitCovid_Treatment/32385672.txt
../LitCovid_Treatment/32623810.txt
../LitCovid_Treatment/32758363.txt
../LitCovid_Treatment/32297985.txt
../LitCovid_Treatment/32371563.txt
../LitCovid_Treatment/32675285.txt
../LitCovid_Treatment/32405102.txt
../LitCovid_Treatment/32663827.txt
../LitCovid_Treatment/32748211.txt
../LitCovid_Treatment/32601020.txt
../LitCovid_Treatment/32592406.txt
../LitCovid_Treatment/32686248.txt
../LitCovid_Treatment/32634818.txt
../LitCovid_Treatment/32726724.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32588493.txt
../LitCovid_Treatment/32601577.txt
../LitCovid_Treatment/32289117.txt
../LitCovid_Treatment/32399192.txt
../LitCovid_Treatment/32422076.txt
../LitCovid_Treatment/32554251.txt
../LitCovid_Treatment/32758037.txt
../LitCovid_Treatment/32550040.txt
../LitCovid_Treatment/32087621.txt
../LitCovid_Treatment/32639631.txt
../LitCovid_Treatment/32727719.txt
../LitCovid_Treatment/32440665.txt
../LitCovid_Treatment/32697870.txt
../LitCovid_Treatment/32574196.txt
../LitCovid_Treatment/32754570.txt
../LitCovid_Treatment/32568604.txt
../LitCovid_Treatment/32681497.txt
../LitCovid_Treatment/32643469.txt
../LitCovid_Treatment/32306860.txt
../LitCovid_Treatment/32641471.txt
../LitCovid_Treatment/32753646.txt
../LitCovid_Treatment/32660065.txt
../LitCovid_Treatment/32353398.txt
../LitCovid_Treatment/32522067.txt
../LitCovid_Treatment/32198275.txt
../LitCovid_Treatment/32572128.txt
../LitCovid_Treatment/32610879.txt
../LitCovid_Treatment/32574287.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32410504.txt
../LitCovid_Treatment/32413619.txt
../LitCovid_Treatment/32679766.txt
../LitCovid_Treatment/32702463.txt
../LitCovid_Treatment/32645276.txt
../LitCovid_Treatment/32348485.txt
../LitCovid_Treatment/32536632.txt
../LitCovid_Treatment/32425270.txt
../LitCovid_Treatment/32616588.txt
../LitCovid_Treatment/32517353.txt
../LitCovid_Treatment/32523611.txt
../LitCovid_Treatment/32718895.txt
../LitCovid_Treatment/32543892.txt
../LitCovid_Treatment/32669979.txt
../LitCovid_Treatment/32713282.txt
../LitCovid_Treatment/32745993.txt
../LitCovid_Treatment/32682787.txt
../LitCovid_Treatment/32619190.txt
../LitCovid_Treatment/32471876.txt
../LitCovid_Treatment/32292300.txt
../LitCovid_Treatment/32688107.txt
../LitCovid_Treatment/32459574.txt
../LitCovid_Treatment/32640723.txt
../LitCovid_Treatment/32556942.txt
../LitCovid_Treatment/32553873.txt
../LitCovid_Treatment/32098302.txt
../LitCovid_Treatment/32526370.txt
../LitCovid_Treatment/32475019.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32446798.txt
../LitCovid_Treatment/32519779.txt
../LitCovid_Treatment/32416679.txt
../LitCovid_Treatment/32492084.txt
../LitCovid_Treatment/32496734.txt
../LitCovid_Treatment/32497632.txt
../LitCovid_Treatment/32707288.txt
../LitCovid_Treatment/32434254.txt
../LitCovid_Treatment/32332185.txt
../LitCovid_Treatment/32665127.txt
../LitCovid_Treatment/32719672.txt
../LitCovid_Treatment/32641762.txt
../LitCovid_Treatment/32610445.txt
../LitCovid_Treatment/32587094.txt
../LitCovid_Treatment/32587077.txt
../LitCovid_Treatment/32763058.txt
../LitCovid_Treatment/32523151.txt
../LitCovid_Treatment/32776556.txt
../LitCovid_Treatment/32633831.txt
../LitCovid_Treatment/32350818.txt
../LitCovid_Treatment/32773100.txt
../LitCovid_Treatment/32506865.txt
../LitCovid_Treatment/32665234.txt
../LitCovid_Treatment/32408391.txt
../LitCovid_Treatment/32515982.txt
../LitCovid_Treatment/32463562.txt
../LitCovid_Treatment/32727845.txt
../LitCovid_Treatment/32653469.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32522874.txt
../LitCovid_Treatment/32305402.txt
../LitCovid_Treatment/32564623.txt
../LitCovid_Treatment/32267499.txt
../LitCovid_Treatment/32755466.txt
../LitCovid_Treatment/31996494.txt
../LitCovid_Treatment/32612614.txt
../LitCovid_Treatment/32627126.txt
../LitCovid_Treatment/32719749.txt
../LitCovid_Treatment/32685882.txt
../LitCovid_Treatment/32605683.txt
../LitCovid_Treatment/32729248.txt
../LitCovid_Treatment/32392017.txt
../LitCovid_Treatment/32749010.txt
../LitCovid_Treatment/32423908.txt
../LitCovid_Treatment/32485437.txt
../LitCovid_Treatment/32628049.txt
../LitCovid_Treatment/32742475.txt
../LitCovid_Treatment/32565309.txt
../LitCovid_Treatment/32460632.txt
../LitCovid_Treatment/32386440.txt
../LitCovid_Treatment/32563194.txt
../LitCovid_Treatment/32613637.txt
../LitCovid_Treatment/32324209.txt
../LitCovid_Treatment/32693436.txt
../LitCovid_Treatment/32437944.txt
../LitCovid_Treatment/32298029.txt
../LitCovid_Treatment/32726005.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32725382.txt
../LitCovid_Treatment/32708205.txt
../LitCovid_Treatment/32607652.txt
../LitCovid_Treatment/32597627.txt
../LitCovid_Treatment/32762171.txt
../LitCovid_Treatment/32397915.txt
../LitCovid_Treatment/32744511.txt
../LitCovid_Treatment/32654555.txt
../LitCovid_Treatment/32745604.txt
../LitCovid_Treatment/32286678.txt
../LitCovid_Treatment/32564028.txt
../LitCovid_Treatment/32653224.txt
../LitCovid_Treatment/32531325.txt
../LitCovid_Treatment/32385628.txt
../LitCovid_Treatment/32691697.txt
../LitCovid_Treatment/32229706.txt
../LitCovid_Treatment/32540792.txt
../LitCovid_Treatment/32575140.txt
../LitCovid_Treatment/32752951.txt
../LitCovid_Treatment/32541352.txt
../LitCovid_Treatment/32765943.txt
../LitCovid_Treatment/32228825.txt
../LitCovid_Treatment/32492560.txt
../LitCovid_Treatment/32767687.txt
../LitCovid_Treatment/32372695.txt
../LitCovid_Treatment/32416769.txt
../LitCovid_Treatment/32423024.txt
../LitCovid_Treatment/32481460.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32692580.txt
../LitCovid_Treatment/32358691.txt
../LitCovid_Treatment/32383183.txt
../LitCovid_Treatment/32737467.txt
../LitCovid_Treatment/32579021.txt
../LitCovid_Treatment/32385891.txt
../LitCovid_Treatment/32653518.txt
../LitCovid_Treatment/32695324.txt
../LitCovid_Treatment/32540271.txt
../LitCovid_Treatment/32485970.txt
../LitCovid_Treatment/32419564.txt
../LitCovid_Treatment/32715661.txt
../LitCovid_Treatment/32620455.txt
../LitCovid_Treatment/32218915.txt
../LitCovid_Treatment/32424845.txt
../LitCovid_Treatment/32324362.txt
../LitCovid_Treatment/32654489.txt
../LitCovid_Treatment/32634603.txt
../LitCovid_Treatment/32675033.txt
../LitCovid_Treatment/32351873.txt
../LitCovid_Treatment/32530284.txt
../LitCovid_Treatment/32645661.txt
../LitCovid_Treatment/32198291.txt
../LitCovid_Treatment/32733404.txt
../LitCovid_Treatment/32654006.txt
../LitCovid_Treatment/32723107.txt
../LitCovid_Treatment/32499983.txt
../LitCovid_Treatment/32656303.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32531138.txt
../LitCovid_Treatment/32718719.txt
../LitCovid_Treatment/32530389.txt
../LitCovid_Treatment/32718669.txt
../LitCovid_Treatment/32743014.txt
../LitCovid_Treatment/32576347.txt
../LitCovid_Treatment/32652521.txt
../LitCovid_Treatment/32613257.txt
../LitCovid_Treatment/32677591.txt
../LitCovid_Treatment/32677545.txt
../LitCovid_Treatment/32767260.txt
../LitCovid_Treatment/32672431.txt
../LitCovid_Treatment/32553503.txt
../LitCovid_Treatment/32763662.txt
../LitCovid_Treatment/32327746.txt
../LitCovid_Treatment/32691002.txt
../LitCovid_Treatment/32651736.txt
../LitCovid_Treatment/32558641.txt
../LitCovid_Treatment/32461321.txt
../LitCovid_Treatment/32663912.txt
../LitCovid_Treatment/32674481.txt
../LitCovid_Treatment/32392389.txt
../LitCovid_Treatment/32665809.txt
../LitCovid_Treatment/32585284.txt
../LitCovid_Treatment/32418327.txt
../LitCovid_Treatment/32568618.txt
../LitCovid_Treatment/32243668.txt
../LitCovid_Treatment/32396903.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32701376.txt
../LitCovid_Treatment/32510142.txt
../LitCovid_Treatment/32384820.txt
../LitCovid_Treatment/32732325.txt
../LitCovid_Treatment/32776551.txt
../LitCovid_Treatment/32348598.txt
../LitCovid_Treatment/32362644.txt
../LitCovid_Treatment/32629875.txt
../LitCovid_Treatment/32699166.txt
../LitCovid_Treatment/32219626.txt
../LitCovid_Treatment/32590326.txt
../LitCovid_Treatment/32497778.txt
../LitCovid_Treatment/32619134.txt
../LitCovid_Treatment/32379923.txt
../LitCovid_Treatment/32522282.txt
../LitCovid_Treatment/32769892.txt
../LitCovid_Treatment/32544304.txt
../LitCovid_Treatment/32759504.txt
../LitCovid_Treatment/32727941.txt
../LitCovid_Treatment/32557555.txt
../LitCovid_Treatment/32666588.txt
../LitCovid_Treatment/32637999.txt
../LitCovid_Treatment/32265310.txt
../LitCovid_Treatment/32754599.txt
../LitCovid_Treatment/32754626.txt
../LitCovid_Treatment/32320852.txt
../LitCovid_Treatment/32643158.txt
../LitCovid_Treatment/32734828.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32720082.txt
../LitCovid_Treatment/32684227.txt
../LitCovid_Treatment/32685883.txt
../LitCovid_Treatment/32746637.txt
../LitCovid_Treatment/32716553.txt
../LitCovid_Treatment/32686781.txt
../LitCovid_Treatment/32621841.txt
../LitCovid_Treatment/32526275.txt
../LitCovid_Treatment/32467113.txt
../LitCovid_Treatment/32514935.txt
../LitCovid_Treatment/32635752.txt
../LitCovid_Treatment/32589189.txt
../LitCovid_Treatment/32619279.txt
../LitCovid_Treatment/32630746.txt
../LitCovid_Treatment/32493475.txt
../LitCovid_Treatment/32374264.txt
../LitCovid_Treatment/32696001.txt
../LitCovid_Treatment/32754119.txt
../LitCovid_Treatment/32164089.txt
../LitCovid_Treatment/32384020.txt
../LitCovid_Treatment/32113704.txt
../LitCovid_Treatment/32383171.txt
../LitCovid_Treatment/32491075.txt
../LitCovid_Treatment/32556323.txt
../LitCovid_Treatment/32447102.txt
../LitCovid_Treatment/32641121.txt
../LitCovid_Treatment/32478959.txt
../LitCovid_Treatment/32597954.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32435870.txt
../LitCovid_Treatment/32552642.txt
../LitCovid_Treatment/32297796.txt
../LitCovid_Treatment/32081636.txt
../LitCovid_Treatment/32411496.txt
../LitCovid_Treatment/32376593.txt
../LitCovid_Treatment/32493812.txt
../LitCovid_Treatment/32755835.txt
../LitCovid_Treatment/32525600.txt
../LitCovid_Treatment/32478523.txt
../LitCovid_Treatment/32251618.txt
../LitCovid_Treatment/32546598.txt
../LitCovid_Treatment/32773105.txt
../LitCovid_Treatment/32037389.txt
../LitCovid_Treatment/32712607.txt
../LitCovid_Treatment/32705976.txt
../LitCovid_Treatment/32387756.txt
../LitCovid_Treatment/32492211.txt
../LitCovid_Treatment/32722979.txt
../LitCovid_Treatment/32583808.txt
../LitCovid_Treatment/32638509.txt
../LitCovid_Treatment/32505217.txt
../LitCovid_Treatment/32286790.txt
../LitCovid_Treatment/32216698.txt
../LitCovid_Treatment/32618402.txt
../LitCovid_Treatment/32434806.txt
../LitCovid_Treatment/32671351.txt
../LitCovid_Treatment/32755302.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32075364.txt
../LitCovid_Treatment/32463239.txt
../LitCovid_Treatment/32444366.txt
../LitCovid_Treatment/32680942.txt
../LitCovid_Treatment/32597005.txt
../LitCovid_Treatment/32620366.txt
../LitCovid_Treatment/32712300.txt
../LitCovid_Treatment/32271490.txt
../LitCovid_Treatment/32322402.txt
../LitCovid_Treatment/32306656.txt
../LitCovid_Treatment/32376101.txt
../LitCovid_Treatment/32665786.txt
../LitCovid_Treatment/32459529.txt
../LitCovid_Treatment/32247927.txt
../LitCovid_Treatment/32744052.txt
../LitCovid_Treatment/32616597.txt
../LitCovid_Treatment/32051072.txt
../LitCovid_Treatment/32544372.txt
../LitCovid_Treatment/32562480.txt
../LitCovid_Treatment/32372051.txt
../LitCovid_Treatment/32429722.txt
../LitCovid_Treatment/32562276.txt
../LitCovid_Treatment/32744477.txt
../LitCovid_Treatment/32750442.txt
../LitCovid_Treatment/32354113.txt
../LitCovid_Treatment/32662915.txt
../LitCovid_Treatment/32720698.txt
../LitCovid_Treatment/32690352.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32589756.txt
../LitCovid_Treatment/32542131.txt
../LitCovid_Treatment/32615866.txt
../LitCovid_Treatment/32320506.txt
../LitCovid_Treatment/32654378.txt
../LitCovid_Treatment/32614258.txt
../LitCovid_Treatment/32405156.txt
../LitCovid_Treatment/32482977.txt
../LitCovid_Treatment/32604223.txt
../LitCovid_Treatment/32341111.txt
../LitCovid_Treatment/32573788.txt
../LitCovid_Treatment/32769232.txt
../LitCovid_Treatment/32734777.txt
../LitCovid_Treatment/32600125.txt
../LitCovid_Treatment/32613660.txt
../LitCovid_Treatment/32601704.txt
../LitCovid_Treatment/32725449.txt
../LitCovid_Treatment/32717771.txt
../LitCovid_Treatment/32739471.txt
../LitCovid_Treatment/32356926.txt
../LitCovid_Treatment/32526460.txt
../LitCovid_Treatment/32475144.txt
../LitCovid_Treatment/32632960.txt
../LitCovid_Treatment/32597503.txt
../LitCovid_Treatment/32581514.txt
../LitCovid_Treatment/32679426.txt
../LitCovid_Treatment/32618700.txt
../LitCovid_Treatment/32300516.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32119961.txt
../LitCovid_Treatment/32693665.txt
../LitCovid_Treatment/32576345.txt
../LitCovid_Treatment/32664113.txt
../LitCovid_Treatment/32640179.txt
../LitCovid_Treatment/32750889.txt
../LitCovid_Treatment/32678431.txt
../LitCovid_Treatment/32470851.txt
../LitCovid_Treatment/32687917.txt
../LitCovid_Treatment/32313883.txt
../LitCovid_Treatment/32672799.txt
../LitCovid_Treatment/32645478.txt
../LitCovid_Treatment/32764120.txt
../LitCovid_Treatment/32405117.txt
../LitCovid_Treatment/32356908.txt
../LitCovid_Treatment/32532764.txt
../LitCovid_Treatment/32318975.txt
../LitCovid_Treatment/32384078.txt
../LitCovid_Treatment/32628003.txt
../LitCovid_Treatment/32318865.txt
../LitCovid_Treatment/32773414.txt
../LitCovid_Treatment/32693298.txt
../LitCovid_Treatment/32574309.txt
../LitCovid_Treatment/32679155.txt
../LitCovid_Treatment/32763754.txt
../LitCovid_Treatment/32326602.txt
../LitCovid_Treatment/32639466.txt
../LitCovid_Treatment/32629085.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32388331.txt
../LitCovid_Treatment/32617855.txt
../LitCovid_Treatment/32662982.txt
../LitCovid_Treatment/32330817.txt
../LitCovid_Treatment/32735768.txt
../LitCovid_Treatment/32748333.txt
../LitCovid_Treatment/32730844.txt
../LitCovid_Treatment/32622796.txt
../LitCovid_Treatment/32772313.txt
../LitCovid_Treatment/32321905.txt
../LitCovid_Treatment/32482788.txt
../LitCovid_Treatment/32713863.txt
../LitCovid_Treatment/32735546.txt
../LitCovid_Treatment/32599245.txt
../LitCovid_Treatment/32611087.txt
../LitCovid_Treatment/32636114.txt
../LitCovid_Treatment/32719738.txt
../LitCovid_Treatment/32618699.txt
../LitCovid_Treatment/32708322.txt
../LitCovid_Treatment/32425152.txt
../LitCovid_Treatment/32573711.txt
../LitCovid_Treatment/32723801.txt
../LitCovid_Treatment/32215760.txt
../LitCovid_Treatment/32248966.txt
../LitCovid_Treatment/32558620.txt
../LitCovid_Treatment/32670298.txt
../LitCovid_Treatment/32397275.txt
../LitCovid_Treatment/32754004.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32098422.txt
../LitCovid_Treatment/32678866.txt
../LitCovid_Treatment/32305501.txt
../LitCovid_Treatment/32458400.txt
../LitCovid_Treatment/32350104.txt
../LitCovid_Treatment/32661497.txt
../LitCovid_Treatment/32760887.txt
../LitCovid_Treatment/32699874.txt
../LitCovid_Treatment/32682440.txt
../LitCovid_Treatment/32599237.txt
../LitCovid_Treatment/32603829.txt
../LitCovid_Treatment/32578982.txt
../LitCovid_Treatment/32556143.txt
../LitCovid_Treatment/32537957.txt
../LitCovid_Treatment/32710998.txt
../LitCovid_Treatment/32691005.txt
../LitCovid_Treatment/32405780.txt
../LitCovid_Treatment/32687406.txt
../LitCovid_Treatment/32243267.txt
../LitCovid_Treatment/32703743.txt
../LitCovid_Treatment/32654422.txt
../LitCovid_Treatment/32762250.txt
../LitCovid_Treatment/32385146.txt
../LitCovid_Treatment/32757420.txt
../LitCovid_Treatment/32564984.txt
../LitCovid_Treatment/32398026.txt
../LitCovid_Treatment/32574236.txt
../LitCovid_Treatment/32683576.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32676080.txt
../LitCovid_Treatment/32620144.txt
../LitCovid_Treatment/32645207.txt
../LitCovid_Treatment/32698249.txt
../LitCovid_Treatment/32316270.txt
../LitCovid_Treatment/32449939.txt
../LitCovid_Treatment/32544884.txt
../LitCovid_Treatment/32669297.txt
../LitCovid_Treatment/32171193.txt
../LitCovid_Treatment/32496926.txt
../LitCovid_Treatment/32358890.txt
../LitCovid_Treatment/32661794.txt
../LitCovid_Treatment/32551464.txt
../LitCovid_Treatment/32707768.txt
../LitCovid_Treatment/32766537.txt
../LitCovid_Treatment/32388458.txt
../LitCovid_Treatment/32647807.txt
../LitCovid_Treatment/32639233.txt
../LitCovid_Treatment/32327757.txt
../LitCovid_Treatment/32641977.txt
../LitCovid_Treatment/32393419.txt
../LitCovid_Treatment/32520599.txt
../LitCovid_Treatment/32335456.txt
../LitCovid_Treatment/32718020.txt
../LitCovid_Treatment/32661393.txt
../LitCovid_Treatment/32402766.txt
../LitCovid_Treatment/32692877.txt
../LitCovid_Treatment/32646669.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32231348.txt
../LitCovid_Treatment/32456368.txt
../LitCovid_Treatment/32681756.txt
../LitCovid_Treatment/32735892.txt
../LitCovid_Treatment/32669955.txt
../LitCovid_Treatment/32405081.txt
../LitCovid_Treatment/32356626.txt
../LitCovid_Treatment/32676976.txt
../LitCovid_Treatment/32144890.txt
../LitCovid_Treatment/32665352.txt
../LitCovid_Treatment/32278175.txt
../LitCovid_Treatment/32632558.txt
../LitCovid_Treatment/32369209.txt
../LitCovid_Treatment/32051074.txt
../LitCovid_Treatment/32714335.txt
../LitCovid_Treatment/32341791.txt
../LitCovid_Treatment/32503801.txt
../LitCovid_Treatment/32340429.txt
../LitCovid_Treatment/32641681.txt
../LitCovid_Treatment/32538209.txt
../LitCovid_Treatment/32759894.txt
../LitCovid_Treatment/32615317.txt
../LitCovid_Treatment/32201449.txt
../LitCovid_Treatment/32766706.txt
../LitCovid_Treatment/32426749.txt
../LitCovid_Treatment/32725536.txt
../LitCovid_Treatment/32725595.txt
../LitCovid_Treatment/32571838.txt
../LitCovid_Treatmen

../LitCovid_Treatment/32758266.txt
../LitCovid_Treatment/32253226.txt
../LitCovid_Treatment/32725286.txt
../LitCovid_Treatment/32740433.txt
../LitCovid_Treatment/32033513.txt
../LitCovid_Treatment/32643489.txt
../LitCovid_Treatment/32696772.txt
../LitCovid_Treatment/32407491.txt
../LitCovid_Treatment/32696006.txt
../LitCovid_Treatment/32589784.txt
../LitCovid_Treatment/32761219.txt
../LitCovid_Treatment/32173241.txt
../LitCovid_Treatment/32419646.txt
../LitCovid_Treatment/32456489.txt
../LitCovid_Treatment/32224151.txt
../LitCovid_Treatment/32308266.txt
../LitCovid_Treatment/32296910.txt
../LitCovid_Treatment/32455478.txt
../LitCovid_Treatment/32149773.txt
../LitCovid_Treatment/32716893.txt
../LitCovid_Treatment/32748634.txt
../LitCovid_Treatment/32620409.txt
../LitCovid_Treatment/32562746.txt
../LitCovid_Treatment/32592163.txt
../LitCovid_Treatment/32535885.txt
../LitCovid_Treatment/32499988.txt
../LitCovid_Treatment/32496012.txt
../LitCovid_Treatment/32589164.txt
../LitCovid_Treatmen

# Extract abstracts and send to single txt file

In [44]:
#open txt file
file = "LitCovid_sent_classification_formatted.txt"
f = open(file, "w")
count = 0

#iterate through pmids 
for i in range(treatment_pmids.shape[0]):
    #get abstract text
    passages = treatment_pmids.data.iloc[i].get("passages")

    #no headers
    #text = [passages[i].get("text") for i in range(1, len(passages)) if (passages[i].get("infons").get("type")) in ["abstract", "Abstract", "ABSTRACT"]]
    
    #get title
    if passages[0].get("infons").get("section") in ["title", "Title", "TITLE"]:
        title = passages[0].get("text")
    else:
        raise ValueError('not itle.')
    
    #headers
    text = [passages[i].get("text") for i in range(1, len(passages)) if (passages[i].get("infons").get("section").lower()) in ["abstract", "Abstract", "ABSTRACT"] and passages[i].get("text").lower() != "abstract"]
    
    #format section titles
    formatted = [t.upper() + ":" if len(t.split()) == 1 else t for t in text]
    
    #write to file only if data is valid
    if len(text) > 0 and text != ['']:
        formatted.insert(0, "{} |".format(title))
        formatted.insert(0, "{} |".format(treatment_pmids.pmid.iloc[i]))
        f.write(" ".join(formatted) + "\n")
        count +=1 

f.close()
print(count)

6469


# Manually combing through data

In [36]:
row = treatment_pmids[treatment_pmids.pmid == 32361738 ].index[0]
passages = treatment_pmids.data.loc[row].get("passages")
text = [passages[i].get("text") for i in range(1, len(passages)) if (passages[i].get("infons").get("section")) in ["abstract", "Abstract", "ABSTRACT"]]
passages

[{'infons': {'name_3': 'surname:Gao;given-names:Zitong',
   'name_2': 'surname:Fu;given-names:Yuanyuan',
   'name_1': 'surname:Chen;given-names:Shaoqiu',
   'article-id_pmid': '32361738',
   'name_0': 'surname:Hu;given-names:Ling',
   'year': '2020',
   'article-id_pmc': '7197620',
   'article-id_publisher-id': 'ciaa539',
   'article-id_doi': '10.1093/cid/ciaa539',
   'type': 'front',
   'section_type': 'TITLE',
   'journal': 'Clin. Infect. Dis.; 2020 May 03 . doi:10.1093/cid/ciaa539',
   'elocation-id': 'ciaa539',
   'license': 'This article is published and distributed under the terms of the Oxford University Press, Standard Journals Publication Model (https://academic.oup.com/journals/pages/open_access/funder_policies/chorus/standard_publication_model) This article is made available via the PMC Open Access Subset for unrestricted re-use and analyses in any form or by any means with acknowledgement of the original source. These permissions are granted for the duration of the COVID-19

In [23]:
passages[0].get("infons").get("section") in ["title", "Title", "TITLE"]

True

In [70]:
row = treatment_pmids[treatment_pmids.pmid == 32342252].index[0]
passages = treatment_pmids.data.loc[row].get("passages")
[passages[i].get("text") for i in range(1, len(passages)) if (passages[i].get("infons").get("section") == "Abstract")]


['In a preliminary clinical study, we observed that the combination of hydroxychloroquine and azithromycin was effective against SARS-CoV-2 by shortening the duration of viral load in Covid-19 patients. It is of paramount importance to define when a treated patient can be considered as no longer contagious. Correlation between successful isolation of virus in cell culture and Ct value of quantitative RT-PCR targeting E gene suggests that patients with Ct above 33-34 using our RT-PCR system are not contagious and thus can be discharged from hospital care or strict confinement for non-hospitalized patients.']

In [47]:
row = treatment_pmids[treatment_pmids.pmid == 32594204].index[0]
passages = treatment_pmids.data.loc[row].get("passages")
[passages[i].get("text") for i in range(1, len(passages)) if len(passages[i].get("text").split()) >1]
passages

[{'infons': {'journal': 'Eur. J. Clin. Pharmacol.; 2020 Jun 27 . doi:10.1007/s00228-020-02947-4',
   'year': '2020',
   'article-id_pmc': 'PMC7320911',
   'type': 'title',
   'authors': 'Ferrara F, Granata G, Pelliccia C, La Porta R, Vitiello A, ',
   'section': 'Title'},
  'offset': 0,
  'text': 'The added value of pirfenidone to fight inflammation and fibrotic state induced by SARS-CoV-2 : Anti-inflammatory and anti-fibrotic therapy could solve the lung complications of the infection?'},
 {'infons': {'type': 'abstract', 'section': 'Abstract'},
  'offset': 193,
  'text': 'AIM: SARS-CoV-2 infection has been divided by scientific opinion into three phases: the first as asymptomatic or slightly symptomatic and the second and the third with greater severity, characterized by a hyperinflammatory and fibrotic state, responsible for lung lesions, in some cases fatal. The development of antiviral drugs directed against SARS-CoV-2 and effective vaccines is progressing; meanwhile, the best phar

In [99]:
"tea tea".split()

['tea', 'tea']

In [135]:
count = 0
folder = "../LitCovid_Treatment/"
for i in range(treatment_pmids.shape[0]):
# for i in range(2):
    name = folder + str(treatment_pmids.pmid.iloc[i]) + ".txt"
    
    passages = treatment_pmids.data.iloc[i].get("passages")
    text = [passages[i].get("text") for i in range(1, len(passages)) if (passages[i].get("infons").get("type")) in ["abstract", "Abstract", "ABSTRACT"]]
    if len(text) > 0 and text != ['']:
        count+=1

In [1]:
[['COVID-19', "SARS-CoV-2", "SARS-CoV-2 infection", "COVID-19 infection", 
  "coronavirus disease 2019 ( COVID-19 )", 'Covid-19', 'laboratory-confirmed COVID-19', 
  "coronavirus disease 2019", "COVID-19 disease", "coronavirus disease", "SARS-CoV", 
  "COVID-19 positive", "SARS-COV-2", "SARS-CoV-2 infections",  "Covid-19 infection", 
  "COVID-19 infected", "COVID-19-positive", "SARS-CoV-2 infected", "SARS-CoV-2-infected",
  "2019-nCoV", "COVID-19 infections", "COVID 19", "COVID19", "Covid-19 infection",
  "severe acute respiratory syndrome coronavirus 2 ( SARS-CoV-2 ) infection", 
  'coronavirus disease ( COVID-19 )'],
 ["severe and critical COVID-19", "severe COVID-19 infection", 
  'severe coronavirus disease 2019 ( COVID-19 )',
  'severe coronavirus disease 2019 ( COVID-19 )', 
  'severe acute respiratory syndrome ( SARS )', 'severe and critical COVID-19',
  'severe acute respiratory syndrome coronavirus 2'],
 ["severe COVID-19 pneumonia", "severe pneumonia"],
 ["male", "men", "males", "male sex"],
 ["infection", "infected"],
 ["women", "female", "females", "female sex"],
 ["hospitalized", "critically ill", "severe disease", "critical", "severe cases",
  "critical illness", "severely ill"],
 ["comorbidity", "comorbidities"],
 ["ICU", "intensive care unit ( ICU )", "intensive care"],
 ["diabetes", "diabetes mellitus", "diabetic"],
 ["elderly", "older age", "older adults"]
 ["hydroxychloroquine", "chloroquine"]
 ["controls", "control"]
 ["corticosteroid", "corticosteroids"]
 ["vaccine", "vaccines"]
 ["placebo", "placebo"]

['.', ',', '(', 'with']

In [4]:
import nltk
nltk.download('stopwords')
stop_words = stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\janep\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where'

In [33]:
from nltk.corpus import wordnet 
synonyms = [] 
antonyms = [] 
  
for syn in wordnet.synsets("critical"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 

print(set(synonyms)) 
print(set(antonyms)) 

{'vital', 'decisive', 'critical'}
{'uncritical', 'noncritical'}


In [28]:
from pattern.text.en import singularize
singularize("male") 

'male'

In [20]:
! pip install pattern

Collecting pattern
  Downloading https://files.pythonhosted.org/packages/1e/07/b0e61b6c818ed4b6145fe01d1c341223aa6cfbc3928538ad1f2b890924a3/Pattern-3.6.0.tar.gz (22.2MB)
Collecting future (from pattern)
  Downloading https://files.pythonhosted.org/packages/45/0b/38b06fd9b92dc2b68d58b75f900e97884c45bedd2ff83203d933cf5851c9/future-0.18.2.tar.gz (829kB)
Collecting backports.csv (from pattern)
  Downloading https://files.pythonhosted.org/packages/8e/26/a6bd68f13e0f38fbb643d6e497fc3462be83a0b6c4d43425c78bb51a7291/backports.csv-1.0.7-py2.py3-none-any.whl
Collecting mysqlclient (from pattern)
  Downloading https://files.pythonhosted.org/packages/4a/28/b08ede6cc8382179201455c3b9e5ed0d66aa5921e7c1888828dba48b832b/mysqlclient-2.0.1-cp37-cp37m-win_amd64.whl (268kB)
Collecting feedparser (from pattern)
  Downloading https://files.pythonhosted.org/packages/2c/84/df6de99cba01afc82344c9cb3a79df100a00ac33396120f8aa66c72f0d84/feedparser-6.0.1-py2.py3-none-any.whl (80kB)
Collecting pdfminer.six (from pa

twisted 18.7.0 requires PyHamcrest>=1.9.0, which is not installed.
You are using pip version 10.0.1, however version 20.2.4 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.
