# Create Training datasets
The process for creating the initial training datasets for the broad topicCategories NOT included in litcovid are as follows:
1. Brainstorm keywords that should be relatively SPECIFIC for that topicCategory
2. Perform queries to the APIs to retrieve all ids returned by each topicCategory-specific keyword
3. Retrieve all keywords for each id
4. Create frequency tables of all keywords
5. Take top ~5% of most frequent keywords
6. Take top 100 most frequent keywords from litcovid topics
8. Use the keywords to remove more generic terms like 'COVID19'
9. manually inspect for specific-enough terms for inclusion into original keyword list
10. Repeat process, adding keywords to grow the training dataset, or removing keywords which are causing the dataset to become less specific

## Create Dataset based on initial curated keyword lists

In [1]:
import os
import requests
import json
import pandas as pd
from pandas import read_csv
from datetime import datetime
#import pickle
#import sklearn
#from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score


In [2]:
DATAPATH = 'data/'
KEYPATH = os.path.join(DATAPATH,'keywords/')
KEYFILES = os.listdir(KEYPATH)
RESULTSPATH = 'results/'

In [None]:
## 'https://api.outbreak.info/resources/resource/query?q=loneliness&filter=@type:Publication&fields=_id'

### Functions

In [3]:
#### Pull ids from a json file
def get_ids_from_json(jsonfile):
    idlist = []
    for eachhit in jsonfile["hits"]:
        if eachhit["_id"] not in idlist:
            idlist.append(eachhit["_id"])
    return(idlist)

def fetch_query_size(query):
    pubmeta = requests.get('https://api.outbreak.info/resources/query?q="'+query+'"&size=0&aggs=@type')
    pubjson = json.loads(pubmeta.text)
    pubcount = int(pubjson["facets"]["@type"]["total"])
    return(pubcount)

#### Ping the API and get all the ids for a specific source and scroll through the source until number of ids matches meta
def get_query_ids(query):
    query_size = fetch_query_size(query)
    r = requests.get('https://api.outbreak.info/resources/query?q="'+query+'"&filter=@type:Publication&fields=_id&fetch_all=true')
    response = json.loads(r.text)
    idlist = get_ids_from_json(response)
    try:
        scroll_id = response["_scroll_id"]
        while len(idlist) < query_size:
            r2 = requests.get('https://api.outbreak.info/resources/query?q="'+query+'"&filter=@type:Publication&fields=_id&fetch_all=true&scroll_id='+scroll_id)
            response2 = json.loads(r2.text)
            idlist2 = set(get_ids_from_json(response2))
            tmpset = set(idlist)
            idlist = tmpset.union(idlist2)
            try:
                scroll_id = response2["_scroll_id"]
            except:
                print("no new scroll id")
        return(idlist)
    except:
        return(idlist)


def batch_fetch_kw(idlist):
    ## Break the list of ids into smaller chunks so the API doesn't fail the post request
    runs = round((len(idlist))/100,0)
    i=0 
    separator = ','
    ## Create dummy dataframe to store the meta data
    keywordsdf = pd.DataFrame(columns = ['_id','keywords'])
    while i < runs+1:
        if len(idlist)<100:
            sample = idlist
        elif i == 0:
            sample = idlist[i:(i+1)*100]
        elif i == runs:
            sample = idlist[i*100:len(idlist)]
        else:
            sample = idlist[i*100:(i+1)*100]
        sample_ids = separator.join(sample)
        r = requests.post("https://api.outbreak.info/resources/query/", params = {'q': sample_ids, 'scopes': '_id', 'fields': 'keywords'})
        if r.status_code == 200:
            rawresult = pd.read_json(r.text)
            cleanresult = rawresult[['_id','keywords']].loc[rawresult['_score']==1].copy()
            cleanresult.drop_duplicates(subset='_id',keep="first", inplace=True)
            keywordsdf = pd.concat((keywordsdf,cleanresult))
        i=i+1
    return(keywordsdf)




In [4]:
def load_search_terms():
    keyword_dict = {}
    for eachfile in KEYFILES:
        filename = eachfile.split('.')[0]
        keywords = []
        with open(os.path.join(KEYPATH,eachfile),'r') as readfile:
            for eachline in readfile:
                keywords.append(eachline.strip())
        keyword_dict[filename]=keywords
    return(keyword_dict)


def load_category_ids():
    keyword_dict = load_search_terms()
    cat_dict = {}
    for category in keyword_dict.keys():
        allids = []
        keywordlist = keyword_dict[category]
        for eachkey in keywordlist:
            idlist = get_query_ids(eachkey)
            allids = list(set(allids).union(set(idlist)))
        cat_dict[category]=allids
    return(cat_dict)


def load_litcovid_ids():
    litcovid = read_csv(os.path.join(DATAPATH,'litcovidtopics.tsv'),delimiter='\t',header=0,index_col=0)
    topics = litcovid['topicCategory'].unique().tolist()
    litcovid_dict={}
    for eachtopic in topics:
        tmplist = litcovid['_id'].loc[litcovid['topicCategory']==eachtopic].tolist()
        litcovid_dict[eachtopic]=tmplist
    return(litcovid_dict)

In [5]:
#### Obtain the most frequent (ie- generic) keywordss from litcovidtopics
#### Default is top 100 most generic, but this can be adjusted
def get_generic_terms(top=100):
    litcovid = read_csv(os.path.join(DATAPATH,'litcovidtopics.tsv'),delimiter='\t',header=0,index_col=0)
    idlist = litcovid['_id'].unique().tolist()
    tmpdf = batch_fetch_kw(idlist)
    kwdf = tmpdf.set_index('_id').keywords.apply(pd.Series).stack().reset_index(level=-1, drop=True).astype(str).reset_index()
    kwdf.rename(columns={0:'keywords'},inplace=True)
    kwdf['keywords']=kwdf['keywords'].astype(str).str.lower()
    kwfreq = kwdf.groupby('keywords').size().reset_index(name='count')
    kwfreq.sort_values('count',ascending=False,inplace=True)
    return(kwfreq.head(n=top).copy())


def get_kw_freq(topicCategory):
    cat_dict = load_category_ids()
    litcovid_dict = load_litcovid_ids()
    if topicCategory in cat_dict.keys():
        idlist = cat_dict[topicCategory]
    else:
        idlist = litcovid_dict[topicCategory]
    tmpdf = batch_fetch_kw(idlist)
    kwdf = tmpdf.set_index('_id').keywords.apply(pd.Series).stack().reset_index(level=-1, drop=True).astype(str).reset_index()
    kwdf.rename(columns={0:'keywords'},inplace=True)
    kwdf['keywords']=kwdf['keywords'].astype(str).str.lower()
    kwfreq = kwdf.groupby('keywords').size().reset_index(name='count')
    kwfreq.sort_values('count',ascending=False,inplace=True)
    return(kwfreq)


#### Set the count of the generic keywords to -1, and re-sort the list
def negate_genkw(kwfreq,genkw):
    genkwlist = genkw['keywords'].tolist()
    kwfreq['count'].loc[kwfreq['keywords'].isin(genkwlist)]=-1
    kwfreq.sort_values('count',ascending=False,inplace=True)
    return(kwfreq)



#### Export the most frequent x% of keywords
#### Note that 5(%) is default, but any number can be used
#### Also, the topicCategory is required for filename purposes
def export_most_freqkw(kwfreq,topicCategory,toppercent=5):
    export_num = int(round(len(kwfreq)*toppercent/100,0))
    exportdf = kwfreq.head(n=export_num)
    filepath = RESULTSPATH+'wordfrequencies/'
    filename = topicCategory+'_'+str(toppercent)+'_percent_most_freq_kw.txt'
    exportdf.to_csv(os.path.join(filepath,filename),sep='\t',header=True)

In [None]:
print(len(cat_dict['Behavioral Research']))
print(len(cat_dict['Environment']))


In [6]:
#### Main Script
genkw = get_generic_terms()
cat_dict = load_category_ids()
for everytopic in cat_dict.keys():
    kwfreq = get_kw_freq(everytopic)
    kwfreq = negate_genkw(kwfreq,genkw)
    export_most_freqkw(kwfreq,everytopic)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


## Manual inspection
This section contains script that was run for the purposes of manually inspecting the data in order to determine suitable default cut off points and threshholds.

### How to determine if a keyword is suitable for inclusion on a topicCategory-specific list
1. Verify that a more generic version of the word is not already available: ie- if the keyword is 'manic depression', but the term 'depression' is already on the list, it should not be necessary to add 'manic depression'
2. If a more generic version of the word is not already available, perform a litcovid search for the term and scan the titles and abstracts. The top 10 most relevant results should be within topic, and 4/5 of the least relevant results should also be within topic.
3. If criteria 2 is met, check that 12/20 of the most recent results are also within topic. This is to help account for the creation of off-topic new terms/phrases which may enclose the keyword, or changes in connotations based on trends.
4. Also search the term on outbreak.info/resources to see if appropriate results pop up
5. If criteria 1-4 are met, add the term to the list


In [None]:
#### To do, expand on keywordlist pulling the keywords for each id, generate frequency list
#### Verify that most common conceptually related keywords are in the list

## fetch the keywords for every pmid in the environment list
environmentdf = batch_fetch_kw(cat_dict['Environment'])

## unstack the dataframe and lower case the keywords for consistency
envkwdf = environmentdf.set_index('_id').keywords.apply(pd.Series).stack().reset_index(level=-1, drop=True).astype(str).reset_index()
envkwdf.rename(columns={0:'keywords'},inplace=True)
envkwdf['keywords']=envkwdf['keywords'].astype(str).str.lower()

## Determine keyword frequency
kwfreq = envkwdf.groupby('keywords').size().reset_index(name='count')
kwfreq.sort_values('count',ascending=False,inplace=True)
print(len(kwfreq))

#### Inspect number of words within various frequency categories
## Identify length of keywords short enough for review
min20 = kwfreq.loc[kwfreq['count']>20]
min10 = kwfreq.loc[kwfreq['count']>10]
min5 = kwfreq.loc[kwfreq['count']>5]
min3 = kwfreq.loc[kwfreq['count']>3]
min2 = kwfreq.loc[kwfreq['count']>2]
print(len(min20),len(min10),len(min5),len(min3),len(min2))

#### export results and inspect for potential keywords that are specific enough to this topicCategory
## Do not use words that may be frequently used outside of this topicCategory
## Eg- Behavior: 'stress' is too generic, may refer to oxidative stress
## Eg- Environment: 'climate' is too generic, may refer to current trends or the work environment
## In this case, we're going with words that were mentioned by at least 3 pmids (ie- ~top 5% most frequent words)
min3.to_csv(os.path.join(RESULTSPATH,'wordfrequencies/environment_min3.txt'),sep='\t',header=True)

In [None]:
## fetch the keywords for every pmid in the environment list
behavedf = batch_fetch_kw(cat_dict['Behavioral Research'])

## unstack the dataframe and lower case the keywords for consistency
bhkwdf = behavedf.set_index('_id').keywords.apply(pd.Series).stack().reset_index(level=-1, drop=True).astype(str).reset_index()
bhkwdf.rename(columns={0:'keywords'},inplace=True)
bhkwdf['keywords']=bhkwdf['keywords'].astype(str).str.lower()

## Determine keyword frequency
kwfreq = bhkwdf.groupby('keywords').size().reset_index(name='count')
kwfreq.sort_values('count',ascending=False,inplace=True)

print(len(kwfreq))

#### Inspect number of words within various frequency categories
## Identify length of keywords short enough for review
min20 = kwfreq.loc[kwfreq['count']>20]
min10 = kwfreq.loc[kwfreq['count']>10]
min7 = kwfreq.loc[kwfreq['count']>7]
min5 = kwfreq.loc[kwfreq['count']>5]
min3 = kwfreq.loc[kwfreq['count']>3]
min2 = kwfreq.loc[kwfreq['count']>2]
print(len(min20),len(min10),len(min7),len(min5),len(min3),len(min2))

#### export results and inspect for potential keywords that are specific enough to this topicCategory
## Do not use words that may be frequently used outside of this topicCategory
## Eg- Behavior: 'stress' is too generic, may refer to oxidative stress
## Eg- Environment: 'climate' is too generic, may refer to current trends or the work environment
## In this case, we're going with words that were mentioned by at least 7 pmids (ie- ~top 5% most frequent words)
min7.to_csv(os.path.join(RESULTSPATH,'wordfrequencies/behavior_min7.txt'),sep='\t',header=True)