# Variants, lineages, and mutations set creation
Variants, lineages, and mutations are of interest but are not necessarily confined to any specific topicCategory. Classifying for these might help to search for VOCs and VOIs.

Use regex to find specific lineages or mutations.

Regex cheatcode:
* '\b' indicates word boundary
* '()' indicates a capture group
* '?:' indicates a non-capture group
* '\d' indicates a digit
* '{1,5}' indicates repeat in pattern between 1 and 5 times (so in mutation example, one to five digits)
* "r' " indicates raw string notation
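* '(?i)' ... '(?-i)' indicates that everything in between is case-insensitive (note: Python's `re` module does not accept a bare '(?-i)'; use the scoped form '(?i:...)' instead)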

In [1]:
import os
import requests
import json
import pandas as pd
from pandas import read_csv
from datetime import datetime


In [2]:
# Directory for intermediate data files (the cached text table is stored under 'data/')
DATAPATH = 'data/'
# Directory that receives the exported result tables (mutations, polymorphisms)
RESULTSPATH = 'results/'

In [3]:
#### Load all litcovid into a dataframe, and clean up the text for searching
import pandas as pd
import requests
from datetime import datetime

def get_ids_from_json(jsonfile):
    """Return the unique ``_id`` values of an API response in first-seen order.

    Parameters
    ----------
    jsonfile : dict
        Parsed JSON response holding a ``"hits"`` list whose entries each
        carry an ``"_id"`` key.

    Returns
    -------
    list
        De-duplicated ids, ordered by first appearance.

    Raises
    ------
    KeyError
        If ``"hits"`` is missing or a hit has no ``"_id"``.
    """
    # dict.fromkeys de-duplicates in O(n) while keeping insertion order; the
    # previous ``not in idlist`` test rescanned the list for every hit.
    return list(dict.fromkeys(eachhit["_id"] for eachhit in jsonfile["hits"]))

def batch_fetch_meta(idlist, batch_size=100):
    """Fetch name/abstract/description/date for the given ids in batches.

    The ids are sent to the outbreak.info resources API ``batch_size`` at a
    time so that a single POST request does not become too large.

    Parameters
    ----------
    idlist : iterable of str
        Record ids to look up.
    batch_size : int, optional
        Number of ids per request (default 100, the previous fixed value).

    Returns
    -------
    pandas.DataFrame
        Columns ``_id, abstract, name, description, date``. Only hits whose
        ``_score`` equals 1 are kept, one row per ``_id`` within a batch.
        The row index is whatever each batch's response produced, so labels
        can repeat across batches.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    # Local import: wrapping the payload avoids passing literal JSON to
    # read_json, which newer pandas versions deprecate.
    from io import StringIO

    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    columns = ['_id','abstract','name','description','date']
    idlist = list(idlist)
    separator = ','
    frames = []
    # range() stepping yields every id exactly once. The previous
    # round(len/100) arithmetic fetched lists of 51-99 ids twice and sent
    # empty requests for sizes such as 100 or 150.
    for start in range(0, len(idlist), batch_size):
        sample = idlist[start:start + batch_size]
        sample_ids = separator.join(sample)
        ## Get the text-based metadata (abstract, title) and save it
        r = requests.post("https://api.outbreak.info/resources/query/", params = {'q': sample_ids, 'scopes': '_id', 'fields': 'name,abstract,description,date'})
        if r.status_code != 200:
            # Still best-effort (the batch is skipped), but no longer silent
            print('metadata request failed with status', r.status_code,
                  'for ids', start, 'to', start + len(sample) - 1)
            continue
        rawresult = pd.read_json(StringIO(r.text))
        # Either text field may be absent from a response; add a blank
        # column so the selection below always succeeds
        for textcol in ('abstract', 'description'):
            if textcol not in rawresult.columns:
                rawresult[textcol] = " "
        cleanresult = rawresult[['_id','name','abstract','description','date']].loc[rawresult['_score']==1].fillna(" ").copy()
        cleanresult.drop_duplicates(subset='_id',keep="first", inplace=True)
        frames.append(cleanresult)
    if not frames:
        return pd.DataFrame(columns=columns)
    # One concat at the end instead of re-copying the accumulated table on
    # every batch; the column order matches the previous output
    textdf = pd.concat(frames)[columns]
    return(textdf)


def merge_texts(df):
    """Add a ``text`` column joining name, abstract and description.

    Missing values are replaced with empty strings in place, then the three
    fields are joined with single spaces as ``"<name> <abstract> <description>"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``name``, ``abstract`` and ``description`` columns.
        The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with a ``text`` column.
    """
    df.fillna('',inplace=True)
    # Cast all three parts to str: previously only name and abstract were
    # cast, so a non-string description made str.cat raise TypeError.
    parts = [df[col].astype(str) for col in ('name', 'abstract', 'description')]
    df['text'] = parts[0].str.cat(parts[1:], sep=' ')
    return(df)


def clean_texts(df):
    """Add a normalised, lower-cased ``cleantext`` column derived from ``text``.

    Steps: replace every non-word character with a space, collapse
    whitespace-delimited single letters into one space, then lower-case.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with a ``cleantext`` column.
    """
    df.fillna('',inplace=True)
    # regex=True is required: pandas >= 2.0 treats the pattern as a literal
    # string by default, which turned these replacements into no-ops.
    cleaned = df['text'].str.replace(r'\W', ' ', regex=True)
    cleaned = cleaned.str.replace(r'\s+[a-zA-Z]\s+', ' ', regex=True)
    # NOTE(review): '\^' matches a literal caret, which the first replacement
    # has already removed, so this step never matches. It probably was meant
    # to be the anchor '^'; left unchanged pending confirmation.
    cleaned = cleaned.str.replace(r'\^[a-zA-Z]\s+', ' ', regex=True)
    df['cleantext'] = cleaned.str.lower()
    return(df)


#### Get the size of the source (to make it easy to figure out when to stop scrolling)
def fetch_src_size(source):
    """Return the number of Publication records the API reports for ``source``.

    The request asks for zero hits plus an ``@type`` aggregation and reads
    the total from the ``facets`` section of the response.
    """
    query_url = "https://api.outbreak.info/resources/query?q=((@type:Publication) AND (curatedBy.name:"+source+"))&size=0&aggs=@type"
    payload = json.loads(requests.get(query_url).text)
    type_facet = payload["facets"]["@type"]
    return int(type_facet["total"])


#### Ping the API and get all the ids for a specific source and scroll through the source until number of ids matches meta
def get_source_ids(source):
    """Collect the ids of every Publication curated by ``source``.

    The first request starts a ``fetch_all`` scroll; further pages are pulled
    with the returned scroll id until the number of distinct ids reaches the
    size reported by ``fetch_src_size`` or the scroll is exhausted.

    Parameters
    ----------
    source : str
        Value matched against ``curatedBy.name``.

    Returns
    -------
    list
        Distinct ids in first-seen order. Previously this was a list or a
        set depending on whether scrolling happened; callers only wrap the
        result in ``set()``, so a list is compatible.
    """
    source_size = fetch_src_size(source)
    query_url = "https://api.outbreak.info/resources/query?q=((@type:Publication) AND (curatedBy.name:"+source+"))&fields=_id&fetch_all=true"
    response = json.loads(requests.get(query_url).text)
    # The dict keys serve as an insertion-ordered set of ids
    collected = dict.fromkeys(get_ids_from_json(response))
    scroll_id = response.get("_scroll_id")
    while scroll_id is not None and len(collected) < source_size:
        try:
            page = json.loads(requests.get(query_url+"&scroll_id="+scroll_id).text)
        except (requests.RequestException, ValueError) as err:
            # Stay best-effort: keep what was collected so far
            print("scroll request failed, returning partial id list:", err)
            break
        if "hits" not in page:
            # A page without hits marks the end of the scroll
            break
        known = len(collected)
        collected.update(dict.fromkeys(get_ids_from_json(page)))
        if len(collected) == known:
            # Without this guard the loop never ended when the reported size
            # could not be reached (e.g. it counts duplicated records)
            print("no new ids returned, stopping scroll")
            break
        if "_scroll_id" in page:
            scroll_id = page["_scroll_id"]
        else:
            print("no new scroll id")
    return(list(collected))


def get_pub_ids(sourceset):
    """Return the distinct publication ids for a named group of sources.

    ``sourceset`` is one of ``"preprint"``, ``"litcovid"``, ``"other"`` or
    ``"all"``; any other value raises ``KeyError``. The order of the returned
    list is not defined.
    """
    source_groups = {
        "preprint": ["bioRxiv", "medRxiv"],
        "litcovid": ["litcovid"],
        "other": ["Figshare", "Zenodo", "MRC Centre for Global Infectious Disease Analysis"],
        "all": ["Figshare", "Zenodo", "MRC Centre for Global Infectious Disease Analysis",
                "bioRxiv", "medRxiv", "litcovid"],
    }
    unique_ids = set()
    for source_name in source_groups[sourceset]:
        # Merge this source's ids into the running set
        unique_ids |= set(get_source_ids(source_name))
    return list(unique_ids)

In [94]:
# Run the download pipeline, printing the elapsed time of each stage.
# Stage 1: ids of every publication in the 'all' source group
starttime = datetime.now()
allids = get_pub_ids('all')
print('fetched all ids: ',datetime.now()-starttime)
# Stage 2: text metadata for those ids (batched API requests)
starttime = datetime.now()
metadf = batch_fetch_meta(allids)
print('fetched all metadata: ',datetime.now()-starttime)
# Stage 3: add the combined 'text' column (merge_texts modifies metadf in
# place and returns it, so textdf and metadf are the same object)
starttime = datetime.now()
textdf = merge_texts(metadf)
print('merged all text: ',datetime.now()-starttime)
print(textdf.head(n=2))

fetched all ids:  0:04:23.295269
fetched all metadata:  0:23:11.731127
merged all text:  0:00:00.660896
            _id                                           abstract  \
0  pmid33588991  Before the coronavirus 2019 (COVID-19) pandemi...   
1  pmid33778893                                                      

                                                name description  \
0  Antimicrobial resistance and COVID-19: Interse...               
1  Screening and Diagnostic Mammography Utilizati...               

                  date                                               text  
0  2021-02-18 00:00:00  Antimicrobial resistance and COVID-19: Interse...  
1  2021-03-29 00:00:00  Screening and Diagnostic Mammography Utilizati...  


In [95]:
# Cache the merged text table so the slow download does not have to be
# repeated; the path is built from DATAPATH like the exports built from RESULTSPATH
textdf.to_csv(os.path.join(DATAPATH,'textdf.txt'),sep='\t',header=True)

In [81]:
# Reload the cached text table. keep_default_na=False keeps text such as
# "NA" or "N/A" (and empty fields) as strings instead of turning them into
# NaN, which would break the .str searches below.
textdf = read_csv(os.path.join(DATAPATH,'textdf.txt'),delimiter='\t',header=0,index_col=0,keep_default_na=False)

### To do
1. Find and extract mutations
2. Check frequency of mutations in publications to see if trends can be identified
3. Train algorithms to see if any new true positives can be identified
4. Do the same for lineages

In [5]:
#### Mutation training set creation
import re

# Gene names as used in "GENE:mutation" style notation (non-capturing group)
genes = "(?:ORF1a|ORF1b|S|ORF3a|ORF3b|E|M|ORF6|ORF7a|ORF7b|ORF8|N|ORF9b|ORF10|ORF14|3'UTR|3UTR)"
#proteins = "S|E|N|M|(NSP|Nsp(?:1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16))"
# Gene/protein mentions in running text, delimited by word boundaries
geneprots = r"\b(?:ORF1a|ORF1b|Spike|spike|ORF3a|ORF3b|Envelope|envelope|M protein|M\(pro\)|ORF6|ORF7a|ORF7b|ORF8|ORF9b|ORF10|ORF14|3'UTR|3UTR|(?:(?:NSP|nsp|Nsp|N)(?:1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16)))\b"
# Amino-acid substitution: one-letter residue, 1-5 digit position, one-letter
# residue. Now a raw string: '\d' in a normal string literal is an invalid
# escape sequence (a warning today, an error in future Python versions). The
# resulting pattern text is unchanged.
mutations = r"((?:A|C|D|E|F|G|H|I|K|L|M|N|P|Q|R|S|T|V|W|Y)\d{1,5}(?:A|C|D|E|F|G|H|I|K|L|M|N|P|Q|R|S|T|V|W|Y))"
#mutations = "^(?:A|C|D|E|F|G|H|I|K|L|M|N|P|Q|R|S|T|V|W|Y)\d{1,5}(?:A|C|D|E|F|G|H|I|K|L|M|N|P|Q|R|S|T|V|W|Y)$"

#deletions = genes+":"+"(?:DEL|Del|del)"+"\d{}"
#deletion_ex = "ORF8∆381, ORF7a∆81 and spike∆15, ∆15, 60/70-deletion"
#deletion_variant = r"\b(∆\d{1,5} variant)\b"

# Regex patterns used to pull mutation mentions out of the publication text.
# Each pattern wraps the whole mention in a single capture group, so
# str.findall returns the mention itself.
token_dict = {
    # Bare substitution such as E484K: residue, 2-5 digit position, residue
    'mutants':r'\b((?:A|C|D|E|F|G|H|I|K|L|M|N|P|Q|R|S|T|V|W|Y)\d{2,5}(?:A|C|D|E|F|G|H|I|K|L|M|N|P|Q|R|S|T|V|W|Y))\b',
    #'genemute':r"\b((?:ORF1a|ORF1b|S|ORF3a|ORF3b|E|M|ORF6|ORF7a|ORF7b|ORF8|N|ORF9b|ORF10|ORF14|3'UTR|3UTR):(?:A|C|D|E|F|G|H|I|K|L|M|N|P|Q|R|S|T|V|W|Y)\d{1,5}(?:A|C|D|E|F|G|H|I|K|L|M|N|P|Q|R|S|T|V|W|Y))\b",
    # Gene/protein name, then whitespace or ':', then a substitution (1-5 digit position)
    'genemute':r"\b((?:ORF1a|ORF1b|S|Spike|spike|ORF3a|ORF3b|E|Envelope|envelope|M|M protein|M\(pro\)|ORF6|ORF7a|ORF7b|ORF8|ORF9b|ORF10|ORF14|3'UTR|3UTR|(?:(?:NSP|nsp|Nsp|N)(?:1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16)))(?:\s|:)(?:A|C|D|E|F|G|H|I|K|L|M|N|P|Q|R|S|T|V|W|Y)\d{1,5}(?:A|C|D|E|F|G|H|I|K|L|M|N|P|Q|R|S|T|V|W|Y))\b",
    # Gene/protein name directly followed by '∆' or DEL/Del/del (optionally ':'-prefixed) and 1-5 digits
    'deletions':r"\b((?:ORF1a|ORF1b|S|Spike|spike|ORF3a|ORF3b|E|Envelope|envelope|M|M protein|M\(pro\)|ORF6|ORF7a|ORF7b|ORF8|ORF9b|ORF10|ORF14|3'UTR|3UTR|(?:(?:NSP|nsp|Nsp|N)(?:1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16)))(?:∆|(?:DEL|Del|del|:DEL|:Del|:del))\d{1,5})\b",
    # Gene/protein name, whitespace, a position or 'pos/pos' pair, then ' deletion' or '-deletion'
    'nonspec_deletion':r"\b((?:ORF1a|ORF1b|S|Spike|spike|ORF3a|ORF3b|E|Envelope|envelope|M|M protein|M\(pro\)|ORF6|ORF7a|ORF7b|ORF8|N|ORF9b|ORF10|ORF14|3'UTR|3UTR|(?:NSP|Nsp|nsp)(?:1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16))\s(?:(?:\d{1,5})(?:\/)(?:\d{1,5})|(?:\d{1,5}))(?:\s|-)(?:deletion))\b"
}

# Hand-written sample sentence mixing the notations the token_dict patterns
# target (substitutions, gene-prefixed mutations, deletions) with lineage-like
# strings; handy for trying the patterns interactively.
testtext = "This gene is full of ORF1a∆123 variants or other kinds of NSP deletion variants. There are also various mutations like S:E484K which is also known just as E484K which is a spike gene mutation. There are also deletions on the spike protein such as S:DEL15 or S∆15 though it could also be S:del15 so to speak or spike:del15 or Sdel15. Imagine if there was an ORF1a:A645T or E:C163G mutation. There are also nonspecific deletions like 60/70-deletion or 145/162-deletion or ∆15. There can also be mutations or deletions in the NSP genes or proteins such as Nsp2∆115 or NSP1:del115. Is it b.1.91 going to work or will any sort of lineageb.1.91 work nope and the spike 60/70-deletion or maybe the NSP2 459 deletion"

In [49]:
def lowerlist(x):
    """Return the distinct lower-cased forms of the strings in ``x``.

    Duplicates that differ only by case collapse into one entry; the order
    of the returned list is not defined.
    """
    return list({entry.lower() for entry in x})


In [51]:
# Build one row per (publication, mutation mention) for every pattern in token_dict
mutation_columns = ['_id','name','abstract','description','text','date','mutations']
mutation_frames = []
for eachpattern in token_dict.values():
    # One findall pass per pattern: a non-empty result is exactly the case
    # where str.contains would have matched. This halves the regex work and
    # avoids the "pattern has match groups" warning that contains() emits.
    found = textdf['text'].str.findall(eachpattern)
    has_match = found.str.len().gt(0)  # NaN (missing text) counts as no match
    tmpdf = textdf.loc[has_match].copy()
    # Assign by position: the index labels of textdf may repeat
    tmpdf['mutations'] = found.loc[has_match].to_numpy()
    mutation_frames.append(tmpdf.explode('mutations'))
if mutation_frames:
    # Single concat rather than growing an initially empty frame
    mutationslist = pd.concat(mutation_frames, ignore_index=True)
    extra_columns = [col for col in mutationslist.columns if col not in mutation_columns]
    mutationslist = mutationslist[mutation_columns + extra_columns]
else:
    mutationslist = pd.DataFrame(columns=mutation_columns)
mutationslist['date'] = pd.to_datetime(mutationslist['date'])
# De-duplicate before gene_mentions is added: that column holds lists, which
# drop_duplicates cannot hash
mutationslist.drop_duplicates(keep='first',inplace=True)
mutationslist['gene_mentions'] = mutationslist['text'].str.findall(geneprots).apply(lowerlist)
print(mutationslist.head(n=2))

  return func(self, *args, **kwargs)


                  _id                                               name  \
0   2021.04.25.441361  Conserved in 186 countries the RBD fraction of...   
1   2021.04.25.441361  Conserved in 186 countries the RBD fraction of...   
2   2021.04.25.441361  Conserved in 186 countries the RBD fraction of...   
3   2021.04.25.441361  Conserved in 186 countries the RBD fraction of...   
28       pmid33525415  Chloroquine and Hydroxychloroquine Interact Di...   
29       pmid33525415  Chloroquine and Hydroxychloroquine Interact Di...   
30  2021.04.02.438288  An emerging SARS-CoV-2 mutant evading cellular...   
31  2021.04.02.438288  An emerging SARS-CoV-2 mutant evading cellular...   
33       pmid33184173  Analysis of genomic distributions of SARS-CoV-...   
34       pmid33184173  Analysis of genomic distributions of SARS-CoV-...   
35       pmid33184173  Analysis of genomic distributions of SARS-CoV-...   
36       pmid33184173  Analysis of genomic distributions of SARS-CoV-...   
37       pmi

In [52]:
# Export the identifying columns together with the extracted mutation and gene mentions
export_columns = ['_id','name','date','mutations','gene_mentions']
mutationsclean = mutationslist.loc[:, export_columns].copy()
mutations_path = os.path.join(RESULTSPATH,'mutations.tsv')
mutationsclean.to_csv(mutations_path,sep='\t',header=True)

In [91]:
# Mentions from publications whose text contains "polymorphism"
humefactors = mutationslist.loc[mutationslist['text'].str.contains("polymorphism")].copy()
# gene_mentions holds lists, which drop_duplicates cannot hash (TypeError), so
# compare on every other column. gene_mentions is derived from 'text', so no
# distinguishing information is lost.
dedupe_columns = [col for col in humefactors.columns if col != 'gene_mentions']
humefactors.drop_duplicates(subset=dedupe_columns,keep="first",inplace=True)
humefactors.drop(columns=['abstract','text','description'],inplace=True)
humefactors.to_csv(os.path.join(RESULTSPATH,'polymorphisms.tsv'),sep='\t',header=True)

In [104]:
# Size of the full text table, for reference
print(len(textdf))
#mutdf = textdf.loc[(textdf['text'].str.contains(token_dict['mutants']))].copy()
#mutdf['mutations'] = mutdf['text'].str.findall(token_dict['mutants'])

#mutdf['date'] = pd.to_datetime(mutdf['date'])
#mutationslist = mutdf.explode('mutations').copy()
#mutationslist.drop(['abstract','name','description'],axis=1,inplace=True)
#sortedmutations = mutationslist.sort_values(['mutations','date'],ascending=[True,True])
#mutationfrequency = mutationslist.groupby('mutations').size().reset_index(name='counts')
# Count mentions per mutation in weekly bins ('W-Mon') of the publication date
mutationfrequency = mutationslist.groupby('mutations').resample('W-Mon', on='date').size().reset_index(name='counts').sort_values(by='date')
# Most recent weeks first; within a week, most-mentioned mutations first
mutationfrequency.sort_values(['date','counts'],ascending=[False,False],inplace=True)
print(mutationslist.head(n=5))
print(mutationfrequency.head(n=20))
#singlementions = mutationfrequency['mutations'].loc[mutationfrequency['counts']<2].unique().tolist()
#mutations2check = mutationslist.loc[mutationslist['mutations'].isin(singlementions)]
#mutations2check.to_csv('data/variants/mutations_to_check.tsv',sep='\t',header=True)

143248
                 _id                                               name  \
0  2021.04.25.441361  Conserved in 186 countries the RBD fraction of...   
1  2021.04.25.441361  Conserved in 186 countries the RBD fraction of...   
2  2021.04.25.441361  Conserved in 186 countries the RBD fraction of...   
3  2021.04.25.441361  Conserved in 186 countries the RBD fraction of...   
4  2021.04.25.441361  Conserved in 186 countries the RBD fraction of...   

                                            abstract description  \
0  SARS-CoV-2 developed global-pandemic with mill...               
1  SARS-CoV-2 developed global-pandemic with mill...               
2  SARS-CoV-2 developed global-pandemic with mill...               
3  SARS-CoV-2 developed global-pandemic with mill...               
4  SARS-CoV-2 developed global-pandemic with mill...               

                                                text       date mutations  
0  Conserved in 186 countries the RBD fraction of... 2021

In [93]:
# Spot check: list every publication whose text mentions E484K
# (the commented lines are earlier attempts using the 'genemute' pattern)
#genemutdf = textdf.loc[textdf['text'].str.contains(token_dict['genemute'],case=False)].copy()
#genemutdf = textdf.loc[textdf['text'].str.extract(token_dict['genemute'])].copy()
genemutdf = textdf.loc[textdf['text'].str.contains('E484K')].copy()
print(genemutdf)

                    _id                                           abstract  \
27  2020.12.23.20248598  In this study, we report the sequencing of 180...   
28         pmid33917138  A new variant of SARS-CoV-2 B.1.351 lineage (f...   
52         pmid33707329  Here, we report the coding-complete genome seq...   
52  2021.04.27.21255987  BackgroundThe emergence of SARS-CoV-2 variants...   
2     2021.03.15.435528  Understanding the ability of SARS-CoV-2 vaccin...   
..                  ...                                                ...   
75         pmid33853970  Cases of SARS-CoV-2 infection in Manaus, Brazi...   
59    2021.04.09.439181  The novel {beta}-coronavirus has caused sad lo...   
39    2021.03.06.434059  A SARS-CoV-2 lineage designated as P.3 with mu...   
88    2021.04.21.440801  Rapid whole genome sequencing of SARS-CoV-2 ha...   
24    2021.03.24.436620  DNA sequence analysis recently identified the ...   

                                                 name descripti

In [None]:
#### Variant training set creation


"""
Variant of Concern 202012/01", "VOC-202012/01", "20B/501Y.V1", "20I/501Y.V1", the 501Y.V2 variant
Variant Under Investigation 202012/01 (VUI 202012/01 for short)
"the {location} variant"
"variant in {location}"
"""

# Case-insensitive match for "the <name> variant". The scoped flag group
# (?i:...) replaces the former "(?i) ... (?-i)" pair: Python's re rejects a
# bare (?-i) and, from 3.11 on, a global (?i) that is not at the very start
# of the pattern.
the_variant = r"((?i:the (?:\w*|\w*.\w*) variant))"


In [84]:
#### Lineage training set creation
## More lineage names: "https://www.the-scientist.com/news-opinion/a-guide-to-emerging-sars-cov-2-variants-68387"

## fetch lineages from :"https://cov-lineages.org/lineages.html"
# Download the pango lineage designation table and skip malformed rows.
# error_bad_lines was removed in pandas 2.0; on_bad_lines='skip' is its
# replacement (pandas >= 1.3), with a fallback for older installations.
lineage_url = "https://raw.githubusercontent.com/cov-lineages/pango-designation/master/lineages.csv"
try:
    lineagetable = read_csv(lineage_url,on_bad_lines='skip',header=0)
except TypeError:
    # pandas < 1.3 does not know on_bad_lines
    lineagetable = read_csv(lineage_url,error_bad_lines=False,header=0)
# Keep lineage names longer than two characters
lineages = lineagetable['lineage'].loc[lineagetable['lineage'].str.len()>2].unique().tolist()
print(lineagetable,len(lineages))

                              taxon lineage
0           Guangzhou/GZMU0016/2020       A
1       Beijing/Wuhan_IME-BJ07/2020       A
2                Nanchang/JX90/2020       A
3              USA/CA-CZB-1240/2020       A
4               Nanchang/JX177/2020       A
...                             ...     ...
456347        Brazil/MG-LBI263/2021     P.1
456348        Brazil/SP-819690/2021     P.1
456349        Brazil/SP-827441/2021     P.1
456350        Brazil/SP-827466/2021     P.1
456351        Brazil/SP-827777/2021     P.1

[456352 rows x 2 columns] 1286
Empty DataFrame
Columns: [taxon, lineage]
Index: []


In [None]:
## Generic search terms for filtering
## (joined with '|' further down and used to keep only lineage hits whose
## text also contains at least one of these words)
filter_terms = ["variant","voi","voc","mutant","mutation","lineage","strain","species","clade","branch"]

In [73]:
## Fetch lineages from Wikidata
## Three SPARQL queries, each returning an item with its English label and
## alternative labels. They differ only in the property/target pair used to
## select items (wdt:P31 or wdt:P279 against a fixed Q-id).
## NOTE(review): what each Q-id stands for is not stated here - confirm on Wikidata.
querylist = [
    """
    SELECT
      ?item ?itemLabel ?itemAltLabel
    WHERE 
    {
      ?item wdt:P31 wd:Q104450895.        
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }""", 
    """
    SELECT 
      ?item ?itemLabel ?itemAltLabel
    WHERE 
    {
      ?item wdt:P279 wd:Q82069695.
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    """,
    """
    SELECT 
      ?item ?itemLabel ?itemAltLabel
    WHERE 
    {
      ?item wdt:P31 wd:Q105758262.
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    """    
]

def variant_names(querylist):
    """Run SPARQL queries against Wikidata and tabulate variant names and aliases.

    Parameters
    ----------
    querylist : list of str
        SPARQL queries returning ``?itemLabel`` and, optionally,
        ``?itemAltLabel`` bindings.

    Returns
    -------
    pandas.DataFrame
        De-duplicated rows with columns ``name`` (the item label) and
        ``alias`` (the label itself, or one of its comma-separated
        alternative labels longer than three characters once stripped).
        An empty ``querylist`` gives an empty frame that still has both
        columns.
    """
    from collections import OrderedDict
    url = 'https://query.wikidata.org/sparql'
    variants = []
    for query in querylist:
        r = requests.get(url, params = {'format': 'json', 'query': query})
        data = r.json()
        for item in data['results']['bindings']:
            label = item['itemLabel']['value']
            variants.append(OrderedDict([('name', label), ('alias', label)]))
            # Explicit lookup instead of a bare ``except`` around the whole
            # body, which also hid unrelated errors
            altlabels = item.get('itemAltLabel', {}).get('value')
            if altlabels is None:
                # Kept for output compatibility: the old fallback also added
                # the whitespace-stripped label (de-duplicated below when it
                # equals the raw label)
                stripped = label.strip()
                variants.append(OrderedDict([('name', stripped), ('alias', stripped)]))
                continue
            for altname in altlabels.split(','):
                if len(altname.strip())>3:
                    variants.append(OrderedDict([('name', label), ('alias', altname)]))
    # Naming the columns keeps them present when nothing was returned
    wikivariants = pd.DataFrame(variants, columns=['name', 'alias'])
    wikivariants.drop_duplicates(keep='first',inplace=True)
    return(wikivariants)

In [80]:
# Map every lower-cased, stripped alias to its lower-cased, stripped canonical
# name; when an alias occurs more than once the last row wins
wikivariants = variant_names(querylist)
wikidict = {
    alias.lower().strip(): name.lower().strip()
    for alias, name in zip(wikivariants['alias'], wikivariants['name'])
}

## Lineage search, no additional regex formatting
The issue with this method is that the periods (.) in lineage names are not escaped, so each one matches any character. As a result, unrelated terms such as N95 end up being included. An additional filtering step is then needed, which may cut out relevant entries.

In [82]:
# First attempt: search for the raw lineage names / aliases without escaping
# them. The unescaped pattern is kept on purpose, because this cell
# illustrates the problem described in the section header.
masterlist = list(set(lineages).union(set(wikivariants['alias'].tolist())))
searchterm = ' | '.join(masterlist)
filterterms = '|'.join(filter_terms)
tmpdf = textdf.loc[textdf['text'].str.contains(searchterm)].copy()
tmpdf['lineages'] = tmpdf['text'].str.findall(searchterm)
rawlineageslist = tmpdf.explode('lineages').copy()
# Keep only hits whose text also contains one of the generic filter terms
cleanlineageslist = rawlineageslist.loc[rawlineageslist['text'].str.contains(filterterms)].copy()
cleanlineageslist['lineages'] = [x.strip() for x in cleanlineageslist['lineages']]

print(len(rawlineageslist))
print(len(cleanlineageslist))

# Assign the result back: an inplace replace on a column selection is a
# chained assignment and does not update the frame under copy-on-write
cleanlineageslist['lineages'] = cleanlineageslist['lineages'].replace(wikidict)
cleanlineageslist.drop_duplicates(keep='first',inplace=True)
print(len(cleanlineageslist))
print(cleanlineageslist.head(n=10))

5240
1922
900
                    _id                                           abstract  \
85         pmid33580111  SARS-CoV-2 has spread rapidly around the world...   
85         pmid33580111  SARS-CoV-2 has spread rapidly around the world...   
29  2020.12.23.20248598  In this study, we report the sequencing of 180...   
23    2021.04.23.441186  Severe acute respiratory syndrome coronavirus ...   
30         pmid33917138  A new variant of SARS-CoV-2 B.1.351 lineage (f...   
68    2021.02.18.431484  Several SARS-CoV-2 vaccines have received EUAs...   
68    2021.02.18.431484  Several SARS-CoV-2 vaccines have received EUAs...   
52         pmid33618008  OBJECTIVES: To evaluate the genomic diversity ...   
52         pmid33618008  OBJECTIVES: To evaluate the genomic diversity ...   
52         pmid33618008  OBJECTIVES: To evaluate the genomic diversity ...   

                                                 name description  \
85  SARS-CoV-2 genomic surveillance in Rondônia, B...     

In [83]:
# Number of rows per lineage name, most frequent first
lineagecounts = (
    cleanlineageslist.groupby('lineages')
    .size()
    .reset_index(name='publication counts')
    .sort_values('publication counts', ascending=False)
)
print(lineagecounts)

            lineages  publication counts
98   Lineage B.1.1.7                 252
100  Lineage B.1.351                 134
122              P.1                  79
115              N95                  59
23               B.1                  26
..               ...                 ...
55         B.1.36.27                   1
54         B.1.36.10                   1
52         B.1.351)3                   1
49         B.1.243.1                   1
137              ZO1                   1

[138 rows x 2 columns]


## Additional regex attempts

In [None]:
# Rebuild the combined name list and the joined search/filter strings for the
# regex experiments below (the commented lines are an abandoned attempt at
# wrapping the joined string in case-insensitive inline flags)
masterlist = list(set(lineages).union(set(wikivariants['alias'].tolist())))
#print(len(masterlist))
searchterm = ' | '.join(masterlist)
#re_str1 = r'\b(?i)('
#re_str2 = r')(?-i)'
#rawstring = r"{}".format(searchterm)
#searchregex = re.compile(re_str1 + rawstring + re_str2)
filterterms = '|'.join(filter_terms)

In [148]:
# Escape every lineage name / alias so that '.' and other metacharacters
# match literally
masterlist = list(set(lineages).union(set(wikivariants['alias'].tolist())))
# Longest names first (ties broken alphabetically): a regex alternation takes
# the first alternative that matches, so with the arbitrary set order a short
# name such as "B.1" could win over "B.1.1.7" and the result changed between runs
regexlist = [
    re.escape(eachitem)
    for eachitem in sorted(masterlist, key=lambda name: (-len(name), name))
]
    

In [150]:
# One case-insensitive alternation over all escaped names
searchregex = re.compile('|'.join(regexlist), re.IGNORECASE)
# Single findall pass: a non-empty result is exactly the case where
# str.contains would have matched, so the large pattern is not run twice
lineagematches = textdf['text'].str.findall(searchregex)
has_lineage = lineagematches.str.len().gt(0)  # NaN (missing text) counts as no match
lineagedf = textdf.loc[has_lineage].copy()
# Assign by position: the index labels of textdf may repeat
lineagedf['lineages'] = lineagematches.loc[has_lineage].to_numpy()
print(len(lineagedf))

964


In [151]:
# One row per (publication, lineage mention)
rawlineageslist = lineagedf.explode('lineages').copy()
#cleanlineageslist = rawlineageslist.loc[rawlineageslist['text'].str.contains(filterterms)].copy()
cleanlineageslist = rawlineageslist[['_id','name','date','lineages']].copy()
# Normalise to the stripped, lower-case form used for the wikidict keys
cleanlineageslist['lineages'] = cleanlineageslist['lineages'].str.strip().str.lower()

print(len(rawlineageslist))
print(len(cleanlineageslist))

# Map aliases to canonical names. The result is assigned back because an
# inplace replace on a column selection is a chained assignment and does not
# update the frame under copy-on-write.
cleanlineageslist['lineages'] = cleanlineageslist['lineages'].replace(wikidict)
cleanlineageslist.drop_duplicates(keep='first',inplace=True)
print(len(cleanlineageslist))
print(cleanlineageslist.head(n=10))
# Path built from RESULTSPATH, like the other exports
cleanlineageslist.to_csv(os.path.join(RESULTSPATH,'lineages.tsv'),sep='\t',header=True)

2802
2802
1667
                    _id                                               name  \
68    2021.04.02.438288  An emerging SARS-CoV-2 mutant evading cellular...   
68    2021.04.02.438288  An emerging SARS-CoV-2 mutant evading cellular...   
73         pmid33788923  Estimation of secondary household attack rates...   
73         pmid33788923  Estimation of secondary household attack rates...   
85         pmid33580111  SARS-CoV-2 genomic surveillance in Rondônia, B...   
85         pmid33580111  SARS-CoV-2 genomic surveillance in Rondônia, B...   
63         pmid33147321  An unconventional view of COVID-19 T cell immu...   
97         pmid32576424  Stress, Anxiety, and Depression in People Aged...   
29  2020.12.23.20248598  Genomic characterization of a novel SARS-CoV-2...   
99         pmid33881861  Fast Prediction of Binding Affinities of the S...   

                   date         lineages  
68  2021-04-05 00:00:00          b.1.427  
68  2021-04-05 00:00:00            b.1.2

In [152]:
# Number of rows per normalised lineage name, most frequent first
frequency = (
    cleanlineageslist.groupby('lineages')
    .size()
    .reset_index(name='counts')
    .sort_values('counts', ascending=False)
)
print(frequency.head(n=10))
print(frequency.tail(n=10))

            lineages  counts
35           b.1.1.7     315
47           b.1.351     138
122              p.1     104
128              s.1      88
21             b.1.1      68
20               b.1      59
103  lineage b.1.1.7      47
49   b.1.351 variant      45
36   b.1.1.7 lineage      43
132              v.1      39
            lineages  counts
24         b.1.1.117       1
68              b.27       1
105  lineage b.1.429       1
67              b.26       1
65              b.15       1
64              b.13       1
61   b.1.617 variant       1
111              m.3       1
18              an.1       1
27         b.1.1.251       1
