# Load libraries 

In [111]:
import os
import json
import pickle
import nltk
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data 

In [5]:
# looks like in my development environment, I need \\ in between directory names for paths to work
data_path = "C:\\Users\\fyuan14\\Documents\\fcg\\kaggle_group\\CORD-19-research-challenge"
assert(os.path.exists(data_path))

In [27]:
# data is populated across 4 directories
data_dir1 = "biorxiv_medrxiv\\biorxiv_medrxiv"
data_dir2 = "comm_use_subset\\comm_use_subset"
data_dir3 = "custom_license\\custom_license"
data_dir4 = "noncomm_use_subset\\noncomm_use_subset"

all_data_dir_names = [data_dir1, data_dir2, data_dir3, data_dir4]
for a in all_data_dir_names:
    assert(os.path.exists(os.path.join(data_path, a)))

# Process data 

## Unit test for reading in data 

In [87]:
with open(test_json) as json_data:
    test_data = json.load(json_data)

In [88]:
test_data

{'paper_id': '0015023cc06b5362d332b3baf348d11567ca2fbb',
 'metadata': {'title': 'The RNA pseudoknots in foot-and-mouth disease virus are dispensable for genome replication but essential for the production of infectious virus. 2 3',
  'authors': [{'first': 'Joseph',
    'middle': ['C'],
    'last': 'Ward',
    'suffix': '',
    'affiliation': {},
    'email': ''},
   {'first': 'Lidia',
    'middle': [],
    'last': 'Lasecka-Dykes',
    'suffix': '',
    'affiliation': {},
    'email': ''},
   {'first': 'Chris',
    'middle': [],
    'last': 'Neil',
    'suffix': '',
    'affiliation': {},
    'email': ''},
   {'first': 'Oluwapelumi',
    'middle': [],
    'last': 'Adeyemi',
    'suffix': '',
    'affiliation': {},
    'email': ''},
   {'first': 'Sarah',
    'middle': [],
    'last': '',
    'suffix': '',
    'affiliation': {},
    'email': ''},
   {'first': '',
    'middle': [],
    'last': 'Gold',
    'suffix': '',
    'affiliation': {},
    'email': ''},
   {'first': 'Niall',
    'mid

In [42]:
type(test_data)

dict

In [43]:
list(test_data.keys())

['paper_id',
 'metadata',
 'abstract',
 'body_text',
 'bib_entries',
 'ref_entries',
 'back_matter']

## Actually read in data 

In [46]:
'''
Accepts argument for path to directory where JSON data is stored.
This function iterates through the directory and gets all the files. After processing a file,
it is appended to data_list.

Return: data_list
'''
def read_paper_json(path):
    data_list = []
    for entry in os.scandir(path):
        path = entry.path
        with open(path) as json_data:
            data = json.load(json_data)
            data_list.append(data)

    return data_list

In [50]:
data_list0 = read_paper_json(os.path.join(data_path, all_data_dir_names[0]))
print(len(data_list0))

data_list1 = read_paper_json(os.path.join(data_path, all_data_dir_names[1]))
print(len(data_list1))

data_list2 = read_paper_json(os.path.join(data_path, all_data_dir_names[2]))
print(len(data_list2))

data_list3 = read_paper_json(os.path.join(data_path, all_data_dir_names[3]))
print(len(data_list3))

885
9118
16959
2353


# Explore JSON entries and hopefully unpack them

In [51]:
test_data = data_list0[0]
print(list(test_data.keys()))

['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter']


In [54]:
test_data['body_text']

[{'text': 'VP3, and VP0 (which is further processed to VP2 and VP4 during virus assembly) (6). The P2 64 and P3 regions encode the non-structural proteins 2B and 2C and 3A, 3B (1-3) (VPg), 3C pro and 4 structural protein-coding region is replaced by reporter genes, allow the study of genome 68 replication without the requirement for high containment (9, 10) ( figure 1A ).',
  'cite_spans': [],
  'ref_spans': [{'start': 351,
    'end': 360,
    'text': 'figure 1A',
    'ref_id': 'FIGREF50'}],
  'section': ''},
 {'text': 'The FMDV 5′ UTR is the largest known picornavirus UTR, comprising approximately 1300 71 nucleotides and containing several highly structured regions. The first 360 nucleotides at the 5′ 72 end are predicted to fold into a single large stem loop termed the S-fragment, followed by a The PKs were originally predicted in 1987 and consist of two to four tandem repeats of a ~48 86 nucleotide region containing a small stem loop and downstream interaction site (figure 1B) 87 (1

In [90]:
test_data['paper_id']

'0015023cc06b5362d332b3baf348d11567ca2fbb'

In [91]:
test_data['metadata']

{'title': 'The RNA pseudoknots in foot-and-mouth disease virus are dispensable for genome replication but essential for the production of infectious virus. 2 3',
 'authors': [{'first': 'Joseph',
   'middle': ['C'],
   'last': 'Ward',
   'suffix': '',
   'affiliation': {},
   'email': ''},
  {'first': 'Lidia',
   'middle': [],
   'last': 'Lasecka-Dykes',
   'suffix': '',
   'affiliation': {},
   'email': ''},
  {'first': 'Chris',
   'middle': [],
   'last': 'Neil',
   'suffix': '',
   'affiliation': {},
   'email': ''},
  {'first': 'Oluwapelumi',
   'middle': [],
   'last': 'Adeyemi',
   'suffix': '',
   'affiliation': {},
   'email': ''},
  {'first': 'Sarah',
   'middle': [],
   'last': '',
   'suffix': '',
   'affiliation': {},
   'email': ''},
  {'first': '',
   'middle': [],
   'last': 'Gold',
   'suffix': '',
   'affiliation': {},
   'email': ''},
  {'first': 'Niall',
   'middle': [],
   'last': 'Mclean',
   'suffix': '',
   'affiliation': {},
   'email': ''},
  {'first': 'Caroline

In [92]:
test_data['bib_entries']

{'BIBREF0': {'ref_id': 'b0',
  'title': 'Genetic economy in 598 picornaviruses: Foot-and-mouth disease virus replication exploits alternative precursor 599 cleavage pathways',
  'authors': [{'first': 'T', 'middle': [], 'last': 'Jackson', 'suffix': ''},
   {'first': 'T', 'middle': ['J'], 'last': 'Tuthill', 'suffix': ''},
   {'first': 'D', 'middle': ['J'], 'last': 'Rowlands', 'suffix': ''},
   {'first': 'N', 'middle': ['J'], 'last': 'Stonehouse', 'suffix': ''}],
  'year': 2017,
  'venue': 'PLOS Pathog',
  'volume': '13',
  'issn': '',
  'pages': '',
  'other_ids': {}},
 'BIBREF2': {'ref_id': 'b2',
  'title': 'A universal protocol to 602 generate consensus level genome sequences for foot-and-mouth disease virus and other 603 positive-sense polyadenylated RNA viruses using the Illumina MiSeq',
  'authors': [{'first': 'N',
    'middle': ['D'],
    'last': 'Sanderson',
    'suffix': ''},
   {'first': 'N', 'middle': ['J'], 'last': 'Knowles', 'suffix': ''},
   {'first': 'D', 'middle': ['P'], '

In [93]:
test_data['ref_entries']

{'FIGREF0': {'text': 'and-mouth disease virus (FMDV) is a single stranded positive sense RNA virus of the 45 genus Aphthovirus in the family Picornaviridae. It occurs as seven, antigenically diverse 46 serotypes; A, O, C, Asia 1, South African Territories (SAT) 1, 2 and 3. It is the causative agent 47 of foot-and-mouth disease (FMD), a highly contagious disease of cloven-hooved animals 48 affecting most notably cattle, pigs, sheep and goats in addition to wild species such as the 49 African buffalo. Disease outbreaks have serious economic implications resulting from trade 50 restrictions, reduced productivity and the slaughter of infected and at-risk animals (1). The 51 2001 outbreak in the UK caused economic losses of over £8 billion to the tourism and 52 agricultural sectors. Inactivated virus vaccines are used in countries in which FMD is endemic, 53 but these are often strain-specific and provide little cross protection between serotypes (2). 54 Antigenic variation together with th

In [94]:
test_data['back_matter']

[{'text': 'author/funder. All rights reserved. No reuse allowed without permission.The copyright holder for this preprint (which was not peer-reviewed) is the The copyright holder for this preprint (which was not peer-reviewed) is the . https://doi.org/10.1101/2020.01.10.901801 doi: bioRxiv preprint',
  'cite_spans': [],
  'ref_spans': [],
  'section': 'annex'}]

# Explore body_text section

In [96]:
test_text = test_data['body_text']
print(test_text[0])

{'text': 'VP3, and VP0 (which is further processed to VP2 and VP4 during virus assembly) (6). The P2 64 and P3 regions encode the non-structural proteins 2B and 2C and 3A, 3B (1-3) (VPg), 3C pro and 4 structural protein-coding region is replaced by reporter genes, allow the study of genome 68 replication without the requirement for high containment (9, 10) ( figure 1A ).', 'cite_spans': [], 'ref_spans': [{'start': 351, 'end': 360, 'text': 'figure 1A', 'ref_id': 'FIGREF50'}], 'section': ''}


In [98]:
print([type(t) for t in test_text])

[<class 'dict'>, <class 'dict'>, <class 'dict'>, <class 'dict'>, <class 'dict'>, <class 'dict'>, <class 'dict'>, <class 'dict'>, <class 'dict'>, <class 'dict'>, <class 'dict'>, <class 'dict'>, <class 'dict'>, <class 'dict'>, <class 'dict'>, <class 'dict'>, <class 'dict'>, <class 'dict'>, <class 'dict'>, <class 'dict'>]


In [99]:
print([t.keys() for t in test_text])

[dict_keys(['text', 'cite_spans', 'ref_spans', 'section']), dict_keys(['text', 'cite_spans', 'ref_spans', 'section']), dict_keys(['text', 'cite_spans', 'ref_spans', 'section']), dict_keys(['text', 'cite_spans', 'ref_spans', 'section']), dict_keys(['text', 'cite_spans', 'ref_spans', 'section']), dict_keys(['text', 'cite_spans', 'ref_spans', 'section']), dict_keys(['text', 'cite_spans', 'ref_spans', 'section']), dict_keys(['text', 'cite_spans', 'ref_spans', 'section']), dict_keys(['text', 'cite_spans', 'ref_spans', 'section']), dict_keys(['text', 'cite_spans', 'ref_spans', 'section']), dict_keys(['text', 'cite_spans', 'ref_spans', 'section']), dict_keys(['text', 'cite_spans', 'ref_spans', 'section']), dict_keys(['text', 'cite_spans', 'ref_spans', 'section']), dict_keys(['text', 'cite_spans', 'ref_spans', 'section']), dict_keys(['text', 'cite_spans', 'ref_spans', 'section']), dict_keys(['text', 'cite_spans', 'ref_spans', 'section']), dict_keys(['text', 'cite_spans', 'ref_spans', 'section'

In [105]:
'''
Combine disparate text sections of one JSON entry into unified text.
- body_text: list of JSON dictionaries; should represent 'body_text' section of JSON

Returns string of full paper text
'''
def get_paper_full_text(body_text):
    full_text = ''
    for section in body_text:
        text_to_add = section['text']
        full_text += text_to_add
    
    return full_text

In [106]:
paper_text0 = get_paper_full_text(test_text)

In [107]:
paper_text0

'VP3, and VP0 (which is further processed to VP2 and VP4 during virus assembly) (6). The P2 64 and P3 regions encode the non-structural proteins 2B and 2C and 3A, 3B (1-3) (VPg), 3C pro and 4 structural protein-coding region is replaced by reporter genes, allow the study of genome 68 replication without the requirement for high containment (9, 10) ( figure 1A ).The FMDV 5′ UTR is the largest known picornavirus UTR, comprising approximately 1300 71 nucleotides and containing several highly structured regions. The first 360 nucleotides at the 5′ 72 end are predicted to fold into a single large stem loop termed the S-fragment, followed by a The PKs were originally predicted in 1987 and consist of two to four tandem repeats of a ~48 86 nucleotide region containing a small stem loop and downstream interaction site (figure 1B) 87 (12). Due to the sequence similarity between the PKs (figure 1C), it is speculated that they 88 were formed by duplication events during viral replication, probably

# Get tf-idf of paper text 

In [115]:
'''
Get full text of every paper in list of JSON paper entries and combine that text into a list.
'''
def get_all_full_text(entry_list):
    text_list = []
    for entry in entry_list:
        body_text = entry['body_text']
        full_text = get_paper_full_text(body_text)
        text_list.append(full_text)
    assert(len(entry_list) == len(text_list))

    return text_list

In [116]:
# get full text of every paper and combine into list

# list 0
list0_text_list = get_all_full_text(data_list0)

# list 1
list1_text_list = get_all_full_text(data_list1)

# list 2
list2_text_list = get_all_full_text(data_list2)

# list 3
list3_text_list = get_all_full_text(data_list3)

In [117]:
# vectorize text
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
from sklearn.feature_extraction.text import TfidfVectorizer

In [125]:
vectorizer = TfidfVectorizer(stop_words='english')
X_list0 = vectorizer.fit_transform(list0_text_list)
print(vectorizer.get_feature_names())






In [119]:
print(X_list0.shape)

(885, 55702)


In [None]:
'''
Following few cells are derived from 
https://stackoverflow.com/questions/34449127/sklearn-tfidf-transformer-how-to-get-tf-idf-values-of-given-words-in-documen
'''

In [126]:
feature_names = vectorizer.get_feature_names()
doc = 0
feat_idx = X_list0[doc, :].nonzero()[1]
tfidf_scores = zip(feat_idx, [X_list0[doc, x] for x in feat_idx])

for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
    print(w, s)

vp3 0.021449889613240413
vp0 0.024631603589715615
processed 0.010217595866981217
vp2 0.019408868153730913
vp4 0.019408868153730913
virus 0.033740656171259015
assembly 0.021185147936894418
p2 0.01700199444495607
64 0.00881879085557694
p3 0.017874654836615534
regions 0.014039031106775176
encode 0.012879698404874398
non 0.005356327290654035
structural 0.017674229656502123
proteins 0.007314732475779156
2b 0.01570395740026562
2c 0.008874055440088423
3a 0.007572975615191563
3b 0.008125309437265566
vpg 0.021449889613240413
3c 0.00961655123793774
pro 0.023305953110921856
protein 0.006105974813528611
coding 0.010300605033361304
region 0.03456167337616826
replaced 0.011779259008971373
reporter 0.012651919400630832
genes 0.008341438682268056
allow 0.008623362138514633
study 0.008463167264277288
genome 0.007152658435764862
68 0.00912425228243547
replication 0.15213532822958198
requirement 0.025527883054122128
high 0.0044791748094111785
containment 0.012651919400630832
10 0.09048603107027313
figure

poor 0.020887043394381667
reads 0.03253566732502151
filtered 0.011652976555460928
225 0.012093107246894707
sickle 0.022224729880940777
algorithm 0.01990882098794493
host 0.014082709046520163
cell 0.012161927061248976
screen 0.012879698404874398
226 0.012000504832052143
assembled 0.023558518017942745
novo 0.013596177891011333
contigs 0.031722603855980754
idba 0.022224729880940777
ud 0.023223672726110683
35 0.006551763620543295
matched 0.012597235045841428
227 0.012000504832052143
library 0.011571269614771593
basic 0.008675579434753585
local 0.0078658405339655
alighnment 0.024631603589715615
search 0.010623167864714086
tool 0.009929156675288143
blast 0.012651919400630832
228 0.011910307948835882
consensus 0.011652976555460928
seqman 0.022224729880940777
software 0.007769950048713297
star 0.03293344794602121
lasergene 0.022224729880940777
13 0.004650950654517766
229 0.012046497355610605
package 0.009805571219787998
36 0.006737392374101515
data 0.00801008687605453
largely 0.019466955194036

In [127]:
list0_df = pd.DataFrame(X_list0.toarray(), columns=vectorizer.get_feature_names())

# See if we can represent tf-idf information in a meaningful way 

In [128]:
list0_df.head()

Unnamed: 0,00,000,0000,000000,000001,00001,000011,000013621,00002,00006181,...,௩is,ṡrepresent,ẋwhere,ℓ1,九省通衢,感冒,新型冠状病毒,春运,武汉爆发,武汉肺
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.017851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.00538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [132]:
sorted(list0_df.columns)

In [130]:
list0_df.shape

(885, 55409)

In [133]:
max(list0_df['aches'])

0.027320801427369715

In [134]:
max(list0_df['acidity'])

0.013455387658354731

# Test: get 3 largest tf-idf words in each document 

In [135]:
'''
Get words with top tfidf values in each document
'''
paper_rank_df_0 = list0_df.T

In [137]:
paper_rank_df_0.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,875,876,877,878,879,880,881,882,883,884
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008085,0.0,0.0,...,0.0,0.0,0.037268,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.017851,0.00538,0.0,0.0,0.003805,0.012282,0.0,0.0,0.0,...,0.0,0.003975,0.013408,0.006682,0.06023,0.006667,0.00204,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [145]:
list(paper_rank_df_0.nlargest(3, 1).index)

# Get top 20 words from each document and keep running tally 

In [148]:
'''
Essentially, we're getting the words that are the most important in the most documents.
Keep a running count of words that appear in each top-20 list that we generate.
'''
tfidf0 = {}
for col in paper_rank_df_0.columns:
    highest_tfidf_20 = list(paper_rank_df_0.nlargest(20, col).index)
    for word in highest_tfidf_20:
        if word not in tfidf0.keys():
            tfidf0[word] = 0
        else:
            tfidf0[word] += 1

In [149]:
tfidf0

{'pks': 0,
 'pk': 1,
 '901801': 0,
 'fmdv': 0,
 'replicons': 0,
 'replicon': 2,
 'replication': 13,
 'c11': 0,
 'pbluescript': 0,
 '01': 19,
 '10': 33,
 'doi': 96,
 'preprint': 246,
 'ptgfp': 0,
 'reuse': 31,
 'rights': 25,
 'permission': 21,
 'reserved': 20,
 'aatii': 0,
 'poly': 3,
 'hubei': 35,
 'cities': 25,
 'vulnerability': 4,
 'healthcare': 4,
 'resources': 1,
 'vulnerable': 1,
 'wuhan': 71,
 'epidemic': 71,
 'ncov': 83,
 'idi': 0,
 'public': 8,
 'xinyang': 1,
 'january': 35,
 'china': 39,
 'nhcc': 0,
 '2020': 109,
 'province': 22,
 'health': 10,
 'wu': 0,
 'localized': 0,
 'ibv': 8,
 'ampseq': 0,
 'samples': 19,
 'reads': 30,
 'minion': 4,
 'sequencing': 21,
 'isolates': 6,
 '634600': 0,
 'lineage': 2,
 'sample': 4,
 'genotypes': 5,
 'qpcr': 8,
 'rt': 17,
 'ga98': 0,
 'giv': 0,
 'lineages': 3,
 'ga08': 0,
 'sanger': 0,
 'l1': 1,
 'read': 9,
 'nvis': 0,
 'nipah': 0,
 'compounds': 7,
 'nvik': 0,
 'niv': 2,
 'fda': 2,
 'inhibitors': 4,
 'enamine': 0,
 'physicochemical': 0,
 'virus

In [160]:
# sort tfidf0
sorted_tfidf0 = {k: v for k, v in sorted(tfidf0.items(), key=lambda item: item[1], reverse=True)}

In [161]:
sorted_tfidf0

{'preprint': 246,
 'medrxiv': 238,
 'cases': 143,
 'fig': 136,
 'cov': 136,
 'covid': 136,
 'patients': 124,
 'sars': 119,
 'license': 112,
 '2020': 109,
 'doi': 96,
 'cells': 93,
 'perpetuity': 89,
 '19': 87,
 'ncov': 83,
 'granted': 80,
 'model': 78,
 'et': 76,
 'al': 73,
 'wuhan': 71,
 'epidemic': 71,
 '2019': 70,
 'display': 55,
 'protein': 55,
 'rna': 54,
 'number': 52,
 'transmission': 48,
 'cell': 42,
 'individuals': 40,
 'china': 39,
 'genes': 39,
 'binding': 39,
 'expression': 39,
 'viral': 38,
 'infected': 37,
 'virus': 36,
 'mice': 36,
 'hubei': 35,
 'january': 35,
 '03': 34,
 '10': 33,
 'days': 33,
 'ace2': 32,
 'reuse': 31,
 'data': 31,
 'figure': 31,
 'reads': 30,
 'outbreak': 29,
 'nc': 29,
 'host': 29,
 'severe': 29,
 'nd': 28,
 'age': 28,
 'sequences': 27,
 'clinical': 26,
 'infection': 26,
 'rights': 25,
 'cities': 25,
 'population': 25,
 'genome': 25,
 'period': 25,
 'estimates': 25,
 'time': 25,
 'species': 25,
 'onset': 25,
 'viruses': 23,
 'rate': 23,
 'rbd': 23,


# Explore metadata.csv 

In [55]:
metadata_df = pd.read_csv(os.path.join(data_path, "metadata.csv"))

In [56]:
metadata_df.shape

(44220, 15)

In [58]:
metadata_df.columns

Index(['sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id', 'license',
       'abstract', 'publish_time', 'authors', 'journal',
       'Microsoft Academic Paper ID', 'WHO #Covidence', 'has_full_text',
       'full_text_file'],
      dtype='object')

In [59]:
metadata_df.head()

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file
0,,Elsevier,Intrauterine virus infections and congenital h...,10.1016/0002-8703(72)90077-4,,4361535.0,els-covid,Abstract The etiologic basis for the vast majo...,1972-12-31,"Overall, James C.",American Heart Journal,,,False,custom_license
1,,Elsevier,Coronaviruses in Balkan nephritis,10.1016/0002-8703(80)90355-5,,6243850.0,els-covid,,1980-03-31,"Georgescu, Leonida; Diosi, Peter; Buţiu, Ioan;...",American Heart Journal,,,False,custom_license
2,,Elsevier,Cigarette smoking and coronary heart disease: ...,10.1016/0002-8703(80)90356-7,,7355701.0,els-covid,,1980-03-31,"Friedman, Gary D",American Heart Journal,,,False,custom_license
3,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,Clinical and immunologic studies in identical ...,10.1016/0002-9343(73)90176-9,,4579077.0,els-covid,"Abstract Middle-aged female identical twins, o...",1973-08-31,"Brunner, Carolyn M.; Horwitz, David A.; Shann,...",The American Journal of Medicine,,,True,custom_license
4,,Elsevier,Epidemiology of community-acquired respiratory...,10.1016/0002-9343(85)90361-4,,4014285.0,els-covid,Abstract Upper respiratory tract infections ar...,1985-06-28,"Garibaldi, Richard A.",The American Journal of Medicine,,,False,custom_license


In [60]:
metadata_df['WHO #Covidence'].unique()

array([nan, '#1985', '#3329', ..., '#867', '#1600', '#5723'], dtype=object)

In [62]:
len(metadata_df['journal'].unique())

3946

In [70]:
len([j for j in metadata_df['journal']])

44220

In [72]:
type([j for j in metadata_df['journal']][0])

str

In [73]:
'American' in metadata_df['journal'][0]

True

In [76]:
set([type(j) for j in metadata_df['journal']])

{float, str}

In [80]:
[j for j in metadata_df['journal'] if type(j) == float]

[nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan

In [79]:
[j for j in metadata_df['journal'] if type(j) == str and 'American' not in j]

['Analytical Biochemistry',
 'Archives of Biochemistry and Biophysics',
 'Biochimica et Biophysica Acta (BBA) - Biomembranes',
 'Biochemical and Biophysical Research Communications',
 'Biochemical and Biophysical Research Communications',
 'Biochemical Pharmacology',
 'Biochemical Pharmacology',
 'Biochemical Pharmacology',
 'Biochemical Pharmacology',
 'Biochemical Pharmacology',
 'Biochemical Pharmacology',
 'Biological Psychiatry',
 'Brain Research',
 'Brain Research',
 'Brain Research',
 'Brain Research',
 'British Journal of Diseases of the Chest',
 'British Veterinary Journal',
 'British Veterinary Journal',
 'British Veterinary Journal',
 'British Veterinary Journal',
 'British Veterinary Journal',
 'British Veterinary Journal',
 'British Veterinary Journal',
 'British Veterinary Journal',
 'British Veterinary Journal',
 'British Veterinary Journal',
 'British Veterinary Journal',
 'British Veterinary Journal',
 'British Veterinary Journal',
 'British Veterinary Journal',
 'Brit

In [71]:
print([j for j in metadata_df['journal'] if 'American' not in j])

TypeError: argument of type 'float' is not iterable