In [None]:
# !pip install pymongo # please comment this line before submission
# import all libs (do not change)
# @author Evan Litzer
from pymongo import MongoClient
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import json
import pymongo
import pprint
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor

In [177]:
# fill in uri (5pts)
# Connection string for the MongoDB instance this notebook talks to.
uri = "mongodb://localhost:27017/"

# Build the client, declaring server API version '1' for every command it sends.
client = MongoClient(uri, server_api=ServerApi('1'))

# Round-trip a ping against the admin database to confirm the server is reachable.
# A failure is only reported (not re-raised), so the notebook keeps running.
try:
    capture = client.admin.command('ping')
except Exception as e:
    print(e)
else:
    print("Pinged your deployment. You successfully connected to MongoDB!", capture)

Pinged your deployment. You successfully connected to MongoDB! {'ok': 1.0}


In [178]:
# database (do not change)
# Select the 'test' database on the connected client.
db = client['test']

# Drop any existing 'litcovidtest' collection so that re-running the notebook
# starts from an empty collection instead of inserting the documents again.
if 'litcovidtest' in db.list_collection_names():
    db['litcovidtest'].drop()

# Handle to the 'litcovidtest' collection; every later cell queries through `posts`.
posts = db['litcovidtest']



In [179]:
# Load the JSON corpus from disk.
# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding, which can fail or silently corrupt non-ASCII
# text (author names, titles) when the file is UTF-8.
with open('data/litcovid2BioCJSON_small.json', encoding='utf-8') as file:
    file_data = json.load(file)

# fill in (5 pts)
# Insert the loaded data into the collection:
#   - a JSON array (Python list) holds many documents -> insert_many
#   - anything else is treated as a single document    -> insert_one
if isinstance(file_data, list):
    posts.insert_many(file_data)
else:
    posts.insert_one(file_data)


In [180]:
# Count the number of documents in this corpus
# fill in (10 pts)
# An empty filter matches every document in the collection.
match_all = {}
result1 = posts.count_documents(match_all)
print("Count the number of documents in this corpus", result1)

Count the number of documents in this corpus 1000


In [181]:
# find the fields for the first document in this corpus
# fill in (10 pts)
# An empty filter makes find_one return the first document the server yields.
result2 = posts.find_one({})

# Pretty-print the whole document so its (nested) field names are visible.
pprint.pprint(result2)

{'_id': '32911311|PMC7832150',
 'authors': ['Gupta NA', 'Lien C', 'Iv M'],
 'id': '7832150',
 'infons': {},
 'journal': 'Clin Imaging',
 'passages': [{'infons': {'article-id_doi': '10.1016/j.clinimag.2020.08.029',
                          'article-id_pii': 'S0899-7071(20)30332-6',
                          'article-id_pmc': '7832150',
                          'article-id_pmid': '32911311',
                          'authors': 'Gupta NA, Lien C, Iv M, ',
                          'fpage': '239',
                          'journal': 'Clin Imaging; 2020 Dec ; 68 239-241. '
                                     'doi:10.1016/j.clinimag.2020.08.029',
                          'kwd': 'Cerebral microbleeds COVID-19 Neuroimaging',
                          'license': 'Since January 2020 Elsevier has created '
                                     'a COVID-19 resource centre with free '
                                     'information in English and Mandarin on '
                               

In [182]:
# Count the number of publications for each journal. Sort the result in
# descending order and print journals with more than 4 publications

# fill in (10 pts)
# Stage 1: one output row per distinct journal value, with its document count.
group_by_journal = {"$group": {"_id": "$journal", "count": {"$sum": 1}}}
# Stage 2: keep only journals with strictly more than 4 publications.
more_than_four = {"$match": {"count": {"$gt": 4}}}
# Stage 3: largest counts first.
by_count_desc = {"$sort": {"count": -1}}

result3 = posts.aggregate([group_by_journal, more_than_four, by_count_desc])

for journal_row in result3:
    pprint.pprint(journal_row)

{'_id': '', 'count': 137}
{'_id': 'BMJ', 'count': 12}
{'_id': 'Nature', 'count': 10}
{'_id': 'PLoS One', 'count': 9}
{'_id': 'Cureus', 'count': 7}
{'_id': 'Sci Rep', 'count': 6}
{'_id': 'Gastroenterology', 'count': 5}
{'_id': 'N Engl J Med', 'count': 5}
{'_id': 'Am J Infect Control', 'count': 5}


In [183]:
# Find all papers published in PLoS One journal. Print their years and titles
# fill in (10 pts)
plos_one_filter = {"journal": "PLoS One"}
result4 = posts.find(plos_one_filter)

for paper in result4:
    # The first passage is read for both values: its 'infons' carries the
    # year and its 'text' is printed as the title.
    first_passage = paper['passages'][0]
    pprint.pprint(first_passage['infons']['year'])
    pprint.pprint(first_passage['text'])

'2021'
('Sex differences in susceptibility, severity, and outcomes of coronavirus '
 'disease 2019: Cross-sectional analysis from a diverse US metropolitan area')
'2021'
('Correction: Comparative analysis of various clinical specimens in detection '
 'of SARS-CoV-2 using rRT-PCR in new and follow up cases of COVID-19 '
 'infection: Quest for the best choice.')
'2020'
"A synthetic indicator on the impact of COVID-19 on the community's health."
'2021'
'SARS-CoV-2 infection in asymptomatic healthcare workers at a clinic in Chile'
'2021'
('Are older adults of Rohingya community (Forcibly Displaced Myanmar Nationals '
 'or FDMNs) in Bangladesh fearful of COVID-19? Findings from a cross-sectional '
 'study')
'2021'
('Closed for business: The mortality impact of business closures during the '
 'Covid-19 pandemic')
'2020'
('Persistence of symptoms and quality of life at 35 days after hospitalization '
 'for COVID-19 infection')
'2021'
('Factors indicating intention to vaccinate with a COVID-19

In [184]:
# Count the number of publications for each author. Sort the results in
# descending order and return authors with 5 or more publications
# fill in (10 pts)
author_pipeline = [
    # Emit one document per entry of the 'authors' array.
    {"$unwind": "$authors"},
    # Tally how many documents each author name appears in.
    {"$group": {"_id": "$authors", "count": {"$sum": 1}}},
    # Keep authors with at least 5 publications.
    {"$match": {"count": {"$gte": 5}}},
    # Most prolific authors first.
    {"$sort": {"count": -1}},
]
result5 = posts.aggregate(author_pipeline)

for author_row in result5:
    pprint.pprint(author_row)

{'_id': 'Zhang Y', 'count': 12}
{'_id': 'Wang J', 'count': 8}
{'_id': 'Wang S', 'count': 7}
{'_id': 'Zhang J', 'count': 7}
{'_id': 'Li X', 'count': 7}
{'_id': 'Yang Y', 'count': 7}
{'_id': 'Zhang L', 'count': 7}
{'_id': 'Li Y', 'count': 7}
{'_id': 'Li H', 'count': 6}
{'_id': 'Zhang X', 'count': 6}
{'_id': 'Wang Z', 'count': 6}
{'_id': 'Liu Y', 'count': 6}
{'_id': 'Li J', 'count': 6}
{'_id': 'Zheng Y', 'count': 5}
{'_id': 'Shi Y', 'count': 5}
{'_id': 'Liu J', 'count': 5}
{'_id': 'Wang X', 'count': 5}
{'_id': 'Wang Y', 'count': 5}


In [185]:
# Find the papers co-written by 'Wang J' and 'Zhang L', print the paper pmids,
# journal names and titles
# fill in (10 pts)
# $all requires the 'authors' array to contain every listed name.
coauthor_filter = {"authors": {"$all": ["Wang J", "Zhang L"]}}
result6 = posts.find(coauthor_filter)

for paper in result6:
    pprint.pprint(paper['pmid'])
    pprint.pprint(paper['journal'])
    # The text of the first passage is printed as the paper title.
    pprint.pprint(paper['passages'][0]['text'])

34124189
'Front Cardiovasc Med'
('Early vs. Late Onset Cardiac Injury and Mortality in Hospitalized COVID-19 '
 'Patients in Wuhan.')
34184314
'Echocardiography'
('Incremental prognostic value of biventricular longitudinal strain and '
 'high-sensitivity troponin I in COVID-19 patients.')


In [186]:
# Create text index on passages.text
# fill in
# The $text queries in the following cells rely on this index.
# create_index returns the index name; leaving the call as the cell's last
# expression displays that name as the cell output.
posts.create_index([("passages.text", pymongo.TEXT)])

'passages.text_text'

In [187]:
# count the number of publications that contains the phrase "COVID-19 Vaccine"
# fill in (10 pts)
# The double quotes inside the search string ask $text for a phrase match
# rather than a match on the individual words.
phrase_filter = {"$text": {"$search": '"COVID-19 Vaccine"'}}
result7 = posts.count_documents(phrase_filter)
print("Count the number of publications that contains the phrase 'COVID-19 Vaccine'", result7)

Count the number of publications that contains the phrase 'COVID-19 Vaccine' 46


In [188]:
# count the number of publications that contains the words "COVID-19" or "Sars-CoV-2"
# fill in (10 pts)
# Unquoted terms in a $text search are OR-ed together.
# NOTE(review): $text presumably treats the hyphens as delimiters, so this may
# also match documents containing only a fragment such as "19" or "2" - confirm
# against the MongoDB $text documentation if an exact count is required.
either_term_filter = {"$text": {"$search": "COVID-19 Sars-CoV-2"}}
result8 = posts.count_documents(either_term_filter)
print("Count the number of publications that contains the words 'COVID-19' or 'Sars-CoV-2'", result8)

Count the number of publications that contains the words 'COVID-19' or 'Sars-CoV-2' 957


In [189]:
# count the number of publications that contains the words "COVID-19" and "Sars-CoV-2"
# fill in (10 pts)
# Each term is wrapped in its own pair of double quotes, making it a phrase;
# a document has to contain both phrases to be counted.
both_terms_filter = {"$text": {"$search": '"COVID-19" "Sars-CoV-2"'}}
result9 = posts.count_documents(both_terms_filter)
print("Count the number of publications that contains the words 'COVID-19' and 'Sars-CoV-2'", result9)

Count the number of publications that contains the words 'COVID-19' and 'Sars-CoV-2' 384
