In [1]:
import csv
import json
import sqlite3
from datetime import datetime

In [2]:
# Location of the SQLite file, relative to the notebook's working directory.
DB_PATH = '../databases/CovidData.db'

# One connection and one cursor are shared by every cell below.
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

In [3]:
# Schema for the three tables populated by the loader cell.
# Every statement uses IF NOT EXISTS, so re-running this cell is harmless, but
# it also means an already-existing table is NOT altered to match the DDL here.

# One row per publication, keyed by the article id.
create_research_table_sql = """
    CREATE TABLE IF NOT EXISTS covid_research (
        id TEXT PRIMARY KEY,
        source TEXT,
        pub_date TEXT,
        pub_timestamp TEXT,
        title TEXT,
        link TEXT,
        authors TEXT,
        authors_affiliations TEXT,
        keywords TEXT
    )
"""

# One row per (article, keyword) pair.
# The key must span both columns: with `id` alone as the primary key, the
# loader's INSERT OR IGNORE keeps only the first keyword of each article and
# silently discards the rest.
# NOTE: a database created with the old single-column key keeps that key until
# the table is dropped (or the database file is recreated).
create_keywords_table_sql = """
    CREATE TABLE IF NOT EXISTS covid_research_keywords (
        id TEXT,
        keyword TEXT,
        PRIMARY KEY (id, keyword)
    )
"""

# One abstract per article, keyed by the article id.
create_abstracts_table_sql = """
    CREATE TABLE IF NOT EXISTS covid_research_abstracts (
        id TEXT PRIMARY KEY,
        abstract TEXT
    )
"""

cursor.execute(create_research_table_sql)
cursor.execute(create_keywords_table_sql)
cursor.execute(create_abstracts_table_sql)

<sqlite3.Cursor at 0x1e124eea8c0>

In [4]:
# Separator used when a list of values is flattened into one TEXT column.
list_delimiter = ', '

# Input files, relative to the notebook's working directory.
SOURCES_JSON_PATH = '../raw_data/combined_sources.json'
STOPWORDS_CSV_PATH = 'keyword_stopwords.csv'

# Positional inserts: the value order must match the column order of the
# CREATE TABLE statements. OR IGNORE skips a row that violates a constraint
# (for example an already-present primary key) instead of raising.
INSERT_RESEARCH_SQL = "INSERT OR IGNORE INTO covid_research VALUES(?,?,?,?,?,?,?,?,?)"
INSERT_KEYWORD_SQL = "INSERT OR IGNORE INTO covid_research_keywords VALUES(?,?)"
INSERT_ABSTRACT_SQL = "INSERT OR IGNORE INTO covid_research_abstracts VALUES(?,?)"


def load_stopwords(path):
    """Read the stop-word CSV once and return all of its cells as a set.

    Cells are kept exactly as they appear in the file (no stripping or
    lower-casing); only the candidate keyword is normalised before lookup.
    Collecting every row into one set means a keyword is rejected if it
    appears in ANY row, not merely in the row currently being iterated.
    """
    with open(path, 'r', newline='') as file:
        return {cell for row in csv.reader(file, delimiter=',') for cell in row}


def normalize_keyword(keyword):
    """Return the form of ``keyword`` that is compared against the stop-words."""
    return keyword.strip().lower().replace('-', '')


def publication_date_fields(date_string, date_format):
    """Parse ``date_string`` and return ``(pub_date, pub_timestamp)`` as strings.

    Raises ValueError if ``date_string`` does not match ``date_format``.
    """
    parsed = datetime.strptime(date_string, date_format)
    # NOTE(review): `parsed` is a naive datetime, so timestamp() interprets it
    # in the local timezone of the machine running the notebook (even for the
    # Doaj format, whose trailing "Z" suggests UTC). Confirm whether
    # downstream consumers rely on this before changing it.
    return str(parsed), str(parsed.timestamp())


def join_text(value):
    """Flatten ``value`` into one string using ``list_delimiter``.

    A plain string is returned unchanged: passing it to ``str.join`` would
    insert the delimiter between every character.
    """
    if isinstance(value, str):
        return value
    return list_delimiter.join(value)


def insert_keywords(item_id, keywords, stopwords):
    """Insert each keyword whose normalised form is not in ``stopwords``."""
    for keyword in keywords:
        if normalize_keyword(keyword) not in stopwords:
            cursor.execute(INSERT_KEYWORD_SQL, (item_id, keyword))


def store_europe_pmc(item):
    """Insert a Europe PMC record; only the main research row is written."""
    pub_date, pub_timestamp = publication_date_fields(item['date'], '%Y-%m-%d')
    cursor.execute(INSERT_RESEARCH_SQL, (
        item['id'],
        item['source'],
        pub_date,
        pub_timestamp,
        item['title'],
        item['link'],
        item['authors'],
        None,  # authors_affiliations: not populated for this source
        None,  # keywords: not populated for this source
    ))


def store_doaj(item, stopwords):
    """Insert a Doaj record: keyword rows, the research row and the abstract."""
    pub_date, pub_timestamp = publication_date_fields(
        item['date'], '%Y-%m-%dT%H:%M:%SZ')

    authors = [author.get('name', "") for author in item['authors']]
    affiliations = [author.get('affiliation', "") for author in item['authors']]
    # Collected exactly once per article; a missing or null value is treated
    # as "no keywords" rather than raising.
    keywords = list(item.get('keywords') or [])

    insert_keywords(item['id'], keywords, stopwords)

    cursor.execute(INSERT_RESEARCH_SQL, (
        item['id'],
        item['source'],
        pub_date,
        pub_timestamp,
        item['title'],
        list_delimiter.join([link['url'] for link in item['link']]),
        list_delimiter.join(authors),
        list_delimiter.join(affiliations),
        list_delimiter.join(keywords),
    ))

    cursor.execute(INSERT_ABSTRACT_SQL, (
        item['id'],
        item['abstract'],
    ))


def pubmed_authors(raw_authors):
    """Return ``(names, affiliations)`` extracted from a PubMed author list.

    Anything other than a list yields two empty lists. ``AffiliationInfo``
    may be either a list of dicts or a single dict; empty values are skipped.
    """
    names = []
    affiliations = []
    if not isinstance(raw_authors, list):
        return names, affiliations

    for author in raw_authors:
        info = author.get('AffiliationInfo')
        if info:
            if isinstance(info, list):
                affiliations.extend(
                    entry.get('Affiliation', '') for entry in info)
            elif isinstance(info, dict):
                affiliations.append(info.get('Affiliation', ''))
        names.append(
            f"{author.get('ForeName', '')} {author.get('LastName', '')}")
    return names, affiliations


def pubmed_keywords(raw_keywords):
    """Return the ``#text`` of each PubMed keyword.

    ``raw_keywords`` may be a single dict or a list of dicts; a missing or
    null value yields an empty list.
    """
    if isinstance(raw_keywords, dict):
        return [raw_keywords['#text']]
    return [keyword['#text'] for keyword in raw_keywords or []]


def store_pubmed(item, stopwords):
    """Insert a PubMed record: keyword rows, the research row and the abstract."""
    date_parts = item['date']
    pub_date, pub_timestamp = publication_date_fields(
        f"{date_parts['Year']}-{date_parts['Month']}-{date_parts['Day']}",
        '%Y-%m-%d')

    authors, affiliations = pubmed_authors(item['authors'])
    keywords = pubmed_keywords(item.get('keywords'))

    insert_keywords(item['id'], keywords, stopwords)

    # The title is either a plain value or a dict carrying it under '#text'.
    title = item['title']
    if isinstance(title, dict):
        title = title['#text']

    cursor.execute(INSERT_RESEARCH_SQL, (
        item['id'],
        item['source'],
        pub_date,
        pub_timestamp,
        title,
        item['link'],
        list_delimiter.join(authors),
        list_delimiter.join(affiliations),
        list_delimiter.join(keywords),
    ))

    cursor.execute(INSERT_ABSTRACT_SQL, (
        item['id'],
        join_text(item['abstract']),
    ))


# JSON text is UTF-8 by specification; do not depend on the platform default.
with open(SOURCES_JSON_PATH, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Parsed once here instead of once per article / per keyword.
stopwords = load_stopwords(STOPWORDS_CSV_PATH)

for item in data:
    source = item['source']
    if source == 'Europe PMC':
        store_europe_pmc(item)
    elif source == 'Doaj':
        store_doaj(item, stopwords)
    elif source == 'PubMed':
        store_pubmed(item, stopwords)

In [5]:
# Delete keyword rows whose text exactly equals one of the listed literals.
# The list spells out several case, quoting and hyphen-character variants of
# the same COVID / pandemic terms individually.
delete_generic_keywords_sql = """DELETE FROM covid_research_keywords WHERE keyword IN ('Pandemic', 'pandemic', 'COVID-19', 'SARS-CoV-2', 'covid-19', 'Pandemics', '“COVID-19”', 'covid‐19', 'COVID‐19', 'SARS‐CoV‐2', 'SARS-CoV-2 pandemic', 'COVID-19 or SARS-CoV-2', 'pandemie', 'Coronavirus')"""

cursor.execute(delete_generic_keywords_sql)

<sqlite3.Cursor at 0x1e124eea8c0>

In [6]:
# Persist the inserts and deletes issued through `cursor` in the cells above.
conn.commit()
# Close the connection; `conn` and `cursor` are unusable afterwards, so the
# connection cell must be re-run before executing any further SQL.
conn.close()