In [90]:
import sqlite3
import json

In [91]:
# Location of the SQLite database file, relative to the notebook's working directory.
DB_PATH = '../databases/CovidData.db'

# One shared connection and cursor are used by every cell below.
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

In [92]:

# Schema for the two tables this notebook fills. Both statements use
# IF NOT EXISTS, so re-running the cell leaves already-created tables untouched.

# Article table: `id` is the primary key; every other field is stored as TEXT.
research_table_ddl = '''
    CREATE TABLE IF NOT EXISTS covid_research (
        id TEXT PRIMARY KEY,
        source TEXT,
        pub_date TEXT,
        pub_timestamp TEXT,
        title TEXT,
        link TEXT,
        authors TEXT,
        authors_affiliations TEXT,
        keywords TEXT
    )
'''

# Keyword table: integer primary key plus keyword, article_id and count columns.
keywords_table_ddl = '''
    CREATE TABLE IF NOT EXISTS covid_research_keywords (
        id INTEGER PRIMARY KEY,
        keyword TEXT,
        article_id TEXT,
        count INTEGER
    )
'''

cursor.execute(research_table_ddl)
cursor.execute(keywords_table_ddl)

<sqlite3.Cursor at 0x1eb55fd2240>

In [93]:
from datetime import datetime


def fetch_keywords_by_value(keyword):
    """Return all covid_research_keywords rows whose keyword contains ``keyword``.

    The search term is wrapped in ``%`` wildcards and bound as a query
    parameter to a ``LIKE`` comparison, so it matches as a substring.
    The query runs on the notebook-level ``cursor``.

    Args:
        keyword: Text to look for inside the ``keyword`` column.

    Returns:
        The list produced by ``cursor.fetchall()`` (empty when nothing matches).
    """
    like_pattern = "%{}%".format(keyword)
    matches = cursor.execute(
        "SELECT * FROM covid_research_keywords WHERE keyword LIKE ?",
        (like_pattern,),
    )
    return matches.fetchall()


# Delimiter used to flatten list-valued fields into a single TEXT column.
list_delimiter = ', '

# Keywords that are not written to covid_research_keywords. They are still
# kept in the `keywords` column of the article row.
keywords_stopwords = ['COVID', 'COVID-19', 'SARS-CoV-2', 'covid-19', 'Covid-19', 'covid-19 pandemic',
                      'SARS-COV-2', 'кластерний аналіз', 'нейронна мережа', 'машинне навчання', 'епідемічний процес']

# Column lists are spelled out so each statement supplies exactly as many
# values as it names. covid_research_keywords has four columns (id, keyword,
# article_id, count); a bare three-value VALUES(...) against it is rejected by
# SQLite. `count` is left unset here.
# NOTE(review): `id` is always passed as None and the table declares no
# uniqueness constraint on (keyword, article_id), so OR IGNORE does not
# de-duplicate keyword rows when this cell is re-run -- confirm intent.
KEYWORD_INSERT_SQL = (
    "INSERT OR IGNORE INTO covid_research_keywords (id, keyword, article_id) "
    "VALUES(?,?,?)"
)
ARTICLE_INSERT_SQL = (
    "INSERT OR IGNORE INTO covid_research "
    "(id, source, pub_date, pub_timestamp, title, link, authors, "
    "authors_affiliations, keywords) VALUES(?,?,?,?,?,?,?,?,?)"
)


def _doaj_fields(item):
    """Return (published, link, authors, affiliations, keywords) for a Doaj item."""
    # The trailing 'Z' is matched as a literal character, so this is a naive datetime.
    published = datetime.strptime(item['date'], '%Y-%m-%dT%H:%M:%SZ')
    link = list_delimiter.join([entry['url'] for entry in item['link']])
    authors = [author.get('name', "") for author in item['authors']]
    affiliations = [author.get('affiliation', "") for author in item['authors']]
    # .get keeps an item without keywords from aborting the whole import,
    # matching how the PubMed items are read.
    keywords = item.get('keywords', [])
    return published, link, authors, affiliations, keywords


def _pubmed_fields(item):
    """Return (published, link, authors, affiliations, keywords) for a PubMed item."""
    date_parts = item['date']
    published = datetime.strptime(
        f"{date_parts['Year']}-{date_parts['Month']}-{date_parts['Day']}", '%Y-%m-%d')

    authors = []
    affiliations = []
    # Authors are only read when they come as a list; any other shape yields
    # empty author/affiliation fields.
    if isinstance(item['authors'], list):
        for author in item['authors']:
            affiliation_info = author.get('AffiliationInfo')
            if affiliation_info:
                # AffiliationInfo is either a list of dicts or a single dict.
                if isinstance(affiliation_info, list):
                    affiliations.extend(
                        entry.get('Affiliation', '') for entry in affiliation_info)
                elif isinstance(affiliation_info, dict):
                    affiliations.append(affiliation_info.get('Affiliation', ''))
            authors.append(
                f"{author.get('ForeName', '')} {author.get('LastName', '')}")

    keywords = [keyword['#text'] for keyword in item.get('keywords', [])]
    return published, item['link'], authors, affiliations, keywords


def _store_item(item, published, link, authors, affiliations, keywords):
    """Insert the non-stopword keyword rows and then the article row for one item."""
    for keyword in keywords:
        if keyword not in keywords_stopwords:
            cursor.execute(KEYWORD_INSERT_SQL, (None, keyword, item['id']))

    # NOTE(review): `published` is naive, so datetime.timestamp() interprets it
    # in the machine's local timezone -- confirm whether UTC was intended.
    cursor.execute(ARTICLE_INSERT_SQL, (
        item['id'],
        item['source'],
        str(published),
        str(datetime.timestamp(published)),
        item['title'],
        link,
        list_delimiter.join(authors),
        list_delimiter.join(affiliations),
        list_delimiter.join(keywords),
    ))


# Maps a source name to the function that extracts its fields.
field_extractors = {'Doaj': _doaj_fields, 'PubMed': _pubmed_fields}

# Read the JSON explicitly as UTF-8 (the platform default may differ) and
# close the file as soon as it is parsed.
with open('../data/combined_sources.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

for item in data:
    extract_fields = field_extractors.get(item['source'])
    if extract_fields is None:
        # Items from any other source are skipped.
        continue
    _store_item(item, *extract_fields(item))

In [94]:
# Persist the pending changes made through `cursor`; this must happen before
# the connection is closed.
conn.commit()
# Release the database connection. `conn` and `cursor` cannot be used after
# this point, so the connection cell has to be re-run before any cell that
# touches the database.
conn.close()