### Assignment 2.1

In [12]:
import json
from pathlib import Path
import os

import pandas as pd


def read_cluster_csv(file_path):
    """Load the CSV at ``file_path`` into a pandas DataFrame.

    Thin wrapper around ``pandas.read_csv`` called with default options.
    """
    frame = pd.read_csv(file_path)
    return frame


class KVDB(object):
    """Minimal key-value store persisted as one JSON file.

    Keys are converted with ``str()`` on both read and write, so any object
    with a stable string form (ints, tuples, ...) can be used as a key.
    Values must be JSON-serializable for ``save`` to succeed.
    """

    def __init__(self, db_path):
        """Open the store at ``db_path``, loading its contents if the file exists."""
        self._db_path = Path(db_path)
        self._db = {}
        self._load_db()

    def _load_db(self):
        """Replace the in-memory dict with the file's contents, if the file exists."""
        if self._db_path.exists():
            with open(self._db_path, encoding='utf-8') as f:
                self._db = json.load(f)

    def get_value(self, key):
        """Return the value stored under ``str(key)``, or ``None`` if absent."""
        return self._db.get(str(key))

    def set_value(self, key, value):
        """Store ``value`` under ``str(key)`` in memory; call ``save`` to persist."""
        self._db[str(key)] = value

    def save(self):
        """Write the whole store to ``db_path`` as indented JSON.

        Raises:
            TypeError: if a stored value is not JSON-serializable. The file on
                disk is left untouched in that case.
        """
        # Serialize first: opening with 'w' truncates the file, so a failure
        # while dumping directly into it would destroy the previous contents.
        payload = json.dumps(self._db, indent=2)
        with open(self._db_path, 'w', encoding='utf-8') as f:
            f.write(payload)


def create_sites_kvdb():
    """Write ``sites.json``: one entry per ``site_id`` from the site CSV.

    The output directory is created if needed. Rows are grouped by
    ``site_id`` and the first row of each group is stored as a dict.
    """
    out_dir = Path('dsc650/assignments/assignment02/results/kvdb').resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    sites_db = KVDB(out_dir.joinpath('sites.json'))
    sites = read_cluster_csv('dsc650/data/external/tidynomicon/site.csv')
    for site_id, rows in sites.groupby('site_id'):
        # Only the first record of each group is kept.
        first_record = rows.to_dict(orient='records')[0]
        sites_db.set_value(site_id, first_record)
    sites_db.save()


def create_people_kvdb():
    """Write ``people.json``: one entry per row of the person CSV.

    Each row is stored as a dict under its ``person_id``. The output
    directory is created if needed.
    """
    out_dir = Path('dsc650/assignments/assignment02/results/kvdb').resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    people_db = KVDB(out_dir.joinpath('people.json'))
    people = read_cluster_csv('dsc650/data/external/tidynomicon/person.csv')
    for _, person in people.iterrows():
        people_db.set_value(person['person_id'], person.to_dict())
    people_db.save()


def create_visits_kvdb():
    """Write ``visits.json``: one entry per row of the visited CSV.

    Each row is stored as a dict under the composite key
    ``(visit_id, site_id)``; ``KVDB`` stringifies that tuple.
    """
    out_dir = Path('dsc650/assignments/assignment02/results/kvdb').resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    visits_db = KVDB(out_dir.joinpath('visits.json'))
    visits = read_cluster_csv('dsc650/data/external/tidynomicon/visited.csv')
    for _, visit in visits.iterrows():
        composite_key = (visit['visit_id'], visit['site_id'])
        visits_db.set_value(composite_key, visit.to_dict())
    visits_db.save()


def create_measurements_kvdb():
    """Write ``measurements.json``: one entry per row of the measurements CSV.

    Each row is stored as a dict under the composite key
    ``(visit_id, person_id, quantity)``; ``KVDB`` stringifies that tuple.
    """
    out_dir = Path('dsc650/assignments/assignment02/results/kvdb').resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    measurements_db = KVDB(out_dir.joinpath('measurements.json'))
    measurements = read_cluster_csv('dsc650/data/external/tidynomicon/measurements.csv')
    for _, measurement in measurements.iterrows():
        composite_key = (
            measurement['visit_id'],
            measurement['person_id'],
            measurement['quantity'],
        )
        measurements_db.set_value(composite_key, measurement.to_dict())
    measurements_db.save()


# Build every key-value store: sites, people, visits, then measurements.
for _build_kvdb in (
    create_sites_kvdb,
    create_people_kvdb,
    create_visits_kvdb,
    create_measurements_kvdb,
):
    _build_kvdb()





### Assignment 2.2

In [14]:
pip install tinydb


Collecting tinydb
  Downloading tinydb-4.8.0-py3-none-any.whl (24 kB)
Installing collected packages: tinydb
Successfully installed tinydb-4.8.0
Note: you may need to restart the kernel to use updated packages.


In [18]:
from pathlib import Path
import json
import os

from tinydb import TinyDB


# Output directories live under the notebook's current working directory.
current_dir = Path.cwd()
results_dir = current_dir / 'results'
kv_data_dir = results_dir / 'kvdb'
kv_data_dir.mkdir(parents=True, exist_ok=True)


class DocumentDB(object):
    """TinyDB document store holding one nested document per person.

    Each document is a copy of the person's record plus a ``visits`` list.
    Every visit embeds its ``site`` record and the person's ``measurements``
    taken during that visit.
    """

    def __init__(self, db_path, person_lookup=None, visit_lookup=None,
                 site_lookup=None, measurements_lookup=None):
        """Create the TinyDB file at ``db_path`` and populate it.

        Args:
            db_path: location of the TinyDB JSON file.
            person_lookup: dict of person id (str) -> person record.
            visit_lookup: dict of visit id (str) -> visit record with a
                ``site_id`` entry.
            site_lookup: dict of site id (str) -> site record.
            measurements_lookup: dict whose values are lists of measurement
                records carrying ``person_id`` and ``visit_id``.

        Any lookup left as ``None`` falls back to the module-level table of
        the same name (``_person_lookup`` etc.), so ``DocumentDB(db_path)``
        works once those tables are defined.
        """
        # The helper methods read these from ``self``; they must be assigned
        # before ``_load_db`` runs.
        self._person_lookup = _person_lookup if person_lookup is None else person_lookup
        self._visit_lookup = _visit_lookup if visit_lookup is None else visit_lookup
        self._site_lookup = _site_lookup if site_lookup is None else site_lookup
        self._measurements_lookup = (
            _measurements_lookup if measurements_lookup is None else measurements_lookup
        )

        self._db_path = Path(db_path)
        self._db = None
        self._load_db()

    def _get_site(self, site_id):
        """Return the site record for ``site_id`` (raises KeyError if unknown)."""
        return self._site_lookup[str(site_id)]

    def _get_measurements(self, person_id):
        """Return every measurement whose ``person_id`` matches, compared as strings."""
        wanted = str(person_id)
        return [
            measurement
            for group in self._measurements_lookup.values()
            for measurement in group
            if str(measurement['person_id']) == wanted
        ]

    def _get_visit(self, visit_id):
        """Return a copy of the visit record with its site embedded under ``site``.

        Raises:
            KeyError: if ``visit_id`` (or the visit's ``site_id``) is unknown.
        """
        # Copy so the shared lookup entry is not modified: the same visit can
        # belong to several people, each with their own measurements.
        visit = dict(self._visit_lookup[str(visit_id)])
        visit['site'] = self._get_site(visit['site_id'])
        return visit

    def _build_documents(self):
        """Assemble the nested person documents without touching the lookups."""
        documents = []
        for person_id, record in self._person_lookup.items():
            measurements = self._get_measurements(person_id)
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the stored visit order does not depend on set iteration order.
            visit_ids = list(dict.fromkeys(m['visit_id'] for m in measurements))
            visits = []
            for visit_id in visit_ids:
                visit = self._get_visit(visit_id)
                visit['measurements'] = [
                    m for m in measurements if m['visit_id'] == visit_id
                ]
                visits.append(visit)
            document = dict(record)
            document['visits'] = visits
            documents.append(document)
        return documents

    def _load_db(self):
        """Open the TinyDB file and insert one document per person."""
        self._db = TinyDB(self._db_path)
        for document in self._build_documents():
            self._db.insert(document)

    def add_person(self, person):
        """Insert ``person`` (a dict) into the database as-is."""
        self._db.insert(person)


# Start clean: drop any database file left behind by a previous run.
db_path = results_dir / 'patient-info3.json'
if db_path.exists():
    db_path.unlink()


# Sample people, keyed by person id (as a string).
_person_lookup = {
    '1': {'name': 'John Doe', 'age': 30, 'gender': 'Male', 'city': 'New York'},
    '2': {'name': 'Jane Smith', 'age': 28, 'gender': 'Female', 'city': 'Los Angeles'},
}

# Sample sites, keyed by site id (as a string).
_site_lookup = {
    '1': {'site_id': 1, 'site_name': 'Site A'},
    '2': {'site_id': 2, 'site_name': 'Site B'},
}

# Sample measurements: each key maps to a list of measurement records.
_measurements_lookup = {
    '1': [
        {'person_id': 1, 'visit_id': 1, 'measurement': 150},
        {'person_id': 1, 'visit_id': 2, 'measurement': 155},
    ],
    '2': [
        {'person_id': 2, 'visit_id': 1, 'measurement': 140},
    ],
}

# Sample visits, keyed by visit id (as a string).
_visit_lookup = {
    '1': {'visit_id': 1, 'site_id': 1, 'visit_date': '2023-06-19'},
    '2': {'visit_id': 2, 'site_id': 2, 'visit_date': '2023-06-20'},
}

# Instantiating DocumentDB opens the TinyDB file and populates it via _load_db.
db = DocumentDB(db_path)

# Example: Adding a person to the database
person1 = dict(name="John Doe", age=30, gender="Male", city="New York")
db.add_person(person1)

# Close the underlying TinyDB handle once all inserts are done.
db._db.close()


### Assignment 2.3

In [16]:
import sqlite3

# Opens the SQLite file at db_path, sets PRAGMA busy_timeout to 0 on that
# connection, and closes the connection again.
# NOTE(review): this path ('dsc650/dsc650/...') differs from the db_path the
# load cell below builds from results_dir -- confirm which file is intended.
# NOTE(review): busy_timeout is presumably a per-connection setting, so it
# would not outlive conn.close() -- confirm against the SQLite PRAGMA docs.
db_path = 'dsc650/dsc650/assignments/assignment02/results/patient-info3.db'
conn = sqlite3.connect(db_path)
conn.execute('PRAGMA busy_timeout = 0;')
conn.close()


In [18]:
import sqlite3
import pandas as pd

def create_measurements_table(conn):
    """Create the ``measurements`` table on ``conn`` unless it already exists.

    Columns: ``visit_id`` (integer, required), ``person_id`` (text, required),
    ``quantity`` (text) and ``reading`` (real). ``visit_id`` and ``person_id``
    are declared as foreign keys into ``visits`` and ``people``.
    """
    ddl = """
    CREATE TABLE IF NOT EXISTS measurements (
        visit_id integer NOT NULL,
        person_id text NOT NULL,
        quantity text,
        reading real,
        FOREIGN KEY (visit_id) REFERENCES visits (visit_id),
        FOREIGN KEY (person_id) REFERENCES people (person_id)
    );
    """

    conn.cursor().execute(ddl)

def create_people_table(conn):
    """Create the ``people`` table on ``conn`` unless it already exists.

    Columns: ``person_id`` (text primary key), ``personal_name`` (text) and
    ``family_name`` (text).
    """
    ddl = """
    CREATE TABLE IF NOT EXISTS people (
        person_id text PRIMARY KEY,
        personal_name text,
        family_name text
    );
    """

    cursor = conn.cursor()
    cursor.execute(ddl)

def create_sites_table(conn):
    """Create the ``sites`` table on ``conn`` unless it already exists.

    Columns: ``site_id`` (text primary key), ``latitude`` and ``longitude``
    (both real, required).
    """
    ddl = """
    CREATE TABLE IF NOT EXISTS sites (
        site_id text PRIMARY KEY,
        latitude real NOT NULL,
        longitude real NOT NULL
    );
    """

    conn.cursor().execute(ddl)

def create_visits_table(conn):
    """Create the ``visits`` table on ``conn`` unless it already exists.

    Columns: ``visit_id`` (integer primary key), ``site_id`` (text, required,
    declared as a foreign key into ``sites``) and ``visit_date`` (text).
    """
    ddl = """
    CREATE TABLE IF NOT EXISTS visits (
        visit_id integer PRIMARY KEY,
        site_id text NOT NULL,
        visit_date text,
        FOREIGN KEY (site_id) REFERENCES sites (site_id)
    );
    """

    cursor = conn.cursor()
    cursor.execute(ddl)

def load_measurements_table(conn):
    """Refill the ``measurements`` table from the tidynomicon measurements CSV.

    Ensures the table exists, deletes any existing rows, then inserts every
    CSV row. The caller is responsible for committing.
    """
    create_measurements_table(conn)
    rows = pd.read_csv('dsc650/data/external/tidynomicon/measurements.csv').values
    cursor = conn.cursor()
    cursor.execute('DELETE FROM measurements;')  # clear rows from earlier runs
    cursor.executemany('INSERT INTO measurements VALUES (?,?,?,?)', rows)

def load_people_table(conn):
    """Refill the ``people`` table from the tidynomicon person CSV.

    Ensures the table exists, deletes any existing rows, then inserts every
    CSV row. The caller is responsible for committing.
    """
    create_people_table(conn)
    rows = pd.read_csv('dsc650/data/external/tidynomicon/person.csv').values
    cursor = conn.cursor()
    cursor.execute('DELETE FROM people;')  # clear rows from earlier runs
    cursor.executemany('INSERT INTO people VALUES (?,?,?)', rows)

def load_sites_table(conn):
    """Refill the ``sites`` table from the tidynomicon site CSV.

    Ensures the table exists, deletes any existing rows, then inserts every
    CSV row. The caller is responsible for committing.

    Raises:
        FileNotFoundError: if the CSV is missing.
    """
    create_sites_table(conn)
    # The file is named 'site.csv' (singular), the same file that
    # create_sites_kvdb reads from this directory.
    df = pd.read_csv('dsc650/data/external/tidynomicon/site.csv')
    sites = df.values
    c = conn.cursor()
    c.execute('DELETE FROM sites;')  # Delete data if exists
    c.executemany('INSERT INTO sites VALUES (?,?,?)', sites)

def load_visits_table(conn):
    """Refill the ``visits`` table from the tidynomicon visited CSV.

    Ensures the table exists, deletes any existing rows, then inserts every
    CSV row. The caller is responsible for committing.

    Raises:
        FileNotFoundError: if the CSV is missing.
    """
    create_visits_table(conn)
    # The file is named 'visited.csv', the same file that create_visits_kvdb
    # reads from this directory.
    df = pd.read_csv('dsc650/data/external/tidynomicon/visited.csv')
    visits = df.values
    c = conn.cursor()
    c.execute('DELETE FROM visits;')  # Delete data if exists
    c.executemany('INSERT INTO visits VALUES (?,?,?)', visits)

# Create the database and load tables
db_path = results_dir.joinpath('patient-info3.db')
conn = sqlite3.connect(str(db_path))
try:
    load_measurements_table(conn)
    load_people_table(conn)
    load_sites_table(conn)
    load_visits_table(conn)
    conn.commit()
finally:
    # Always close, even when a loader raises: a connection left open in the
    # kernel with an uncommitted transaction keeps the file locked, and the
    # next run of this cell then fails with "database is locked".
    conn.close()



OperationalError: database is locked

### Assignment 2.4

In [21]:
pip install SPARQLWrapper

Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Collecting rdflib>=6.1.1
  Downloading rdflib-6.3.2-py3-none-any.whl (528 kB)
     ------------------------------------- 528.1/528.1 kB 11.3 MB/s eta 0:00:00
Collecting isodate<0.7.0,>=0.6.0
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
     ---------------------------------------- 41.7/41.7 kB ? eta 0:00:00
Installing collected packages: isodate, rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 isodate-0.6.1 rdflib-6.3.2
Note: you may need to restart the kernel to use updated packages.


In [23]:
from pathlib import Path
import json
from SPARQLWrapper import SPARQLWrapper, JSON

# Wikidata SPARQL endpoint that the query below is sent to
endpoint_url = "https://query.wikidata.org/sparql"

# Selects ?date, ?event and ?eventLabel (in that column order), at most 10 rows
query = """
SELECT ?date ?event ?eventLabel
WHERE
{
    ?event wdt:P31/wdt:P279* wd:Q1190554.
    OPTIONAL { ?event wdt:P585 ?date. }
    OPTIONAL { ?event wdt:P580 ?date. }
    FILTER(BOUND(?date) && DATATYPE(?date) = xsd:dateTime).
    BIND(NOW() - ?date AS ?distance).
    FILTER(0 <= ?distance && ?distance < 31).
    OPTIONAL {
        ?event rdfs:label ?eventLabel.
        FILTER(LANG(?eventLabel) = "en").
    }
}
LIMIT 10
"""

# Configure the client: target endpoint, query text, JSON return format
sparql = SPARQLWrapper(endpoint_url)
sparql.setQuery(query)
sparql.setReturnFormat(JSON)

# Run the query and convert the response
results = sparql.query().convert()

# Make sure the results directory exists, then pick the output file inside it
output_dir = Path("dsc650/assignments/assignment02/results")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir.joinpath("wikidata-query.json")

# Serialize the results as JSON and write them to the output file
with open(output_file, "w") as f:
    f.write(json.dumps(results))
