In [1]:
import psycopg2
import sqlite3
from pathlib import Path
import pandas as pd
import csv

In [2]:
# TABLE_NAME = "ReportedProcedures"
TABLE_NAME = "HospitalProcedure"

INPUT_PATH = Path("reported_procedures")
COLUMNS = [
    {"name" : "hospital_name"},
    {"name": "cpt_code"},
    {"name": "procedure_name"},
    {"name": "average_charge"},
]


# ### Postgres
# conn = psycopg2.connect(database="HealthcareSense",
# 						user='felixhab', password='',
# 						host='127.0.0.1', port='5432'
# )
# conn.autocommit = True 

### SQLite
database_dir = Path(".") / "database"
database_path = database_dir / 'healthcare_data.db'
database_dir.mkdir(exist_ok=True)

def get_conn():
    return sqlite3.connect(database_path)
conn = get_conn()

In [3]:
cursor = conn.cursor()

sql0 = f"""DROP TABLE {TABLE_NAME}"""

# Don't execute this if the database is brand new. Table would not exist. 
cursor.execute(sql0)


sql = f'''CREATE TABLE {TABLE_NAME}(
id int NOT NULL AUTO_INCREMENT,
{COLUMNS[0]["name"]} varchar(100) NOT NULL,
{COLUMNS[1]["name"]} int NOT NULL,
{COLUMNS[2]["name"]} varchar(300), 
{COLUMNS[3]["name"]} int);'''


cursor.execute(sql)

conn.commit()


In [4]:
def get_names(columns):
    return str(tuple([col["name"] for col in columns]))

cursor = conn.cursor()

# ### Works only in postgres
# for output_csv_path in INPUT_PATH.iterdir():
#     if output_csv_path.suffix != '.csv':
#         continue

#     sql2 = f'''COPY {TABLE_NAME}(hospital_name,cpt_code,
#     procedure_name,average_charge)
#     FROM '{output_csv_path.absolute()}'
#     DELIMITER ','
#     CSV HEADER;'''
        
#     # TODO: Handle filename escaping
#     try:
#         cursor.execute(sql2)
#     except:
#         print(f"Failed to load csv file {output_csv_path}")

for output_csv_path in INPUT_PATH.iterdir():
    # print(f"Reading {output_csv_path}")
    if output_csv_path.suffix != ".csv":
        print(f"{output_csv_path} is not a csv file")
        continue
        
    with open(output_csv_path) as f:
        contents = csv.reader(f)
        # Skip CSV header
        next(contents, None)
        
        sql2 = f"""
        INSERT INTO {TABLE_NAME} {get_names(COLUMNS)} VALUES(?,?,?,?);
        """
        cursor.executemany(sql2, contents)
    
conn.commit()

reported_procedures/.ipynb_checkpoints is not a csv file


In [5]:
cursor = conn.cursor()

sql3 = f'''
select * from {TABLE_NAME} 
ORDER BY average_charge;
'''
result = pd.read_sql(sql3, conn)
# cents to dollars
result.average_charge *=.01

conn.commit()

result

Unnamed: 0,hospital_name,cpt_code,procedure_name,average_charge
0,Garfield Medical Center,93000,"Electrocardiogram, routine, with interpretatio...",0.00
1,Garfield Medical Center,66821,"Discission, secondary membranous cataract, las...",0.00
2,Childrens Hospital of Orange County,77067,"Mammography, Screening, Bilateral",0.00
3,Childrens Hospital of Orange County,76805,"Ultrasound, OB, 14 weeks or more, transabdominal",0.00
4,Childrens Hospital of Orange County,93452,"Cardiac Catheterization, Left Heart, percutane...",0.00
...,...,...,...,...
4862,Garfield Medical Center,22523,"Percutaneous vertebral augmentation, including...",75296.00
4863,Garfield Medical Center,29826,"Arthroscopy, Shoulder, with partial acromioplasty",79776.00
4864,Providence Santa Rosa Memorial Hospital,47562,Laparoscopic Cholecystectomy,92634.99
4865,Garfield Medical Center,33208,Insert new or replace of permanent pacemaker w...,122912.00


In [6]:
conn.close()