In [1]:
# Manipulate the file system
import os
import shutil

# Copy dictionaries
import copy

# Regular expressions
import re

# Display errors in realtime
import ipywidgets as widgets

# work with dates
import datetime
import arrow
import time

# For scrubbing PII
import hashlib

# Convert stored string representation of a list to a list
import ast

# Recurse through a directory tree and return file names with glob
import glob

# Decode and re-encode mangled Arabic file names
import codecs

# Connect to a SQLite database in a lazy manner.
import sqlalchemy
import dataset

# Enables opening and reading of Excel files
import openpyxl

# Read and parse CSV files
import csv

# Translating variables, sheet names, and workbook names from Arabic
# This is NOT free to use.
from google.cloud import translate

# Set the environment variable for the Google Service Account.
# setdefault keeps a value that is already exported in the environment, so the
# notebook can run on another machine without editing this hard-coded path;
# when nothing is exported, the original key-file location is used as before.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    'C:\\Users\\clay\\Documents\\fxb-lcs-2b24f4f8a73a.json'
)

In [2]:
# If there's an existing db for this sheet, delete it
# so that we can copy from the template for a fresh start

try:
    os.remove("sams_data_phase13.sqlite")
    print("Removed template clone sams_data_phase13.sqlite")
except FileNotFoundError:
    # Nothing to remove on a first run; this is the expected case.
    pass
except OSError as err:
    # Still best-effort, but say so: a stale file that cannot be removed
    # (e.g. locked by an open connection) matters for the copy below.
    print("WARNING: could not remove sams_data_phase13.sqlite: {}".format(err))

try:
    # Clone the template so that this notebook works on its own copy and the
    # template itself stays untouched.
    shutil.copy2("sams_data_phase13_template.sqlite","sams_data_phase13.sqlite")

    print("Created database from template: sams_data_phase13.sqlite")
except OSError as err:
    # Do not fail silently: without the template copy, the cells below would
    # be working against a missing or stale database file.
    print("WARNING: could not create sams_data_phase13.sqlite from template: {}".format(err))

Removed template clone sams_data_phase13.sqlite
Created database from template: sams_data_phase13.sqlite


Changes were made to the facilities table to include parent-facility information and codes for SAMS facilities. The CSV file loaded below should replace the facilities table in the database.

In [3]:
# CSV file whose rows replace the facilities table.
fac_file = "facility_table_hydration_16-NOV-2017.csv"

# Columns converted to int on import.
int_cols = [
    'id', 'openstatus',
    'country_id', 'governorate_id', 'district_id', 'subdistrict_id',
    'aleppo', 'facility_parent_id',
]

# Columns stored as NULL when the CSV cell is an empty string.
# Several of them also appear in int_cols above.
null_cols = [
    'dateopened', 'dateclosed',
    'reopened1', 'closed1',
    'reopened2', 'closed2',
    'phc', 'rh', 'mentalhealth', 'orthopedic',
    'icu', 'er', 'pediatric', 'dental',
    'organization', 'facility_type',
    'governorate', 'district', 'subdistrict',
    'country_id', 'governorate_id', 'district_id', 'subdistrict_id',
    'aleppo', 'facility_parent_id',
    'needs_review', 'duplicate_note',
]

# Parse the facility CSV into a list of dicts (one per data row), converting
# the int_cols to integers and blank null_cols to None.
facility_data = []

with open(fac_file,'r',encoding='utf-8-sig') as csvfile:
    csvreader = csv.reader(csvfile)

    # Get the header
    header = next(csvreader)

    # Iterate through the rows after the header
    for row in csvreader:

        # Create a dict out of each row, using the
        # parallel header values as the keys for the dict
        rowdict = dict(zip(header,row))

        for col in int_cols:
            raw_value = rowdict.get(col)
            if raw_value is None:
                # Column not present in this row (short row or missing header).
                continue
            try:
                rowdict[col] = int(raw_value)
            except ValueError:
                # Blank or non-numeric: keep the original string. Blank values
                # of columns that are also in null_cols become None below.
                pass

        for col in null_cols:
            value = rowdict.get(col)
            # Values already converted to int above are not strings and are
            # left alone; only blank strings are replaced with None.
            if isinstance(value, str) and value.strip() == '':
                rowdict[col] = None

        facility_data.append(rowdict)
            

In [4]:
# Open the working copy of the database (the SQLite file sams_data_phase13.sqlite
# in the current directory) through the dataset library.
db = dataset.connect("sqlite:///sams_data_phase13.sqlite")

In [5]:
# Remove the existing table and then add the imported data as the new table.
# This should pretty much be a drop-in replacement
try:
    db['facilities'].drop()
except Exception as err:
    # Still best-effort, but a failed drop is now reported: if the old table
    # survives, the rows inserted afterwards are added on top of its contents.
    print("WARNING: could not drop the facilities table: {}".format(err))

# Handle used by the following cell to insert the facility rows.
tab_facilities = db['facilities']

In [6]:
# Write the facilities parsed from the CSV into the facilities table,
# one insert per facility.
for facility in facility_data:
    tab_facilities.insert(facility)

There are a few facilities called out as being problematic. Address those here. Note that there is no facility with id 70, and I'm not sure why that is the case. Perhaps it was an obvious duplicate and was deleted? The only one marked for deletion is the facility with id 309.

Note: I'm leaving all of them in the facilities table for now because the files reference some of the "duplicates."

----

#### Import the aggregate SAMS data and transform it, then store it in the database

In [7]:
# CSV file with the monthly aggregate SAMS data that the following cells
# read, reshape and store in the database.
agg_file = 'sams_monthly_aggregate_with_id_16-NOV-2017.csv'

In [8]:
# Every line of the aggregate CSV after the header becomes a
# {column name: raw string} dict; no values are converted at this stage.
raw_aggregate_data = []

with open(agg_file,'r',encoding='utf-8-sig') as agg_handle:
    agg_reader = csv.reader(agg_handle)

    # The first line holds the column names
    column_names = next(agg_reader)

    # Pair each remaining line with the column names
    raw_aggregate_data.extend(
        dict(zip(column_names, fields)) for fields in agg_reader
    )
        

In [9]:
# Reshape the wide aggregate rows into one record per (row, metric):
# every non-descriptive column holding a non-zero integer becomes a data point
# carrying the row's descriptive fields plus 'metric' and 'metric_value'.
cleaned_aggregate_data = []

# Columns that describe the row itself rather than holding a metric value.
desc_keys = [
    "facility_code",
    "facility_name",
    "country",
    "governorate",
    "district",
    "year",
    "month",
    "day",
    "date"
]

for row in raw_aggregate_data:
    # Descriptive fields shared by every data point generated from this row.
    # A malformed year/month/day/date still raises here, as it did before.
    descriptors = {
        'facility_code': row['facility_code'].strip(),
        'facility_name': row['facility_name'].strip(),
        'country': row['country'].strip(),
        'governorate': row['governorate'].strip(),
        'district': row['district'].strip(),
        'year': int(row['year']),
        'month': int(row['month']),
        'day': int(row['day']),
        'date': arrow.get(row['date'],'M/D/YYYY').date(),
    }

    for key, value in row.items():
        if key in desc_keys:
            continue

        # Skip missing and blank cells
        if value is None or value.strip() == '':
            continue

        try:
            int_val = int(value)
        except ValueError:
            # Only the integer conversion can fail here. Values that are not
            # plain integers are skipped, exactly as before.
            # NOTE(review): this also drops values such as "12.0" or "1,234";
            # confirm that is intended.
            continue

        # Zero counts are not stored
        if int_val == 0:
            continue

        # Copy so that every data point owns its own dict; the key order
        # (descriptors, then metric, then metric_value) matches the original.
        data_point = dict(descriptors)
        data_point['metric'] = key.strip()
        data_point['metric_value'] = int_val

        cleaned_aggregate_data.append(data_point)

In [10]:
# Sanity check: number of data points produced by the reshaping step
# (shown as this cell's output).
len(cleaned_aggregate_data)

28968

In [11]:
# Handle to the aggregate_sams table in the working database; the next cell
# inserts the reshaped data points through it.
tab_agg = db['aggregate_sams']

In [12]:
# Store all reshaped data points in the aggregate_sams table with a single
# bulk call.
tab_agg.insert_many(cleaned_aggregate_data)

In [13]:
# Copy the database over as the template for the next file.
# The finished phase-13 database becomes sams_data_phase14_template.sqlite.

# Do not rerun this cell!
# NOTE(review): presumably because rerunning overwrites the phase-14 template
# with whatever state the phase-13 database is in at that moment - confirm.
shutil.copy2('sams_data_phase13.sqlite','sams_data_phase14_template.sqlite')

'sams_data_phase14_template.sqlite'