In [1]:
# Manipulate the file system
import os
import shutil

# Copy dictionaries
import copy

# Regular expressions
import re

# Display errors in realtime
import ipywidgets as widgets

# work with dates
import datetime
import arrow
import time

# For scrubbing PII
import hashlib

# Convert stored string representation of a list to a list
import ast

# Recurse through a directory tree and return file names with glob
import glob

# Decode and re-encode mangled Arabic file names
import codecs

# Connect to a SQLite database in a lazy manner.
import sqlalchemy
import dataset

# Enables opening and reading of Excel files
import openpyxl

# Read and write CSV files
import csv

# Translating variables, sheet names, and workbook names from Arabic
# This is NOT free to use.
from google.cloud import translate

# Point the Google Cloud client at a service-account key file.
# setdefault keeps a credential path that is already configured in the
# environment and only falls back to the original workstation-specific
# path when the variable is not set at all.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    'C:\\Users\\clay\\Documents\\fxb-lcs-2b24f4f8a73a.json',
)

In [2]:
# If there's an existing db for this phase, delete it
# so that we can copy from the template for a fresh start.

try:
    os.remove("sams_data_phase15.sqlite")
    print("Removed template clone sams_data_phase15.sqlite")
except FileNotFoundError:
    # Nothing to clean up (e.g. on the very first run).
    pass
except OSError as err:
    # e.g. the file is locked by another process: keep going, but say why.
    print("Could not remove sams_data_phase15.sqlite:", err)

try:
    # Work on a copy so the template stays untouched and can be used to
    # restore the database if there is a problem.
    shutil.copy2("sams_data_phase15_template.sqlite","sams_data_phase15.sqlite")

    print("Created database from template: sams_data_phase15.sqlite")
except OSError as err:
    # Still non-fatal, as before, but no longer silent: without the copy the
    # following cells would not be working on the template's data.
    print("Could not create sams_data_phase15.sqlite from template:", err)

Removed template clone sams_data_phase15.sqlite
Created database from template: sams_data_phase15.sqlite


In [3]:
# Open the phase-15 SQLite database file through the `dataset` library.
db = dataset.connect("sqlite:///sams_data_phase15.sqlite")

In [4]:
# Table handles for the two existing tables.
# NOTE(review): `tab_locations` is not referenced again in the visible part
# of this notebook; only `tab_facilities` is read below.
tab_locations = db['locations']
tab_facilities = db['facilities']    

In [5]:
# Drop any previous `facility_timeline` table so the rows inserted further
# down are not added on top of rows from an earlier run.
try:
    db['facility_timeline'].drop()
except Exception as err:
    # Best effort, as before, but report the reason instead of hiding it.
    # Unlike a bare `except:`, this does not swallow KeyboardInterrupt or
    # SystemExit.
    print("Could not drop facility_timeline:", err)

tab_timeline = db['facility_timeline']

In [6]:
# Cut-off date used as the closing date for facilities that are still open.
ongoing_date = arrow.get('2017-06-30', 'YYYY-MM-DD')

# Arrow exposes the calendar fields directly as integers.
ongoing_date_year = ongoing_date.year
ongoing_date_month = ongoing_date.month

print(ongoing_date_month, ongoing_date_year)

6 2017


In [7]:
def records_for_range(open_date,close_date,facility_id):
    """
    Given two dates that form a range, return records
    for insertion in the database that span the date
    range.

    One record is produced per calendar month, from the month of
    `open_date` through the month of `close_date` (both inclusive),
    in chronological order. The day of month is ignored.

    Parameters:
        open_date:   start of the range; any object with integer `.year`
                     and `.month` attributes (Arrow, datetime, date).
        close_date:  end of the range; same requirements as `open_date`.
        facility_id: value stored under "facility_id" in every record.

    Returns:
        A list of dicts with the keys "facility_id", "year" and "month".
        The list is empty when the close month precedes the open month.
    """
    # Count months on a single integer axis. This avoids stepping a
    # month-end date forward one month at a time, which only stayed correct
    # because of end-of-month day clamping (Jan 31 -> Feb 28 -> Mar 28 ...).
    first_month = open_date.year * 12 + (open_date.month - 1)
    last_month = close_date.year * 12 + (close_date.month - 1)

    records = []
    for month_index in range(first_month, last_month + 1):
        rec = {
            "facility_id": facility_id,
            "year": month_index // 12,
            "month": month_index % 12 + 1,
        }
        records.append(rec)

    return records

In [8]:
def _records_for_period(open_value, close_value, facility_id):
    """
    Build the monthly records for one open/close period of a facility.

    A missing open date yields no records. A close date that is missing
    (None or empty) or equal to the literal "ongoing" means the period
    runs up to the module-level `ongoing_date`.
    """
    if not open_value:
        return []

    # TODO (carried over from the original code): an "ongoing" marker in an
    # open-date field is not handled and would fail to parse here.
    period_start = arrow.get(open_value, "YYYY-MM-DD")

    if close_value == "ongoing" or not close_value:
        # Case: the facility is still open in this period
        period_end = ongoing_date
    else:
        period_end = arrow.get(close_value, "YYYY-MM-DD")

    return records_for_range(period_start, period_end, facility_id)


def extract_timeline_from_facility(facility_record):
    """
    Return the timeline records (one per month of operation) for a facility.

    `facility_record` is a mapping with the keys 'id', 'openstatus',
    'dateopened', 'dateclosed', 'reopened1', 'closed1', 'reopened2' and
    'closed2'. Dates are parsed as "YYYY-MM-DD" strings.

    Rules:
      * No first opening date and openstatus "1": a single record for the
        month of `ongoing_date`.
      * No first opening date and any other openstatus: no records.
      * Otherwise: the months of up to three open/close periods, in order.
        The reopen periods are only considered when the first opening date
        is present.

    Returns a list of dicts with the keys "facility_id", "year", "month".
    Overlapping periods are not de-duplicated.
    """
    facility_id = facility_record['id']
    openstatus = facility_record['openstatus']

    # Read every field up front so a record that lacks a column fails
    # immediately, whichever branch is taken below.
    periods = [
        (facility_record['dateopened'], facility_record['dateclosed']),
        (facility_record['reopened1'], facility_record['closed1']),
        (facility_record['reopened2'], facility_record['closed2']),
    ]

    first_open = periods[0][0]
    if not first_open:
        if openstatus == "1":
            # Case: facility has no dates but openstatus is 1
            return [{"facility_id":facility_id,"year":ongoing_date_year,"month":ongoing_date_month}]
        # Case: facility has no dates and is not flagged as open
        return []

    facility_records = []
    for opened, closed in periods:
        facility_records += _records_for_period(opened, closed, facility_id)

    return facility_records

In [9]:
# Load every row of the facilities table into memory.
facilities = list(tab_facilities.find())

In [10]:
# Collect the timeline rows of every facility into one flat list
# ready to be inserted into the timeline table.
all_facility_recs = []
for facility in facilities:
    all_facility_recs.extend(extract_timeline_from_facility(facility))

In [11]:
# Write all collected facility/year/month rows to the timeline table in one call.
tab_timeline.insert_many(all_facility_recs)

In [12]:
# Join every timeline month with the attributes of its facility. This is the
# result set that allows us to export to a timeline in Excel.

results = db.query("""
SELECT facility_timeline.year,
	   facility_timeline.month,
	   facilities.country,
	   facilities.governorate,
	   facilities.district,
	   facilities.subdistrict,
	   facilities.facility_type,
	   facilities.id as facility_id,
	   facilities.facility_parent_id,
       facilities.facility_code,
	   facilities.facilityname,
       facilities.phc,
       facilities.rh,
       facilities.mentalhealth,
       facilities.orthopedic,
       facilities.icu,
       facilities.er,
       facilities.pediatric,
       facilities.dental
FROM facility_timeline
JOIN facilities ON facility_timeline.facility_id = facilities.id;

""")

# Materialise the result iterator into a list of rows.
data = list(results)

In [13]:
# Every "YEAR-MONTH" column label from 2011-1 through 2017-12, in
# chronological order. Months are not zero-padded.
poss_dates = [
    str(year) + "-" + str(month)
    for year in range(2011, 2018)
    for month in range(1, 13)
]

In [14]:
# Facility attribute columns for the export.
# NOTE(review): `header` is assigned again further down (same list plus an
# "org" column) before it is first read, so this definition is superseded.
header = [
    "country",
    "governorate",
    "district",
    "subdistrict",
    "facility_type",
    "facility_id",
    "facility_parent_id",
    "facility_code",
    "facilityname",
    "phc",
    "rh",
    "mentalhealth",
    "orthopedic",
    "icu",
    "er",
    "pediatric",
    "dental"
]

In [15]:
# Uncomment to see the data structure
# data[0]

In [16]:
clean_data = []

for row in data:
    # Copy the row, replacing NULL values with empty strings.
    rec = {key: ('' if row[key] is None else row[key]) for key in row.keys()}

    # Year and month become strings so they can be joined into column labels.
    rec['month'] = str(rec['month'])
    rec['year'] = str(rec['year'])

    # A facility code containing "SAMS" marks a SAMS facility;
    # every other facility is attributed to UOSSM.
    rec['org'] = 'SAMS' if 'SAMS' in rec['facility_code'] else 'UOSSM'

    clean_data.append(rec)

In [17]:
# Facility attribute columns of the export, in output order. Compared with
# the query columns this adds "org", which is derived from the facility code.
header = [
    "country",
    "governorate",
    "district",
    "subdistrict",
    "facility_type",
    "facility_id",
    "facility_parent_id",
    "facility_code",
    "org",
    "facilityname",
    "phc",
    "rh",
    "mentalhealth",
    "orthopedic",
    "icu",
    "er",
    "pediatric",
    "dental"
]

In [18]:
# Uncomment to see the data structure
# clean_data[0]

In [19]:
# Pivot the month-level rows into one entry per facility name:
# the facility attributes plus a 'Y' flag for every month it was open.
# NOTE(review): entries are keyed by facility name, so two facilities that
# share a name end up in the same entry - confirm names are unique.
lookup = {}

for rec in clean_data:
    name = rec['facilityname']

    if name not in lookup:
        # First row seen for this name: copy the attribute columns once.
        lookup[name] = {column: rec[column] for column in header}

    lookup[name][rec['year'] + "-" + rec['month']] = 'Y'

In [20]:
# Uncomment to see the data structure
# lookup['Jebel Saman Mobile Clinic']

In [21]:
# Give every facility an empty value for each month it has no 'Y' flag for.
for facility_entry in lookup.values():
    for label in poss_dates:
        facility_entry.setdefault(label, '')

In [22]:
# Uncomment to see the data structure
# lookup['Jebel Saman Mobile Clinic']

In [23]:
# Append the month columns to the header. Only labels that are not already
# present are added, so re-running this cell does not duplicate the date
# columns (the plain `header + poss_dates` grew the header on every run).
header = header + [label for label in poss_dates if label not in header]

In [24]:
# Export the timeline with the opening and closing dates of the facilities.
#
# csv.writer quotes values that contain commas, quotes or line breaks.
# Joining the values with "," by hand would shift the columns of such rows.
# newline='' hands line-ending control to the csv module, as its
# documentation requires. Rows end in "\r\n", which matches what the
# text-mode "\n" writes produced on Windows.
# NOTE(review): no encoding is passed, so the platform default is used, as
# before - confirm it can represent every facility name.
with open("02-DEC-2017_SAMS_and_UOSSM_facility_open_dates.csv", 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(header)
    for fac in lookup.keys():
        writer.writerow([str(lookup[fac][h]) for h in header])

In [27]:
# Remove arabic_values that have PII and clean up the database.
# This step and the two DELETE statements that follow modify the database.

# Drop the `aleppo_arabic_values` table
db['aleppo_arabic_values'].drop()

True

In [28]:
# Arabic values that have PII - identified with a manual scan; possibly not comprehensive.
# The id range is hard-coded from that scan (BETWEEN includes both end points).

db.query("""
DELETE FROM arabic_values WHERE id BETWEEN 2056 AND 2185;
""")

<dataset.persistence.util.ResultIter at 0x20c436a35f8>

In [29]:
# Remove another instance: a single row outside the range deleted above,
# again identified by its hard-coded id.
db.query("""
DELETE FROM arabic_values WHERE id = 2193;
""")

<dataset.persistence.util.ResultIter at 0x20c436a3898>

In [30]:
# Copy the database over as the template for the next file.
# This Notebook did not include manual editing of the data.

# Do not rerun this cell!
# copy2 replaces an existing 'sams_data_phase16_template.sqlite' without asking.
shutil.copy2('sams_data_phase15.sqlite','sams_data_phase16_template.sqlite')

'sams_data_phase16_template.sqlite'