In [1]:
# Manipulate the file system
import os
import shutil

# Copy dictionaries
import copy

# Regular expressions
import re

# Display errors in realtime
import ipywidgets as widgets

# work with dates
import datetime
import arrow
import time

# For scrubbing PII
import hashlib

# Convert stored string representation of a list to a list
import ast

# Recurse through a directory tree and return file names with glob
import glob

# Decode and re-encode mangled Arabic file names
import codecs

# Connect to a SQLite database in a lazy manner.
import sqlalchemy
import dataset

# Enables opening and reading of Excel files
import openpyxl

# Read and write CSV files
import csv

# Translating variables, sheet names, and workbook names from Arabic
# This is NOT free to use.
from google.cloud import translate

# Point the Google client libraries at the service account key file.
# setdefault leaves an already-configured path untouched, so the
# machine-specific location below is only a fallback for this workstation.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', 'C:\\Users\\clay\\Documents\\fxb-lcs-2b24f4f8a73a.json')

In [2]:
# If there's an existing db for this sheet, delete it
# so that we can copy from the template for a fresh start

try:
    os.remove("sams_data_phase10.sqlite")
    print("Removed template clone sams_data_phase10.sqlite")
except FileNotFoundError:
    # Nothing to clean up: no working copy exists yet.
    pass
except OSError as remove_error:
    # Any other failure means the old copy is still in place; say so instead
    # of hiding it, because later cells would otherwise run against stale data.
    print("Could not remove sams_data_phase10.sqlite:", remove_error)

try:
    # Recreate the working database from the untouched template
    shutil.copy2("sams_data_phase10_template.sqlite","sams_data_phase10.sqlite")

    print("Created database from template: sams_data_phase10.sqlite")
except OSError as copy_error:
    # Report the failure rather than swallowing it; without the copy the
    # working database does not contain the template's tables.
    print("Could not create database from template:", copy_error)

Created database from template: sams_data_phase10.sqlite


In [3]:
# Open the working copy of the database; later cells look up tables through `db`.
db = dataset.connect("sqlite:///sams_data_phase10.sqlite")

Some of the UOSSM facilities are colocated with the SAMS facilities. For the purposes of normalizing the format of opening months for the data at hand, we will ignore those relationships. Later, we will query the timeline table in a manner that takes those relationships into account.

There are three possible opening and closing date sets in the table. The relevant fields are:

```
dateopened --> dateclosed
reopened1  --> closed1
reopened2  --> closed2
```

Most of the facilities have a value only in the `dateopened` field. Facilities that have not closed will have the value `ongoing` in the relevant closed field and have a value of 1 in the `openstatus` field. 

Use the `openstatus` value to determine which group to process. Facilities are marked as being open or closed for the month in question.

In [4]:
# Table that receives the facility/year/month records generated further down.
tab_timeline = db['facility_timeline']

Given the date the data was provided, "ongoing" means "ongoing as of June 2017."

In [6]:
# Cut-off used wherever a facility's close date is "ongoing"
ongoing_date = arrow.get('2017-06-30','YYYY-MM-DD')

# Month and year of the cut-off as plain integers, for building records directly
ongoing_date_month = ongoing_date.month
ongoing_date_year = ongoing_date.year

print(ongoing_date_month,ongoing_date_year)

6 2017


In [22]:
def records_for_range(open_date,close_date,facility_id):
    """
    Build one timeline record per calendar month in a date range.

    open_date and close_date are Arrow dates; the month containing each
    of them is included in the result. Every record is a dict with the
    keys "facility_id", "year" and "month", ready for insertion in the
    database. An open date in a later month than the close date yields
    an empty list.
    """
    # Anchor both ends on the last day of their month so the comparison
    # below works on whole months rather than on the original days.
    first_month = arrow.get(open_date.ceil('month').date())
    last_month = arrow.get(close_date.ceil('month').date())

    def month_steps():
        # Walk forward one month at a time until the closing month has passed.
        cursor = first_month
        while cursor <= last_month:
            yield cursor
            cursor = cursor.shift(months=1)

    return [
        {"facility_id": facility_id, "year": step.year, "month": step.month}
        for step in month_steps()
    ]

In [27]:
def _records_for_period(opened, closed, facility_id):
    """Return the monthly records for one open/close pair, or [] when the pair has no open date."""
    if not opened:
        return []

    # TODO: Check that the open value is not "ongoing"; it would not parse as a date
    start = arrow.get(opened,"YYYY-MM-DD")

    if closed == "ongoing" or closed is None:
        # Still open: run the period up to the data cut-off
        end = ongoing_date
    else:
        end = arrow.get(closed,"YYYY-MM-DD")

    return records_for_range(start,end,facility_id)


def extract_timeline_from_facility(facility_record):
    """
    Turn one facility row into its list of open-month timeline records.

    Reads the three open/close pairs of the row
    (dateopened/dateclosed, reopened1/closed1, reopened2/closed2) plus
    'openstatus' and 'id'. A close value of "ongoing" or None extends the
    period to the module-level ongoing_date.

    Returns a list of dicts with the keys "facility_id", "year" and
    "month". Each month appears at most once, even when two periods touch
    or overlap (for example a closure and a reopening in the same month).

    Special cases when 'dateopened' is empty:
      * openstatus is 1  -> a single record for the ongoing month
      * anything else    -> an empty list (the reopened fields are ignored)

    Date strings must be in YYYY-MM-DD format; parsing errors raised by
    arrow.get are not caught here.
    """
    facility_id = facility_record['id']
    first_open = facility_record['dateopened']

    if not first_open:
        # Compare as text so that an integer 1 is treated like the string "1"
        if str(facility_record['openstatus']) == "1":
            # Case: facility has no dates but openstatus is 1
            return [{"facility_id":facility_id,"year":ongoing_date_year,"month":ongoing_date_month}]
        # Case: facility has no dates and is not marked as open
        return []

    periods = (
        (first_open, facility_record['dateclosed']),
        (facility_record['reopened1'], facility_record['closed1']),
        (facility_record['reopened2'], facility_record['closed2']),
    )

    facility_records = []
    seen_months = set()
    for opened, closed in periods:
        for rec in _records_for_period(opened, closed, facility_id):
            # Keep only the first record for any given month so the timeline
            # table does not end up with duplicate facility/year/month rows.
            month_key = (rec["year"], rec["month"])
            if month_key not in seen_months:
                seen_months.add(month_key)
                facility_records.append(rec)

    return facility_records

In [33]:
# Source table whose rows carry the open/close date fields for each facility.
tab_facilities = db['facilities']

In [39]:
# Pull every facility row into a list so it can be iterated below.
facilities = list(tab_facilities.find())

Facility ID 265 had a close date but no open date, so its open date was set to the same month as its close date for now. I also did some manual work in the database to convert incorrectly formatted dates to the `YYYY-MM-DD` format so that they can be parsed.

In [41]:
# Flatten the per-facility month records into one list destined for the timeline table.
all_facility_recs = [
    timeline_rec
    for facility in facilities
    for timeline_rec in extract_timeline_from_facility(facility)
]

In [43]:
# Write all generated month records to the timeline table in one call.
# NOTE(review): running this cell again presumably inserts the same rows a second time — confirm before re-running.
tab_timeline.insert_many(all_facility_recs)

In [44]:
# Copy the finished phase 10 database over as the template for the next file.
# This Notebook did not include manual editing of the data.

# Do not rerun this cell! copy2 overwrites the existing phase 11 template
# with whatever state this database is in at that moment.
shutil.copy2('sams_data_phase10.sqlite','sams_data_phase11_template.sqlite')

'sams_data_phase11_template.sqlite'