In [1]:
# Manipulate the file system
import os
import shutil

# Copy dictionaries
import copy

# Regular expressions
import re

# Display errors in realtime
import ipywidgets as widgets

# work with dates
import datetime
import arrow
import time

# For scrubbing PII
import hashlib

# Convert stored string representation of a list to a list
import ast

# Recurse through a directory tree and return file names with glob
import glob

# Decode and re-encode mangled Arabic file names
import codecs

# Connect to a SQLite database in a lazy manner.
import sqlalchemy
import dataset

# Enables opening and reading of Excel files
import openpyxl

# Read the CSV input files
import csv

# Translating variables, sheet names, and workbook names from Arabic
# This is NOT free to use.
from google.cloud import translate

# Set the environment variable for the Google Service Account
# NOTE(review): this is a machine-specific absolute path to a service-account
# key file; it has to be adjusted before running on another machine. Nothing
# visible in this notebook calls `translate`, so the setting may not be needed
# here at all - confirm.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'C:\\Users\\clay\\Documents\\fxb-lcs-2b24f4f8a73a.json'

In [2]:
# If there's an existing db for this sheet, delete it
# so that we can copy from the template for a fresh start.

try:
    os.remove("sams_data_phase11.sqlite")
    print("Removed template clone sams_data_phase11.sqlite")
except FileNotFoundError:
    # First run: there is no previous clone to remove.
    pass

# Clone the template into the working database. Errors (for example a missing
# template file) are deliberately NOT swallowed: every later cell works on
# sams_data_phase11.sqlite, so continuing without a fresh copy would run the
# notebook against a missing or stale database.
shutil.copy2("sams_data_phase11_template.sqlite","sams_data_phase11.sqlite")

print("Created database from template: sams_data_phase11.sqlite")

Created database from template: sams_data_phase11.sqlite


This notebook is redundant (it repeats steps from earlier notebooks), and the following one will be redundant too. I kept it that way to maintain an execution order that mimics the order in which I processed the data. The underlying problem is that the list of facilities originally provided by SAMS was missing several facilities, which made it impossible to map those facilities to files. 

Some of those facilities are imported here. Because they lack opening and closing dates, I stubbed in 6/15/2017 as a placeholder and hope that we get better data in the coming days. Upon receiving further updates, the best process will be to update the additional facilities CSV file (named in the next cell) and then rerun this and subsequent notebooks. 

In [3]:
# CSV with the facility records to add to the facilities table.
additional_facilities_file = "additional_facilities_08-NOV-2017.csv"

# This includes only two columns: id and facility_id. The id column is the id in the files table.
updated_files_file = "updated_files_08-NOV-2017.csv"

In [4]:
# Load the additional facility records from CSV as a list of dicts keyed by
# the names in the header row.
data_facilities = []

# newline='' is what the csv module documents for files handed to csv.reader,
# so that line breaks inside quoted fields are parsed correctly.
with open(additional_facilities_file,'r',encoding='utf-8-sig',newline='') as csvfile:
    csvreader = csv.reader(csvfile)
    header = next(csvreader)
    for row in csvreader:
        # csv.reader yields an empty list for a blank line; skip those so
        # that no empty record is queued for insertion.
        if not row:
            continue

        # Convert empty (or whitespace-only) strings to null values
        rowdict = {k: (None if v.strip() == '' else v) for k, v in zip(header,row)}

        data_facilities.append(rowdict)

In [5]:
# Connect to the working SQLite database file through dataset.
db = dataset.connect("sqlite:///sams_data_phase11.sqlite")

In [6]:
# Handle for the 'facilities' table of the working database.
tab_facilities = db['facilities']

In [7]:
# Add each record loaded from the additional-facilities CSV to the
# facilities table, one insert per record.
for facility_row in data_facilities:
    tab_facilities.insert(facility_row)

Now code those facilities based on the locations table. Code from file 09.

In [8]:
tab_locations = db['locations']

# One {location_name: id} map per administrative level.
location_levels = ("country", "governorate", "district", "subdistrict")
location_lookup = {level: {} for level in location_levels}

locs = list(tab_locations.find())
for loc in locs:
    location_lookup[loc['level']][loc['location_name']] = loc['id']

# Coded records are collected first and written back in a second pass
# (presumably so the table is not modified while it is being iterated).
updated_facilities = []

for rec in tab_facilities.find():
    # Read all four location names before adding the *_id fields.
    names = {level: rec[level] for level in location_levels}
    for level in location_levels:
        rec[level + "_id"] = location_lookup[level][names[level]]

    # Flag facilities located in the Aleppo governorate.
    rec["aleppo"] = 1 if names["governorate"] == "Aleppo" else 0

    updated_facilities.append(rec)

for rec in updated_facilities:
    tab_facilities.update(rec,['id'])

Replace the `facility_id` values in the `file` table with the updated data from Ranya.

In [9]:
# Load the updated file -> facility assignments from CSV as a list of dicts
# keyed by the names in the header row.
data_files = []

# newline='' is what the csv module documents for files handed to csv.reader,
# so that line breaks inside quoted fields are parsed correctly.
with open(updated_files_file,'r',encoding='utf-8-sig',newline='') as csvfile:
    csvreader = csv.reader(csvfile)
    header = next(csvreader)
    for row in csvreader:
        # csv.reader yields an empty list for a blank line; skip those so
        # that no empty record is queued for the table update.
        if not row:
            continue

        # Convert empty (or whitespace-only) strings to null values
        rowdict = {k: (None if v.strip() == '' else v) for k, v in zip(header,row)}

        data_files.append(rowdict)

Update the records in the `files` table to include the updated `facility_id` values.

In [10]:
# Handle for the 'files' table of the working database.
tab_files = db['files']

In [11]:
# Write every CSV row into the files table, matching existing rows on 'id'.
for file_row in data_files:
    tab_files.update(file_row,['id'])

In `files`, every row whose `facility_id` equals 70 needs to be set to 309 instead. The `facility_type` of facility `310` needs to be set to Hospital.

In [12]:
# Gather the files rows that still point at facility 70 ...
update_files = [file_rec for file_rec in tab_files.find() if file_rec['facility_id'] == 70]

# ... then re-point each of them to facility 309 and write it back by 'id'.
for update in update_files:
    update['facility_id'] = 309
    tab_files.update(update,['id'])

In [13]:
# Set the type of 310 to Hospital

# Fetch the facility row with id 310, change its facility_type and write the
# row back, matching on 'id'. The update call is the last expression so its
# return value is shown as the cell output.
# NOTE(review): if no facility with id 310 exists, `row` is presumably None and
# the assignment below fails - confirm the row is always present.
row = tab_facilities.find_one(id=310)
row['facility_type'] = "Hospital"
tab_facilities.update(row,['id'])

1

In [14]:
# Delete the duplicate facility
# (files rows that referenced facility 70 were re-pointed to 309 in an earlier cell)
tab_facilities.delete(id=70)

True

Recreate the `facility_timeline` table. Mostly a copy of the code from notebook 10.

In [15]:
# Cut-off date used as the end of the range for facilities that have no
# closing date or are marked "ongoing".
ongoing_date = arrow.get('2017-06-30','YYYY-MM-DD')

# Month and year of the cut-off as plain integers.
ongoing_date_month = ongoing_date.month
ongoing_date_year  = ongoing_date.year

print(ongoing_date_month,ongoing_date_year)

6 2017


In [16]:
def records_for_range(open_date,close_date,facility_id):
    """
    Build one timeline record per calendar month between two dates.

    Every month from the month containing ``open_date`` through the month
    containing ``close_date`` (both inclusive) yields one dict with the keys
    ``facility_id``, ``year`` and ``month``. Only the year and month of the
    two dates matter; the day of the month is ignored.

    Args:
        open_date: Start of the range. Any object exposing integer ``year``
            and ``month`` attributes works (an Arrow object, a
            ``datetime.date``, ...).
        close_date: End of the range; same requirements as ``open_date``.
        facility_id: Value stored under ``facility_id`` in every record.

    Returns:
        A list of dicts in chronological order. The list is empty when
        ``close_date`` falls in an earlier month than ``open_date``.
    """
    # Number the months consecutively (year * 12 + zero-based month) so the
    # range can be walked with plain integer arithmetic instead of shifting
    # a month-end date and relying on day clamping.
    first_month = open_date.year * 12 + (open_date.month - 1)
    last_month = close_date.year * 12 + (close_date.month - 1)

    records = []
    for month_index in range(first_month, last_month + 1):
        active_year, month_offset = divmod(month_index, 12)
        rec = {"facility_id":facility_id, "year":active_year,"month":month_offset + 1}
        records.append(rec)

    return records

In [17]:
def extract_timeline_from_facility(facility_record):
    """
    Turn one facility row into its monthly timeline records.

    The row provides up to three open/close periods
    (``dateopened``/``dateclosed``, ``reopened1``/``closed1``,
    ``reopened2``/``closed2``). Each period with an opening date is expanded
    with ``records_for_range``; a closing value of ``"ongoing"`` or ``None``
    means the period runs until ``ongoing_date``.

    When there is no first opening date, later periods are not considered:
    a facility whose ``openstatus`` is ``"1"`` gets a single record for the
    month and year of ``ongoing_date``, any other facility gets none.

    Returns:
        A list of ``{"facility_id", "year", "month"}`` dicts (possibly empty).
    """
    periods = [
        (facility_record['dateopened'], facility_record['dateclosed']),
        (facility_record['reopened1'], facility_record['closed1']),
        (facility_record['reopened2'], facility_record['closed2']),
    ]
    openstatus = facility_record['openstatus']
    facility_id = facility_record['id']

    first_open = periods[0][0]
    if not first_open:
        if openstatus == "1":
            # Case: facility has no dates but openstatus is 1
            return [{"facility_id":facility_id,"year":ongoing_date_year,"month":ongoing_date_month}]
        # Case: facility has no dates and is not marked open
        return []

    timeline = []
    # TODO: Check that the reopening dates are not "ongoing"
    for opened, closed in periods:
        if not opened:
            continue

        period_start = arrow.get(opened,"YYYY-MM-DD")

        if closed == "ongoing" or closed is None:
            # Case: the period is still running
            period_end = ongoing_date
        else:
            period_end = arrow.get(closed,"YYYY-MM-DD")

        timeline += records_for_range(period_start,period_end,facility_id)

    return timeline

In [18]:
# Materialize every row of the facilities table into a list.
facilities = list(tab_facilities.find())

In [19]:
# Collect, across all facilities, the records that belong in the timeline table.
all_facility_recs = []
for facility in facilities:
    all_facility_recs.extend(extract_timeline_from_facility(facility))

In [20]:
# Drop the existing facility_timeline table so that it is rebuilt from
# scratch. The handle has to be looked up BEFORE calling drop(): on a fresh
# kernel `tab_timeline` does not exist yet, and dropping through an undefined
# name would fail without ever touching the table, leaving the old rows in
# place underneath the newly inserted ones.
tab_timeline = db['facility_timeline']
tab_timeline.drop()

# Fetch a fresh handle after the drop and load the new timeline records.
tab_timeline = db['facility_timeline']

tab_timeline.insert_many(all_facility_recs)

1928635 records in the full_raw_scrubbed table, without filtering.

```
SELECT COUNT(*) FROM full_raw_scrubbed
WHERE a_file_id IN (
	SELECT files.id
	FROM files 
	JOIN facilities
	ON files.facility_id = facilities.id
	WHERE facilities.aleppo = 1
);
```

23470 records in the full_raw_scrubbed table when filtering to Aleppo facilities.

Create `aleppo_arabic_values` table. Code from notebook 6.

In [21]:
# Handles for the tables involved in building the Aleppo value list.
tab_raw = db['full_raw_scrubbed']
tab_arabic = db['aleppo_arabic_values']
tab_vars = db['variables']

# Variables to leave out of the value queries. Per the original note these
# are the fields whose values have been hashed; the list also names three
# date fields.
fields = [
    "info_name",
    "info_name_author",
    "info_name_caregiver",
    "info_name_facility",
    "info_name_group",
    "info_name_of_coach",
    "info_name_processor",
    "info_name_surgeon",
    "info_phone_skype",
    "date",
    "date_first_exam",
    "death_date"
]

# Sorted distinct normalized variable names, minus the excluded fields.
column_names = sorted(
    r['normalized'] for r in db.query("SELECT DISTINCT(normalized) FROM variables;")
)
column_names = [name for name in column_names if name not in fields]

In [22]:
# For every column, fetch the distinct non-empty values that occur in rows
# belonging to Aleppo facilities. Values that look like numbers are skipped;
# everything else is stored in aleppo_arabic_values with empty translation
# fields.
# The brackets in the SQL are needed because some column names are the same
# as SQL reserved words ('case' is one example); bracketing makes the database
# treat them as column names.

# Clear the table first (delete() is called without a filter).
tab_arabic.delete()

for col in column_names:
    col_values = db.query("""
        SELECT DISTINCT([""" + col + """]) 
        FROM full_raw_scrubbed 
        WHERE [""" + col + """] IS NOT NULL
        AND [""" + col + """] <> '.'
        AND [""" + col + """] <> ''
        AND a_file_id IN (
            SELECT files.id
            FROM files 
            JOIN facilities
            ON files.facility_id = facilities.id
            WHERE facilities.aleppo = 1
        );
        """)
    col_values = [r[col] for r in col_values]

    # Create a table of unique Arabic values
    for v in col_values:
        # A value counts as a number when only digits remain after treating
        # ',' as '.' and removing a single '.'.
        is_number = v.replace(",",".").replace('.','',1).isdigit()
        if not is_number:
            tab_arabic.upsert(
                {"arabic":v,"google_translate":None,"human_translate":None,"normalized":None},
                ['arabic'],
            )

Update the `aleppo_arabic_values` table so that we know which columns the values appear in.

In [23]:
# Create the references: arabic text -> {'id': row id, 'cols': set of the
# column names in which the text occurs}.

arabic_reference = {}

for row in tab_arabic.find():
    arabic_reference[row['arabic']] = {'id':row['id'],'cols':set()}

# This takes a while to run

for col in column_names:
    col_values = db.query("""
        SELECT DISTINCT([""" + col + """]) 
        FROM full_raw_scrubbed 
        WHERE [""" + col + """] IS NOT NULL
        AND [""" + col + """] <> '.'
        AND [""" + col + """] <> ''
        AND a_file_id IN (
            SELECT files.id
            FROM files 
            JOIN facilities
            ON files.facility_id = facilities.id
            WHERE facilities.aleppo = 1
        );
        """)
    col_values = [r[col] for r in col_values]

    # For each value in a given column, make sure our reference dict
    # knows that the arabic_values row is referenced by that column
    for arabic_value in col_values:
        reference = arabic_reference.get(arabic_value)
        if reference is None:
            # The value is not in aleppo_arabic_values because it was not
            # Arabic and did not require translation to English. An explicit
            # lookup is used instead of a catch-all except so that a kernel
            # interrupt or an unrelated error in this long loop is not
            # silently swallowed.
            continue
        reference['cols'].add(col)

# Now take the data we have about which columns contain the relevant
# values and upsert it into the arabic_values table

for row_data in arabic_reference.values():
    data = {'id':row_data['id'],'appears_in':str(sorted(row_data['cols']))}
    tab_arabic.upsert(data,['id'])

In [24]:
# Create a lookup for the full set of Arabic Values:
# arabic text -> {"google": google_translate, "human": human_translate}

tab_arabic_orig = db['arabic_values']

ar_lookup = {}
for source_row in tab_arabic_orig.find():
    ar_lookup[source_row['arabic']] = {
        "google": source_row['google_translate'],
        "human": source_row['human_translate'],
    }

In [25]:
# Copy the existing translations from the full arabic_values lookup onto the
# matching rows of aleppo_arabic_values.
aleppo_updates = []
for row in tab_arabic.find():
    translations = ar_lookup.get(row['arabic'])
    if translations is None:
        # No entry for this value in the full lookup: leave the row as it is.
        # (An explicit lookup replaces the former catch-all except, which
        # would also have hidden unrelated errors.)
        continue

    aleppo_updates.append({
        "id": row['id'],
        "google_translate": translations['google'],
        "human_translate": translations['human'],
    })

# Write the collected translations back, matching rows on 'id'.
for row in aleppo_updates:
    tab_arabic.update(row,['id'])
    

There appear to be some newline characters in the table, so remove them.

In [26]:
# Replace newlines and tabs with spaces and trim the ends in the three text
# columns of every row; the cleaned rows are collected for a later write-back.
aleppo_updates = []

for row in tab_arabic.find():
    for text_field in ('human_translate', 'google_translate', 'arabic'):
        text = row[text_field]
        # Empty / null values are left untouched.
        if text:
            row[text_field] = text.replace("\n"," ").replace("\t"," ").strip()

    aleppo_updates.append(row)
    

In [27]:
# Persist every cleaned row, matching existing rows on 'id'.
for cleaned_row in aleppo_updates:
    tab_arabic.update(cleaned_row,['id'])

In [28]:
# Copy the database over as the template for the next file.
# copy2 overwrites an existing sams_data_phase12_template.sqlite, and its
# return value (the destination path) is displayed as the cell output.

# Do not rerun this cell!
shutil.copy2('sams_data_phase11.sqlite','sams_data_phase12_template.sqlite')

'sams_data_phase12_template.sqlite'