In [1]:
# Manipulate the file system
import os
import shutil

# Copy dictionaries
import copy

# Regular expressions
import re

# Display errors in realtime
import ipywidgets as widgets

# work with dates
import datetime
import arrow
import time

# For scrubbing PII
import hashlib

# Convert stored string representation of a list to a list
import ast

# Recurse through a directory tree and return file names with glob
import glob

# Decode and re-encode mangled Arabic file names
import codecs

# Connect to a SQLite database in a lazy manner.
import sqlalchemy
import dataset

# Enables opening and reading of Excel files
import openpyxl

# Read and write CSV files
import csv

# Translating variables, sheet names, and workbook names from Arabic
# This is NOT free to use.
from google.cloud import translate

# Point the Google Cloud client at the service-account key file.
# setdefault keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already
# present in the environment, so the notebook can run on another machine
# without editing this cell; the path below is only the local fallback.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    'C:\\Users\\clay\\Documents\\fxb-lcs-2b24f4f8a73a.json',
)

In [2]:
# Start from a fresh working database on every run: delete any existing
# working copy, then clone the template.

try:
    os.remove("sams_data_phase09.sqlite")
    print("Removed template clone sams_data_phase09.sqlite")
except FileNotFoundError:
    # Nothing to delete: there is no previous working copy.
    pass
except OSError as err:
    # Any other failure (e.g. permissions) is reported instead of hidden.
    print("Could not remove sams_data_phase09.sqlite:", err)

try:
    shutil.copy2("sams_data_phase09_template.sqlite","sams_data_phase09.sqlite")
    print("Created database from template: sams_data_phase09.sqlite")
except OSError as err:
    # Report the failure: without this copy the following cells are not
    # working on a fresh clone of the template.
    print("Could not create sams_data_phase09.sqlite from template:", err)

Removed template clone sams_data_phase09.sqlite
Created database from template: sams_data_phase09.sqlite


In [3]:
# Connect to the working SQLite file; `db` is the handle the later cells use
# to reach the `locations`, `facilities` and `files` tables.
db = dataset.connect("sqlite:///sams_data_phase09.sqlite")

In [4]:
# Files with the facilities data to reconcile.
# Both are relative paths, so they are resolved against the working directory.
path_uossm = "facilities_uossm.csv"
path_sams = "facilities_sams.csv"

Import the facilities data from both files, reconcile it in memory, then create the appropriate tables in the database.

In [5]:
def extract_from_csv(csvfile,org_name):
    """
    Clean the incoming facility data and structure it for push into database.

    Parameters
    ----------
    csvfile : iterable of str
        Open text file (or any iterable of lines) holding CSV data whose
        first row is the header.
    org_name : str
        Stored under the ``organization`` key of every returned record.

    Returns
    -------
    list of dict
        One dict per non-blank data row, keyed by the normalised header
        (lower-cased, surrounding whitespace removed, inner whitespace
        collapsed to ``_``). Blank cells become ``None``. Values in columns
        whose name contains ``open``, ``close`` or ``date`` (but not
        ``status``) are rewritten from ``D/M/YYYY`` to ``YYYY-MM-DD`` when
        they parse; anything else, including the literal ``ongoing``, is
        kept unchanged. An empty input yields an empty list.
    """
    csvreader = csv.reader(csvfile)
    header = next(csvreader, None)
    if header is None:
        # Completely empty input: there is no header and nothing to extract.
        return []

    # A byte-order mark is not whitespace, so strip() alone would leave it
    # glued to the first column name when the file is opened as plain 'utf-8'.
    header = ["_".join(e.lstrip('\ufeff').lower().strip().split()) for e in header]

    data_holder = []
    for row in csvreader:
        if not row:
            # csv.reader yields [] for a blank line; skip it rather than
            # emit a record that carries nothing but the organization.
            continue

        rowdict = dict(zip(header,row))
        # Only existing keys are reassigned below, so iterating items() is safe.
        for k, value in rowdict.items():
            # Convert empty strings to null values
            if value.strip() == '':
                rowdict[k] = None
                continue

            # Try to convert dates to the standard format of YYYY-MM-DD
            is_date_column = ('open' in k or 'close' in k or 'date' in k) and 'status' not in k
            if is_date_column and value != 'ongoing':
                try:
                    rowdict[k] = arrow.get(value,'D/M/YYYY').format("YYYY-MM-DD")
                except Exception:
                    # Best effort: values that are not D/M/YYYY stay as read.
                    # Exception (not a bare except) lets KeyboardInterrupt
                    # and SystemExit propagate.
                    pass

        rowdict['organization'] = org_name
        data_holder.append(rowdict)
    return data_holder

In [6]:
# Read in the SAMS facility data
# Every record returned is tagged with organization "SAMS".
data_sams = []

with open(path_sams,'r',encoding='utf-8') as csvfile:
    data_sams = extract_from_csv(csvfile,"SAMS")

In [7]:
# Read in the UOSSM facility data
# Every record returned is tagged with organization "UOSSM".
data_uossm = []

with open(path_uossm,'r',encoding='utf-8') as csvfile:
    data_uossm = extract_from_csv(csvfile,"UOSSM")

In [8]:
# Combine the data sources
# SAMS records come first, then UOSSM; the `organization` key on each record
# still tells the two sources apart.
data = data_sams + data_uossm

Since I don't yet know how our geographic tracking of these facilities may change, I'm creating a simple self-referencing `locations` table that stores the locations as a graph: each row points to its parent location.

Currently have: `country -> governorate -> district -> subdistrict`

In [9]:
# Handle to the `locations` table that the following cells populate.
tab_locations = db['locations']

In [10]:
# Build one row per distinct place name at each administrative level.
# parent_id is left empty here and is filled in once the rows have ids.
def _location_rows(level):
    """Return the insert-ready rows for every distinct `level` name in `data`."""
    names = {r[level] for r in data}
    # Sort (missing names first) so the rows are produced in the same order on
    # every run instead of in arbitrary set order.
    ordered = sorted(names, key=lambda name: (name is not None, name or ""))
    return [{"location_name":name,"level":level,"parent_id":None} for name in ordered]


countries = _location_rows("country")
governorates = _location_rows("governorate")
districts = _location_rows("district")
subdistricts = _location_rows("subdistrict")

In [11]:
# Insert the rows level by level, from the broadest level down.
# Re-running this cell inserts the same rows a second time.
for level_rows in (countries, governorates, districts, subdistricts):
    tab_locations.insert_many(level_rows)

In [12]:
# level -> {location_name -> id}, filled from the rows now stored in the table
location_lookup = {
    level_name: {}
    for level_name in ("country", "governorate", "district", "subdistrict")
}
locs = list(tab_locations.find())

for loc in locs:
    names_for_level = location_lookup[loc['level']]
    names_for_level[loc['location_name']] = loc['id']

In [13]:
# Every distinct (country, governorate, district, subdistrict) path in the data
locations = {
    (d['country'], d['governorate'], d['district'], d['subdistrict'])
    for d in data
}

In [14]:
# Mirror each location path as a list of ids, in the same
# country -> governorate -> district -> subdistrict order.
level_order = ("country", "governorate", "district", "subdistrict")
locations_ids = [
    [location_lookup[level][name] for level, name in zip(level_order, path)]
    for path in locations
]

In [15]:
# Link every location to the one directly above it in its path.
for place in locations_ids:
    # Walk from the deepest level (position 3) up to position 1;
    # position 0, the country, keeps parent_id empty.
    for child_pos in reversed(range(1, 4)):
        tab_locations.update(
            {"id": place[child_pos], "parent_id": place[child_pos - 1]},
            ['id'],
        )

Now that the locations are in the table, we need to create a facilities table and map the locations to the facilities. The tracking of the open/closed status of the facilities needs to eventually be broken out in a more robust manner, but for now, to expedite translation work, the facilities list will be used to reduce the raw file data and export a minified Arabic Values table.

In [16]:
# Attach the location ids and two helper columns to every facility record.
for rec in data:
    for level in ("country", "governorate", "district", "subdistrict"):
        rec[level + "_id"] = location_lookup[level][rec[level]]

    # Flag for facilities located in the Aleppo governorate
    rec["aleppo"] = 1 if rec['governorate'] == "Aleppo" else 0

    # Deduplication of facilities will happen through parenting
    rec["facility_parent_id"] = None

In [17]:
# Insert the records into a facilities table.
# An existing table is looked up by name through `db` and dropped first, so
# the cell starts from an empty table on a fresh kernel as well as on a re-run
# (it does not depend on a `tab_facilities` variable left over in the kernel).
if 'facilities' in db.tables:
    db['facilities'].drop()

tab_facilities = db['facilities']

for rec in data:
    tab_facilities.insert(rec)

Load a file that helps assign facilities to files.

In [27]:
f = "facilities_tagged.csv"

tagged = []

# utf-8-sig drops a leading byte-order mark, if the file has one
with open(f,'r',encoding='utf-8-sig') as csvfile:
    csvreader = csv.reader(csvfile)
    header = next(csvreader)
    print(header)
    for cells in csvreader:
        record = dict(zip(header,cells))
        # A blank facility_id is stored as None
        record['facility_id'] = record['facility_id'] or None
        # Backslash-separated components of the file path
        record['path_parts'] = record['file_path'].split("\\")
        tagged.append(record)

['id', 'file_name', 'ungarbled', 'translation', 'facility_id', 'possible_problem', 'file_path']


In [28]:
# Map the fifth path component of each tagged file to its facility id.
# NOTE(review): presumably that component is the per-site folder — confirm.
# When several rows share a component, the last one read wins.
site_dict = {
    r['path_parts'][4]: r['facility_id']
    for r in tagged
    if r['facility_id']
}

In [32]:
# Give every file the facility id recorded for its fifth path component.
for row in tagged:
    try:
        row['facility_id'] = site_dict[row['path_parts'][4]]
    except (KeyError, IndexError):
        # KeyError: the component has no entry in site_dict.
        # IndexError: the path has fewer than five components.
        # Either way the row keeps the facility_id it already has.
        pass

In [34]:
# Rows that still have no facility assigned
untagged = [row for row in tagged if row['facility_id'] is None]

In [36]:
# Distinct fifth path components among the untagged rows
to_find = {row['path_parts'][4] for row in untagged}

In [41]:
# Hand-maintained mapping from a path component to a facility id.
# Ids are strings, like the values read from facilities_tagged.csv;
# None marks a component with no facility assigned.
hydrate = {
 'Al Mlolah PHC': None,
 'Al Salam FH': '117',
 'AlMlolah PHC': None,
 'AlSakhour Hospital': None,
 'Albatraneh': None,
 'Albatraneh PHC': None,
 'Almlolah': None,
 'Almlolah PHC': None,
 'Alsalam Almaara': '115',
 'Bab Al Hawa': '125',
 'Bab Al Hawa DU': '124',
 'Bab Alhwa Dialysis Unit': '124',
 'Dec.xlsx': None,
 'Dental 4.xlsx': None,
 'Dental Rif Hama': None,
 'Dental.xlsx': None,
 'Ejaz Mobile Hospital': None,
 'Ejaz PHC': None,
 'Hama MC': '88',
 'Hama Mc': '88',
 'Hama Mobile Clinic': '88',
 'Hama Mobile clinic': '88',
 'Idleb FH': '127',
 'Idlib FH': '127',
 'Idlib field Hospital': '127',
 'Idlib trama': '127',
 'Ijaz': None,
 'Ijaz MH': None,
 'Ijaz Mobile Hospital': None,
 'Ijaz PHC': None,
 'Jabal Alzawia mobile clinicdental': '129',
 'Jabal Alzawieh': '129',
 'Jisr Al shgur': '130',
 'Kafranbouteh PHC': '92',
 'Kfrnbotheh': '92',
 'M10 Trauma': '13',
 'M3': '21',
 'M3 PHC': '20',
 'Mara Dialysis Center': None,
 'Maree ObGyn': None,
 'Mobile hospital': None,
 'Nov.xlsx': None,
 'Oct.xlsx': None,
 'Osama Albarodi PHC': '94',
 'Patient log 2, Al Rahma FH.xlsx': '274',
 'Patient log, Al Rahma FH.xlsx': '274',
 'Patient log, Al Rahma Fh.xlsx': '274',
 'Patient log.xlsx': None,
 'Physical Rehab.xlsx': None,
 'Physical rehab.xlsx': None,
 'Qastun PHC': '87',
 'Rural Hama MC': '88'}

# Merge into site_dict; for a key present in both, the hydrate value wins.
site_dict = {**site_dict, **hydrate}

In [53]:
# Clean the stored path, then re-apply the (now extended) site_dict.
for row in tagged:
    # Remove newline characters from the path before it is exported
    row['file_path'] = row['file_path'].replace("\n",'')
    try:
        row['facility_id'] = site_dict[row['path_parts'][4]]
    except (KeyError, IndexError):
        # KeyError: the component has no entry in site_dict.
        # IndexError: the path has fewer than five components.
        # Either way the row keeps the facility_id it already has.
        pass

In [54]:
# The ids were read from CSV as text; store them as integers
for row in tagged:
    row.update(id=int(row['id']))

In [55]:
# Rows for the CSV export, starting with the column names.
# `header` is the header row read from facilities_tagged.csv; a later cell
# rebinds that name, so these cells must be run in order.
to_write = [header]

from operator import itemgetter

# `id` holds integers at this point, so the sort is numeric
sorted_output = sorted(tagged, key=itemgetter('id')) 

In [56]:
# Append one output row per record, using the header's column order and
# leaving out any column whose name contains '_parts'.
# Re-running this cell appends the same rows again.
export_columns = [k for k in header if '_parts' not in k]
to_write.extend([row[k] for k in export_columns] for row in sorted_output)

In [58]:
# Write the header and the sorted rows; newline='' leaves line endings to the csv module
with open('hydrated_file_facilities.csv','w',encoding='utf-8',newline='') as outfile:
    csv.writer(outfile).writerows(to_write)

The data was exported and joined back to the export of the `files` table. The file imported below contains the `id` from `files` and the `facility_id` from `facilities`; I'll use these values to update the `files` table.

Note: If corrections are made to the exported table moving forward, the proper process will be to drop and recreate the `files` table with the updated data and would be best done in a separate notebook.

In [66]:
# Load the (id, facility_id) pairs used to update the `files` table.
update_data = []
with open('bind_files_to_facilities.csv','r',encoding='utf-8-sig') as csvfile:
    csvreader = csv.reader(csvfile)
    header = next(csvreader)
    print(header)
    for row in csvreader:
        # Pair each cell with its column name before discarding blanks;
        # dropping blanks first would shift later values under the wrong name.
        rowdict = {k: int(e) for k, e in zip(header,row) if e != ''}
        # An update needs both the key to match on and the value to write.
        if 'id' in rowdict and 'facility_id' in rowdict:
            update_data.append(rowdict)

['id', 'facility_id']


In [68]:
# Handle to the `files` table that receives the facility ids.
tab_files = db['files']

In [69]:
# Apply each {id, facility_id} pair to `files`, with 'id' as the key column
# that identifies which row to update.
for row in update_data:
    tab_files.update(row,['id'])

In [70]:
# Copy the database over as the template for the next file.
# This Notebook did not include manual editing of the data.

# Do not rerun this cell!
# NOTE(review): a rerun would replace an existing
# sams_data_phase10_template.sqlite with the current phase 09 database.
shutil.copy2('sams_data_phase09.sqlite','sams_data_phase10_template.sqlite')

'sams_data_phase10_template.sqlite'