In [4]:
# Manipulate the file system
import os
import shutil
import datetime
import arrow

# Convert stored string representation of a list to a list
import ast

# Recurse through a directory tree and return file names with glob
import glob

# Decode and re-encode mangled Arabic file names
import codecs

# Connect to a SQLite database in a lazy manner.
import dataset

# Enables opening and reading of Excel files
import openpyxl

# Translating variables, sheet names, and workbook names from Arabic
# This is NOT free to use.
from google.cloud import translate

# Point the Google client libraries at the service-account key file.
# A GOOGLE_APPLICATION_CREDENTIALS value that is already set in the
# environment takes precedence, so the notebook also runs on machines where
# the key lives somewhere else; the path below is only the fallback.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    'C:\\Users\\clay\\Documents\\fxb-lcs-2b24f4f8a73a.json'
)

In [None]:
# Reset step, intentionally left commented out.
# If there's an existing working db for this phase, delete it
# so that we can copy from the template for a fresh start.
# Uncomment both blocks below to perform the reset.

# try:
#     os.remove("sams_data_phase03.sqlite")
#     print("Removed template clone sams_data_phase03.sqlite")
# except:
#     pass

# try:
#     # Copy (rather than move) the template, so the template itself is
#     # preserved in case there is a problem and the db has to be restored
#     shutil.copy2("sams_data_phase03_template.sqlite","sams_data_phase03.sqlite")
    
#     print("Created database from template: sams_data_phase03.sqlite")
# except:
#     pass

In [5]:
# Connect to the working copy of the phase 03 SQLite database.
database_url = "sqlite:///sams_data_phase03.sqlite"
db = dataset.connect(database_url)

The database created from the phase03 template contains a lot of manual changes to fix incorrect header references across a variety of files. The first task with this copy of the database continues the manual work, using SQLite Manager in Firefox. 

----

The procedure here is to fix header references or mark files as `ignore` in the `files` table. Delete all of the variables, recreate by iterating through the files, then check again for problems.

The code to delete and recheck is all in the following cells. Note that it may take a while to run, but you should be able to run it as many times as necessary as the database is updated to fix header references.

In [3]:
# Handles to the tables this notebook reads or rebuilds.
tab_files = db['files']
tab_files_sheets = db['files_sheets']
tab_vars = db['variables']
tab_files_sheets_vars = db['files_sheets_vars']
tab_files_vars = db['files_variables']
tab_sheets_vars = db['sheets_variables']

# Start from a clean slate: the variable table and the three link tables
# are rebuilt from the header references further down.
for stale_table in (tab_vars, tab_files_sheets_vars, tab_files_vars, tab_sheets_vars):
    stale_table.drop()

# Every (file, sheet) pair that is neither skipped nor ignored, ordered so
# that all sheets of one file are adjacent.
#
# The result is materialised into a list on purpose: db.query() returns a
# one-shot iterator, which would leave the processing loop with nothing to
# do on a second run, and would keep a read cursor open while that loop
# writes to files_sheets.
recs_to_process = list(db.query("""
SELECT files_sheets.id AS files_sheets_id, 
       files_sheets.file_id, 
       files_sheets.sheet_id, 
       files_sheets.header_start, 
       files_sheets.header_end,
       files.path AS file_path,
       sheets.name AS sheet_name
FROM files_sheets
JOIN files ON files_sheets.file_id = files.id
JOIN sheets ON files_sheets.sheet_id = sheets.id
WHERE sheets.skip = 0
AND files.ignore = 0
ORDER BY file_id, sheet_id;
"""))

# Column letters A-Z, indexed from 0.
letter_lookup = ['A','B','C','D','E','F','G','H','I','J','K',
                 'L','M','N','O','P','Q','R','S','T','U','V',
                 'W','X','Y','Z']

# Distinct header values found across all processed sheets.
variables = set()

In [4]:
# Read the header row of every (file, sheet) record selected above, collect
# the distinct header values in `variables`, and store each sheet's header
# values back on its `files_sheets` record.

working_file_id = -1      # file_id the current workbook state belongs to
active_file_path = None
active_workbook = None    # None while no workbook is open (or the open failed)

for rec in recs_to_process:

    # This only fires with a new file_id (records arrive ordered by file_id)
    if rec['file_id'] != working_file_id:
        # Read-only workbooks keep their file handle open until closed
        if active_workbook is not None:
            active_workbook.close()
            active_workbook = None

        working_file_id = rec['file_id']
        active_file_path = rec['file_path']
        try:
            # guess_types is deliberately not passed: False was its default,
            # and newer openpyxl releases no longer accept the argument.
            active_workbook = openpyxl.load_workbook(active_file_path,
                                                     read_only=True,
                                                     data_only=True)
        except Exception as exc:
            # Report the actual cause; working_file_id stays on this file so
            # its remaining sheets are skipped instead of retrying the open.
            print("Unable to open", active_file_path, "-", exc)
            active_workbook = None

    # The workbook for this file could not be opened: skip all of its sheets
    if active_workbook is None:
        continue

    # Process the active file
    sheet_name = rec['sheet_name']
    # header_range, header_data = headers_from_worksheet(active_workbook,sheet_name)
    header_start = rec['header_start']
    header_end = rec['header_end']

    # No usable header reference for this sheet: leave the record untouched
    if header_start is None or header_end is None:
        continue
    if "PROBLEM" in header_start or "PROBLEM" in header_end:
        continue

    try:
        worksheet = active_workbook[sheet_name]
    except KeyError:
        print("Sheet", sheet_name, "not found in", active_file_path)
        continue

    # Slicing by cell references yields a tuple of rows; the header is the
    # first row. An empty range yields no rows at all.
    header_cells = worksheet[header_start:header_end]
    header_row = header_cells[0] if header_cells else ()

    fixed_header_data = []
    for cell in header_row:
        value = cell.value
        # Date headers are stored as plain YYYY-MM-DD strings
        if isinstance(value, datetime.datetime):
            value = arrow.get(value).format("YYYY-MM-DD")
        fixed_header_data.append(value)
        variables.add(value)

    update_rec = {
        "id": rec['files_sheets_id'],
        "header_start": header_start,
        "header_end": header_end,
        # Stored as the string form of the list; read back with ast.literal_eval
        "header_values": str(fixed_header_data)
    }

    tab_files_sheets.update(update_rec, ['id'])

# Release the file handle of the last workbook
if active_workbook is not None:
    active_workbook.close()
    active_workbook = None

In [5]:
# Translate every distinct header value to English and store it in a freshly
# rebuilt `variables` table (columns: orig, clean, translation, normalized).
translate_client = translate.Client()
target_lang = 'en'

# Drop any previous version of the table; a failure here is not fatal
# because the table is re-created on the first insert.
try:
    tab_vars.drop()
except Exception as exc:
    print("Could not drop the variables table:", exc)

tab_vars = db['variables']

for v in variables:
    # Blank header cells are collected as None. There is nothing to
    # translate, and the linking step skips None headers anyway, so don't
    # spend a (paid) API call on the string "None".
    if v is None:
        continue

    try:
        v_str = str(v)
        clean = v_str.replace("\n"," ").replace("\\","").replace("\t"," ").strip()
    except Exception:
        print("Could not process",v)
        continue

    # NOTE: the raw string (not `clean`) is what gets translated
    translation = translate_client.translate(v_str,target_language=target_lang)

    rec = {
        "orig":v,
        "clean":clean,
        "translation":translation['translatedText'],
        "normalized":""
    }
    try:
        tab_vars.insert(rec)
    except Exception as exc:
        print("\nFailure to insert")
        print(rec)
        print(exc)
        
# Drop the three link tables before rebuilding them. Each drop is attempted
# on its own so that a failure on one table does not leave the others stale.
for link_table_name in ('files_variables', 'sheets_variables', 'files_sheets_vars'):
    try:
        db[link_table_name].drop()
    except Exception as exc:
        print("Could not drop", link_table_name, "-", exc)

tab_files_vars = db['files_variables']
tab_sheets_vars = db['sheets_variables']
tab_files_sheets_vars = db['files_sheets_vars']

# Map the string form of each original header value to its variable id.
# This might cause a problem casting to str: distinct values that share a
# string form (e.g. 1 and "1") collapse onto one key.
var_lookup = {str(var_rec['orig']): var_rec['id'] for var_rec in tab_vars.find()}

# Distinct (id, var_id) pairs for each of the three link tables
files_vars_set = set()
sheets_vars_set = set()
files_sheets_vars_set = set()


# Link every usable header value to its variable id, collecting the distinct
# (file, var), (sheet, var) and (files_sheets, var) pairs.
# Files flagged as skipped or ignored are left out before their headers are
# processed.

file_rec_cache = {}  # file_id -> files record, so each file is fetched once

for rec in tab_files_sheets.find():
    header_start = rec['header_start']
    header_end = rec['header_end']

    file_id = rec['file_id']
    sheet_id = rec['sheet_id']
    files_sheets_id = rec['id']

    if file_id not in file_rec_cache:
        file_rec_cache[file_id] = tab_files.find_one(id=file_id)
    file_rec = file_rec_cache[file_id]

    if file_rec is None:
        print("No files record for file_id", file_id)
        continue

    # Truthiness rather than `is True`: SQLite may hand the flags back as
    # the integers 0/1, which are never identical to True.
    if file_rec['skipped'] or file_rec['ignore']:
        continue

    if header_start is None or header_end is None:
        continue

    if '(PROBLEM)' in header_start or '(PROBLEM)' in header_end:
        continue

    header_values = rec['header_values']
    if header_values is None:
        continue

    # header_values holds the string form of a Python list
    try:
        header_values = ast.literal_eval(header_values)
    except (ValueError, SyntaxError) as exc:
        print("Unable to parse header_values for files_sheets id", files_sheets_id, "-", exc)
        continue

    for header in header_values:
        if header is None:
            continue

        # Lookup keys are the string form of the original values
        var_id = var_lookup.get(str(header))
        if var_id is None:
            print("Unable to find",header)
            continue

        files_vars_set.add((file_id,var_id))
        files_sheets_vars_set.add((files_sheets_id,var_id))
        sheets_vars_set.add((sheet_id,var_id))
        
# Turn the collected (id, var_id) pairs into records and bulk insert each
# list into its link table.

tab_files_vars_recs = [
    {"file_id": linked_id, "var_id": linked_var_id}
    for linked_id, linked_var_id in files_vars_set
]
tab_files_vars.insert_many(tab_files_vars_recs)

tab_sheets_vars_recs = [
    {"sheet_id": linked_id, "var_id": linked_var_id}
    for linked_id, linked_var_id in sheets_vars_set
]
tab_sheets_vars.insert_many(tab_sheets_vars_recs)

tab_files_sheets_vars_recs = [
    {"files_sheets_id": linked_id, "var_id": linked_var_id}
    for linked_id, linked_var_id in files_sheets_vars_set
]
tab_files_sheets_vars.insert_many(tab_files_sheets_vars_recs)

# Some sheets carry header references although the referenced range is
# blank (their stored header_values is the string '[]'). Clear the header
# columns of those records in the database.
recs = tab_files_sheets.find()

# Collect every affected record first; the updates are applied only after
# the result set has been read completely.
update_recs = [
    {
        "id": sheet_rec['id'],
        "header_start": None,
        "header_end": None,
        "header_values": None
    }
    for sheet_rec in recs
    if sheet_rec['header_values'] == '[]'
]

for blank_rec in update_recs:
    tab_files_sheets.update(blank_rec, ['id'])

Clear `header_values` from `files_sheets` before running the block above.


AT THIS POINT... I'm ready to do manual variable consolidation. That's the final step prior to exporting the data from the sheets into the database, under the consolidated variables. There are currently 355 variables identified across the files, and they need to be consolidated into as few as possible. Note that it's important to consider whether they are all "patient log" variables, or how many of them are "pharmacy log" variables, etc. If a variable is present in multiple tables, should it be prefixed with `pl_` for patient log or something like that? How should the case be handled when two variables are consolidated into the same field but appear in the same record? It is probably best to just comma-separate the values and put them in the same column. It will be important to track where those cases appear, because sometimes they will likely need to be broken out into multiple columns.

Now that the manual work is done to try to give sane names to the variables, it's time to save the database out as a template for the next notebook.

In [6]:
# Do not rerun this cell! Running the copy again would overwrite the
# phase04 template with the current state of the working database.
# shutil.copy2('sams_data_phase03.sqlite','sams_data_phase04_template.sqlite')

'sams_data_phase04_template.sqlite'