### Important Setup Note

Prior to this notebook, the variables were normalized to provide a structure for importing data from all of the sheets. As a consequence, if the variable reduction mapping has improperly combined variables, or if the reduction is incomplete, the template for this level will need to be changed to accommodate the new variable mapping.

In [1]:
# Manipulate the file system
import os
import shutil

# Copy dictionaries
import copy

# work with dates
import datetime
import arrow

# Convert stored string representation of a list to a list
import ast

# Recurse through a directory tree and return file names with glob
import glob

# Decode and re-encode mangled Arabic file names
import codecs

# Connect to a SQLite database in a lazy manner.
import sqlalchemy
import dataset

# Enables opening and reading of Excel files
import openpyxl

# Translating variables, sheet names, and workbook names from Arabic
# This is NOT free to use.
from google.cloud import translate

# Set the environment variable for the Google Service Account.
# setdefault keeps a value that is already configured in the environment, so the
# notebook can run on other machines; the path below is only the local fallback.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', 'C:\\Users\\clay\\Documents\\fxb-lcs-2b24f4f8a73a.json')

In [2]:
# If there's an existing db for this sheet, delete it
# so that we can copy from the template for a fresh start
try:
    os.remove("sams_data_phase04.sqlite")
    print("Removed template clone sams_data_phase04.sqlite")
except FileNotFoundError:
    # Nothing to clean up: no working copy exists yet (e.g. on the first run).
    pass

# Start from a pristine copy of the template. A failure here is deliberately
# allowed to raise: if it were swallowed, the later cells would run against a
# missing database file instead of the tables stored in the template.
shutil.copy2("sams_data_phase04_template.sqlite","sams_data_phase04.sqlite")

print("Created database from template: sams_data_phase04.sqlite")

Removed template clone sams_data_phase04.sqlite
Created database from template: sams_data_phase04.sqlite


In [3]:
# Open the working copy of the phase 4 database
db_url = "sqlite:///sams_data_phase04.sqlite"
db = dataset.connect(db_url)

The first attempt to import the data will be into a single table. If it becomes necessary to break the data out into multiple tables, then that work will be done after evaluating the original import.

Because the bulk insert is orders of magnitude faster than inserting row by row, we want to use it. To create the table without causing problems with that insert, we'll create the fields manually by iterating through all of the normalized names in the variables table.

In [4]:
# Handles for the three tables this notebook works with: the import target,
# the variable-name mapping, and the per-sheet bookkeeping table
tab_all, tab_vars, tab_files_sheets = (
    db[table_name] for table_name in ('full_raw_data', 'variables', 'files_sheets')
)

In [5]:
# Fetch each distinct normalized variable name once, then put them in sorted order
recs = db.query("SELECT DISTINCT(normalized) AS var FROM variables;")
variables = [row['var'] for row in recs]
variables.sort()

In [6]:
# Integer bookkeeping columns that tie each imported row back to its source
for id_column in ("a_file_id", "a_files_sheets_id", "a_sheet_id"):
    tab_all.create_column(id_column, sqlalchemy.Integer)

# One string column per normalized variable name
for var_name in variables:
    tab_all.create_column(var_name, sqlalchemy.String)

The table is ready for the data.

In [7]:
# In-memory lookup: original variable name -> normalized variable name
var_lookup = {row['orig']: row['normalized'] for row in tab_vars.find()}

In [8]:
# Select every file/sheet pairing that is not flagged to be skipped or ignored and
# whose header range was located, along with the file path and sheet name needed
# to open it. Rows are ordered by file so each workbook only has to be opened once.
#
# The result is materialized into a list because db.query() hands back a one-shot
# iterator: left lazy, the import cell could only be run once per query, and it
# would be reading from an open cursor while writing status updates to the same
# database.
sheets_to_process = list(db.query("""
SELECT files_sheets.id AS files_sheets_id, 
       files_sheets.file_id, 
       files_sheets.sheet_id, 
       files_sheets.header_start, 
       files_sheets.header_end,
       files_sheets.header_values,
       files.path AS file_path,
       sheets.name AS sheet_name
FROM files_sheets
JOIN files ON files_sheets.file_id = files.id
JOIN sheets ON files_sheets.sheet_id = sheets.id
WHERE sheets.skip = 0

AND files.ignore = 0
AND files_sheets.header_start IS NOT NULL
AND files_sheets.header_start <> 'PROBLEM'
AND files_sheets.header_end NOT LIKE '%PROBLEM%'
ORDER BY file_id, files_sheets_id;
"""))

In [9]:
# Import every selected sheet into the full_raw_data table, one bulk insert per
# sheet, and record the outcome of each sheet in files_sheets.import_status.

# Columns ignored when deciding whether a row is empty ("passover columns").
# 'number' is a good example because there are sheets where somebody dragged
# numbers down a column in preparation for a lot of data but never actually
# used all of the numbered rows.
PASSOVER_COLUMNS = ('number',)

# Stored in place of NULL for an empty cell, so that we can tell which fields
# were available for the record.
EMPTY_MARKER = '.'


def _split_cell_ref(cell_ref):
    """Split a cell reference such as 'AB12' into its column letters and row number.

    Returns a (column_letters, row_number) tuple, e.g. ('AB', 12). Works for
    columns beyond 'Z'. Raises ValueError when there is no trailing row number.
    """
    column_letters = cell_ref.rstrip('0123456789')
    return column_letters, int(cell_ref[len(column_letters):])


def _cell_to_text(cell_value):
    """Return a cell value as text ready for the database, or None for an empty cell.

    Strings are stripped of surrounding whitespace. datetime objects cannot be
    written to the database unless they are first converted to strings, so they
    are formatted with openpyxl's W3CDTF helper; every other non-empty value is
    passed through str().
    """
    if isinstance(cell_value, str):
        cell_value = cell_value.strip()

    if isinstance(cell_value, datetime.datetime):
        return openpyxl.utils.datetime.datetime_to_W3CDTF(cell_value)
    if cell_value is not None:
        return str(cell_value)
    return None


def _record_status(files_sheets_id, status):
    """Write the import status of one files_sheets row back to the database."""
    tab_files_sheets.update({"id":files_sheets_id,"import_status":status},["id"])


# Only reopen files when necessary
working_file_id = -1
active_file_path = None
active_workbook = None


for rec in sheets_to_process:
    import_status = ""

    if rec['file_id'] > working_file_id:
        working_file_id = rec['file_id']
        active_file_path = rec['file_path']
        try:
            active_workbook = openpyxl.load_workbook(active_file_path,read_only=True,guess_types=False,data_only=True)
        except Exception as exc:
            # Record the failure itself, so unreadable files are not mistaken for imported ones
            import_status = "Unable to open file"
            _record_status(rec['files_sheets_id'], import_status)
            print("Unable to open",active_file_path,repr(exc))

            # Reset so the next sheet of this file triggers another open attempt
            active_workbook = None
            active_file_path = None
            working_file_id = -1
            continue

    # Process the active file
    sheet_name = rec['sheet_name']
    header_start = rec['header_start']
    header_end = rec['header_end']

    # Unable to find a header in this sheet. Mark the record
    if header_start is None or header_end is None or "PROBLEM" in header_start or "PROBLEM" in header_end:
        import_status = "skipped"
        _record_status(rec['files_sheets_id'], import_status)
        continue

    # Defined up front so the failure report below can always print them
    data_start = None
    data_end = None
    data_range_string = None

    # Everything that can fail for a single sheet happens inside this try, so one
    # bad sheet is reported and marked without aborting the rest of the run.
    try:
        worksheet = active_workbook.get_sheet_by_name(sheet_name)
        last_row = worksheet.max_row
        success_status = "imported"

        # Sometimes worksheet.max_row doesn't return a value
        if last_row is None:
            success_status = "imported: last row None"
            last_row = 10000

        # The data begins on the row after the header and spans the header's columns
        start_column, header_row = _split_cell_ref(header_start)
        end_column, _ = _split_cell_ref(header_end)
        data_start = start_column + str(header_row + 1)
        data_end = end_column + str(last_row)
        data_range_string = data_start + ":" + data_end

        # These are stored as a list converted to a string. Convert back to a list for enumeration
        header_values = ast.literal_eval(rec['header_values'])
        sheet_data = []

        for datarow in worksheet[data_range_string]:
            record = {}
            for idx,cell in enumerate(datarow):
                # Get the normalized value
                header_val = var_lookup[header_values[idx]]
                cell_value = _cell_to_text(cell.value)

                existing = record.get(header_val)
                if existing is None or existing == EMPTY_MARKER:
                    # First value for this field, or only empty cells so far.
                    # Keep the marker (never NULL) when this cell is empty too.
                    record[header_val] = EMPTY_MARKER if cell_value is None else cell_value
                elif cell_value is not None:
                    # Several source columns map to the same normalized variable:
                    # keep every value, comma separated
                    record[header_val] = existing + ", " + cell_value

            sheet_data.append(record)

        # Remove from sheet_data blank rows
        rich_sheet_data = []
        for staged in sheet_data:
            is_blank = all(
                value in (None, EMPTY_MARKER, '')
                for key, value in staged.items()
                if key not in PASSOVER_COLUMNS
            )
            if is_blank:
                continue

            staged['a_file_id'] = rec['file_id']
            staged['a_files_sheets_id'] = rec['files_sheets_id']
            staged['a_sheet_id'] = rec['sheet_id']

            rich_sheet_data.append(staged)

        # Try to perform a bulk insert
        tab_all.insert_many(rich_sheet_data)

        # Update the status of the worksheet
        import_status = success_status
        _record_status(rec['files_sheets_id'], import_status)

    except Exception as exc:
        import_status = "failed"

        print("\n--------------------------------------------------------------------------")
        print("Failure")
        print(repr(exc))
        print("file_id",rec['file_id'],"files_sheets_id",rec['files_sheets_id'],"sheet_id",rec['sheet_id'])
        print(active_file_path,sheet_name)
        print("Header range:",header_start,header_end)
        print("Data range:",data_start,data_end)
        print("data_range_string",data_range_string)

        _record_status(rec['files_sheets_id'], import_status)

At this point, the first ETL pass is complete. Save the database out as a template and move to notebook 5 for additional processing.

In [10]:
# Do not rerun this cell!
# shutil.copy2('sams_data_phase04.sqlite','sams_data_phase05_template.sqlite')

'sams_data_phase05_template.sqlite'