In [1]:
# Manipulate the file system
import os
import shutil

# Copy dictionaries
import copy

# Regular expressions
import re

# Display errors in realtime
import ipywidgets as widgets

# work with dates
import datetime
import arrow
import time

# For scrubbing PII
import hashlib

# Convert stored string representation of a list to a list
import ast

# Recurse through a directory tree and return file names with glob
import glob

# Decode and re-encode mangled Arabic file names
import codecs

# Connect to a SQLite database in a lazy manner.
import sqlalchemy
import dataset

# Enables opening and reading of Excel files
import openpyxl

# Import Pandas for easy importing of an excel file
import csv

# Translating variables, sheet names, and workbook names from Arabic
# This is NOT free to use.
from google.cloud import translate

# Point the Google Cloud client at the service account key file.
# setdefault() keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already present
# in the environment, so the notebook also runs on machines where the key lives
# somewhere other than this developer-specific fallback path.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    'C:\\Users\\clay\\Documents\\fxb-lcs-2b24f4f8a73a.json',
)

In [2]:
# If there's an existing db for this sheet, delete it
# so that we can copy from the template for a fresh start

try:
    os.remove("sams_data_phase14.sqlite")
    print("Removed template clone sams_data_phase14.sqlite")
except FileNotFoundError:
    # Nothing to clean up, e.g. on the very first run
    pass
except OSError as err:
    # e.g. the file is locked by another process; report it instead of hiding it
    print("Could not remove sams_data_phase14.sqlite: {}".format(err))

try:
    # Create the working database as a fresh clone of the template
    shutil.copy2("sams_data_phase14_template.sqlite", "sams_data_phase14.sqlite")

    print("Created database from template: sams_data_phase14.sqlite")
except OSError as err:
    # Without the clone the later cells would not be working on the template data,
    # so make the failure visible
    print("Could not create sams_data_phase14.sqlite from template: {}".format(err))

Removed template clone sams_data_phase14.sqlite
Created database from template: sams_data_phase14.sqlite


In [3]:
# Translation file from Houssom

translation_file = "split_arabic_phrases_01-DEC_2017.csv"
data = []
# newline='' lets the csv module do its own newline handling, so line breaks
# embedded in quoted fields are read back unchanged
with open(translation_file, 'r', encoding='utf-8', newline='') as csvfile:
    csvreader = csv.reader(csvfile)
    header = next(csvreader)
    # The first column is always stored under the key 'id', whatever its label in the file
    header[0] = 'id'
    # One dict per data row, keyed by the header; zip() stops at the shorter of the two
    data.extend(dict(zip(header, row)) for row in csvreader)

In [4]:
# Convert some values to integers and use None for empty strings to save database space

clean_data = []
skipped_rows = 0
for r in data:
    # Work on a copy so that `data` is left untouched and this cell can be re-run safely
    cleaned = dict(r)
    try:
        cleaned['id'] = int(cleaned['id'])
        cleaned['occurrences'] = int(cleaned['occurrences'])
    except (KeyError, TypeError, ValueError):
        # Rows without a usable integer id / occurrences count are left out
        skipped_rows += 1
        continue

    # Only string values can be blank; the integers set above are left alone.
    # Assigning to existing keys does not resize the dict, so this is safe while iterating.
    for key, value in cleaned.items():
        if isinstance(value, str) and value.strip() == '':
            cleaned[key] = None

    # This column is not stored; it may be absent from the file
    cleaned.pop('Number of Characters', None)
    clean_data.append(cleaned)

if skipped_rows:
    print("Skipped {} row(s) without an integer id/occurrences".format(skipped_rows))

In [5]:
# Connect to the working SQLite database
db_url = "sqlite:///sams_data_phase14.sqlite"
db = dataset.connect(db_url)

In [6]:
# Drop the existing arabic_tokens table. It is replaced with the translation data,
# which includes all of the original arabic_tokens data along with additional
# metadata from Excel
tab_arabic_tokens = db.get_table('arabic_tokens')
tab_arabic_tokens.drop()

True

In [7]:
# Fetch a fresh handle on arabic_tokens and load the cleaned translation rows into it
arabic_tokens_name = 'arabic_tokens'
tab_arabic_tokens = db[arabic_tokens_name]
tab_arabic_tokens.insert_many(clean_data)

After a review of the data, the best approach here is probably to take the terms that were split out of the original Arabic Values and join them back together, comma-separated, into the "Human Translation" column (`human_translate`) of the Arabic Values table. Because those values map back to the raw data, this gives an approximation of how well we can provide a human-translated glimpse of the raw source.

In [8]:
# Table whose rows pair an arabic_values_id with a token_id
tab_arabic_values_tokens = db.get_table('arabic_values_tokens')

In [9]:
# Group the token ids by the Arabic value they were split from:
# {arabic_values_id: {token_id, ...}}
join_lookup = {}

for link in tab_arabic_values_tokens.find():
    token_ids = join_lookup.setdefault(link['arabic_values_id'], set())
    token_ids.add(link['token_id'])

In [10]:
# There are errant commas in the import file and this attempts to remove them,
# though it is possible that it is also removing commas that belong.
# Result: {token id: [lower-cased, stripped translation pieces]}
token_lookup = {}

for token_row in tab_arabic_tokens.find():
    raw_translation = token_row['translation']

    # Tokens without a translation have nothing to join back
    if raw_translation is not None:
        pieces = []
        for piece in raw_translation.split(","):
            # Drop the empty pieces left behind by stray commas
            if piece.strip() != '':
                pieces.append(piece.lower().strip())

        token_lookup[token_row['id']] = pieces

In [11]:
# Build one update record per Arabic value: the translations of its tokens,
# taken in ascending token-id order and joined back into one comma-separated string
ar_values_to_update = []
for key, token_ids in join_lookup.items():
    all_vals = []
    for token_id in sorted(token_ids):
        # Tokens without a translation are absent from token_lookup and contribute nothing
        all_vals.extend(token_lookup.get(token_id, []))

    if not all_vals:
        # None of this value's tokens has a translation, so there is nothing to write
        continue

    ar_values_to_update.append({'id': key, 'human_translate': ", ".join(all_vals)})

In [12]:
# Table that receives the joined human translations
tab_arabic_values = db.get_table('arabic_values')

In [16]:
# Peek at our update records:
# Spot-check of a single record. The position 1030 is arbitrary, and the lookup
# raises IndexError if fewer than 1031 update records were built.
ar_values_to_update[1030]

{'human_translate': 'urinalysis test, cbc, esr, laboratory tests, vitamin b complex, mebeverine, alt',
 'id': 167781}

In [17]:
# Write each joined translation back to arabic_values, matching rows on their id
match_columns = ['id']
for translated_row in ar_values_to_update:
    tab_arabic_values.update(translated_row, match_columns)

During this process, I noticed that some `col_none` values come from duplicate files and that some contain data that was not de-identified. I'm awaiting word on what steps I should take to handle these errors.

The files with ids 769 and 747 are duplicates of each other.

In [2]:
# This represents an attempt to detect duplicate files, but due to slight differences
# in the files, it doesn't see them as duplicates. Hence, the code was not used, but is here
# for future reference.

# https://docs.python.org/3/library/filecmp.html
# import filecmp
# f1 = r"data\turkey\2015\Feb\Bab Al Hawa\Patient log.xlsx"
# f2 = r"data\turkey\2015\Jan\Bab Al Hawa DU\Patient log.xlsx"

# filecmp.cmp(f1,f2,shallow=False)

# import os
# os.stat(f1)
# os.stat(f2)

Remove the data from the file with id 747 because it is a duplicate of 769. The data for 769 was recorded for January and the data for 747 was recorded for February of the same year, so it makes sense to get rid of the more recent duplicate.

In [18]:
# Flag the duplicate file as ignored in the files table and record why
tab_files = db['files']
update_rec = {
    'id': 747,
    'ignore': 1,
    'info': "duplicate of file id 769",
}
tab_files.update(update_rec, ['id'])

1

In [20]:
# The duplicate file leaves other 'dangling' pieces of data behind, but they should
# only affect the raw data tables. The rest is metadata.
delete_scrubbed_sql = """
DELETE FROM full_raw_scrubbed WHERE a_file_id = 747;
"""
db.query(delete_scrubbed_sql)

<dataset.persistence.util.ResultIter at 0x150a0281f98>

In [21]:
# Remove the duplicate file's rows from the English raw data table
delete_english_sql = """
DELETE FROM full_raw_english WHERE a_file_id = 747;
"""
db.query(delete_english_sql)

<dataset.persistence.util.ResultIter at 0x150a0270390>

In [22]:
# Next: obscure the `col_none` values in the full data for those with PII - they weren't caught the first time.
# NOTE: this cell is not idempotent. It hashes whatever is currently stored, so
# running it a second time hashes the hashes.

# Don't save in a source code repository
salt = 'REDACTED'.encode()

# Arabic
tab_raw_ar = db['full_raw_scrubbed']
update_records_ar = []

for rec in tab_raw_ar.find(a_file_id=769):
    col_none = rec['col_none']
    if col_none is None:
        # A NULL cell holds nothing to obscure, and None has no .encode()
        continue

    # Salted SHA-256 over the value's bytes followed by the salt
    digest = hashlib.sha256(col_none.encode() + salt).hexdigest()
    update_records_ar.append({'id': rec['id'], 'col_none': digest})

# Written in a second pass, after every digest has been computed
for update_rec in update_records_ar:
    tab_raw_ar.update(update_rec, ['id'])

In [23]:
# The English table must carry the same hash values as the Arabic table for
# consistency, so the records built for the Arabic table are replayed here by id
tab_raw_en = db['full_raw_english']
match_on = ['id']

for hashed_row in update_records_ar:
    tab_raw_en.update(hashed_row, match_on)

In [24]:
# Copy the database over as the template for the next file.
# This Notebook did not include manual editing of the data.

# Do not rerun this cell!
finished_db = 'sams_data_phase14.sqlite'
next_template = 'sams_data_phase15_template.sqlite'
shutil.copy2(finished_db, next_template)

'sams_data_phase15_template.sqlite'