With the scrubbed data, this notebook consolidates the Arabic values and dispatches them to the Google Translate API to get an approximate English translation. 

In [1]:
# Manipulate the file system
import os
import shutil

# Copy dictionaries
import copy

# Display errors in realtime
import ipywidgets as widgets

# work with dates
import datetime
import arrow
import time

# For scrubbing PII
import hashlib

# Convert stored string representation of a list to a list
import ast

# Recurse through a directory tree and return file names with glob
import glob

# Decode and re-encode mangled Arabic file names
import codecs

# Connect to a SQLite database in a lazy manner.
import sqlalchemy
import dataset

# Enables opening and reading of Excel files
import openpyxl

# Translating variables, sheet names, and workbook names from Arabic
# This is NOT free to use.
from google.cloud import translate

# Set the environment variable for the Google Service Account.
# setdefault() keeps a credential path that is already configured in the
# environment and only falls back to the original workstation-specific
# key file when nothing has been set, so the notebook is not tied to a
# single machine.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    'C:\\Users\\clay\\Documents\\fxb-lcs-2b24f4f8a73a.json'
)

In [None]:
# If there's an existing db for this sheet, delete it
# so that we can copy from the template for a fresh start

try:
    os.remove("sams_data_phase06.sqlite")
    print("Removed template clone sams_data_phase06.sqlite")
except FileNotFoundError:
    # Nothing to remove on a first run.
    pass
except OSError as err:
    # Any other failure (for example the file being locked) is reported
    # instead of silently ignored, but the cell still carries on.
    print("Could not remove sams_data_phase06.sqlite:", err)

try:
    # Work on a copy so the template is preserved in case there is a
    # problem and the working database has to be restored.
    shutil.copy2("sams_data_phase06_template.sqlite","sams_data_phase06.sqlite")

    print("Created database from template: sams_data_phase06.sqlite")
except OSError as err:
    # Report the failure: if the copy is silently skipped, later cells would
    # open whatever file happens to be there (or a brand-new empty database).
    print("Could not create sams_data_phase06.sqlite from template:", err)

In [2]:
# Connect to the working SQLite database for this phase.
# NOTE(review): presumably the template copy above must have succeeded first,
# otherwise this would point at a missing/empty database — confirm.
db = dataset.connect("sqlite:///sams_data_phase06.sqlite")

In [3]:
# Table handles used by the rest of the notebook: the scrubbed raw data,
# the distinct values to translate, and the variable-name mapping.
tab_raw, tab_arabic, tab_vars = (
    db[table_name]
    for table_name in ('full_raw_scrubbed', 'arabic_values', 'variables')
)

In [15]:
# Sorted list of the distinct normalized variable names.
column_names = sorted(
    row['normalized']
    for row in db.query("SELECT DISTINCT(normalized) FROM variables;")
)

In [16]:
# The fields listed here are excluded from translation: per the original
# note they hold hashed values, so their contents should not be queried.
fields = [
    "info_name", "info_name_author", "info_name_caregiver",
    "info_name_facility", "info_name_group", "info_name_of_coach",
    "info_name_processor", "info_name_surgeon",
    "info_phone_skype",
    "date", "date_first_exam", "death_date",
]
# Keep the existing ordering, dropping only the excluded names.
column_names = [name for name in column_names if name not in fields]

In [None]:
# Iterate through the column names, get the distinct values, and store
# every value that is not a plain number so it can be translated later.
# The brackets are needed in the SQL query because some of the column
# names are the same as SQL reserved words. The brackets tell the database
# to look for a column with that name instead of interpreting it as
# the reserved word. 'case' is an example here.


def _looks_numeric(text):
    """Return True when `text` is a plain unsigned number.

    A comma is treated like a decimal point and at most one separator is
    tolerated, so '12', '1.5' and '1,5' count as numbers while '1.2.3'
    and '-5' do not.
    """
    return text.replace(",",".").replace('.','',1).isdigit()


# Start from an empty table so values from a previous run don't linger.
tab_arabic.delete()

# Values already written during this run. The same value frequently shows
# up in several columns; upserting it again would rewrite an identical row.
seen_values = set()

for col in column_names:
    col_values = db.query("""
        SELECT DISTINCT([""" + col + """]) 
        FROM full_raw_scrubbed 
        WHERE [""" + col + """] IS NOT NULL
        AND [""" + col + """] <> '.'
        AND [""" + col + """] <> '';
        """)
    col_values = [r[col] for r in col_values]

    # Create a table of unique Arabic values
    for v in col_values:
        # SQLite can hand back int/float/bytes for non-text cells. Those
        # never need translating and have no usable str.replace(), so skip
        # them instead of crashing with AttributeError.
        if not isinstance(v, str):
            continue
        # Skip numbers and values that were already stored.
        if _looks_numeric(v) or v in seen_values:
            continue
        seen_values.add(v)
        r = {"arabic":v,"google_translate":None,"human_translate":None,"normalized":None}
        tab_arabic.upsert(r,['arabic'])

In [4]:
# Arabic Character Ranges Regex
# (kept as a reference link; no regex is applied in this cell)
# https://stackoverflow.com/questions/4446244/how-to-check-if-any-arabic-character-exists-in-the-string-javascript

# Client for the Google Translate API.
# NOTE(review): presumably authenticates through the
# GOOGLE_APPLICATION_CREDENTIALS variable set in the imports cell — confirm.
translate_client = translate.Client()
# Language code passed as the translation target (English).
target_lang = 'en'

In [5]:
# HTML widgets used as live progress read-outs. The translation loop
# overwrites each widget's `value` with the running total as it works.
record_counter = widgets.HTML(value="Records: 0",continuous_update=True)
character_counter = widgets.HTML(value="Characters: 0",continuous_update=True)
error_counter = widgets.HTML(value="Errors: 0",continuous_update=True)

In [10]:
def _clean_for_translation(text):
    """Return `text` with underscores turned into spaces and every run of
    whitespace (spaces, tabs, newlines) collapsed to one space, trimmed."""
    return " ".join(text.replace("_"," ").split())


character_count = 0
record_count = 0
error_count = 0

# Only rows that do not have a Google translation yet are requested, so a
# re-run resumes where the previous run stopped.
for row in tab_arabic.find(google_translate=None):
    # Cheap safety net: never pay to translate a row that already has a result.
    if row['google_translate'] is not None:
        continue

    # Strings are immutable, so the cleaned text has to be captured here;
    # otherwise the raw value is what gets sent to the API.
    arabic_string = _clean_for_translation(row['arabic'] or "")
    if not arabic_string:
        # Nothing left to translate once separators are removed.
        continue

    try:
        # NOTE(review): the v2 API is documented to default to HTML output, so
        # results may contain HTML entities — confirm whether format_='text' is wanted.
        translation = translate_client.translate(arabic_string, target_language=target_lang)
        rec = {
            "id":row['id'],
            "google_translate":translation['translatedText']
        }

        tab_arabic.update(rec,['id'])
    except Exception:
        # Catch Exception rather than everything so that interrupting the
        # kernel (KeyboardInterrupt) still stops this long, billable loop.
        # The row keeps google_translate = NULL and is retried on the next run.
        error_count += 1
        error_counter.value = "Errors: " + str(error_count)
        # Back off before the next request in case the failure was rate limiting.
        time.sleep(10)
    else:
        # Count the characters that were actually sent to the API.
        character_count += len(arabic_string)
        character_counter.value = "Characters: " + str(character_count)
        record_count += 1
        record_counter.value = "Records: " + str(record_count)

print("--------------\nDone")
print("Record Count",record_count)
print("Character Count",character_count)

--------------
Done
Record Count 0
Character Count 0


In [6]:
# Display the widget showing the running count of translated records.
record_counter

In [7]:
# Display the widget showing the running count of translated characters.
character_counter

In [8]:
# Display the widget showing how many translation requests raised an error.
error_counter

----

There appears to be some PII in the data, so we're going to upsert all of the rows in the `arabic_values` table to track which columns the values appear in. That will allow us to obfuscate those columns before generating the English approximation table.

In [18]:
# Create the references: map each stored Arabic string to its arabic_values
# row id plus an (initially empty) set of the columns it appears in.

arabic_reference = {
    entry['arabic']: {'id': entry['id'], 'cols': set()}
    for entry in tab_arabic.find()
}

In [19]:
# Number of distinct values being tracked.
len(arabic_reference)

353237

In [20]:
# This takes a while to run

for col in column_names:
    col_values = db.query("""
        SELECT DISTINCT([""" + col + """]) 
        FROM full_raw_scrubbed 
        WHERE [""" + col + """] IS NOT NULL
        AND [""" + col + """] <> '.'
        AND [""" + col + """] <> '';
        """)
    col_values = [r[col] for r in col_values]

    # For each value in a given column, make sure our reference dict
    # knows that the arabic_values row is referenced by that column
    for arabic_value in col_values:
        # A missing entry means we removed the value from arabic_values
        # because it was not Arabic and did not require translation to
        # English. Looking it up with .get() skips exactly that case without
        # a catch-all `except`, so real errors and kernel interrupts on this
        # long-running cell are no longer swallowed.
        reference = arabic_reference.get(arabic_value)
        if reference is not None:
            reference['cols'].add(col)

In [28]:
# Spot-check one entry (the fifth key) to see which columns were recorded.
arabic_reference[list(arabic_reference)[4]]

{'cols': {'analysis',
  'col_1',
  'col_2',
  'col_3',
  'notes',
  'referral',
  'referral_notes'},
 'id': 5}

In [31]:
# Persist the collected column lists: each arabic_values row gets an
# `appears_in` field holding the sorted column names as a string.

for reference in arabic_reference.values():
    appears_in = str(sorted(reference['cols']))
    tab_arabic.upsert({'id': reference['id'], 'appears_in': appears_in}, ['id'])

The first problem is that the variable named `N` was mapped to the normalized variable `number` but should have been mapped to `info_name`, so the records in those rows need to be fixed.

In [32]:
# Select every scrubbed row that belongs to a sheet containing variable id 343
# (presumably the mis-mapped `N` variable — TODO confirm against `variables`).
broken_records = db.query("""
    SELECT * FROM full_raw_scrubbed WHERE a_files_sheets_id IN (
        SELECT DISTINCT(files_sheets_id) FROM files_sheets_vars WHERE var_id in (343)
    );
""")

In [33]:
# Materialize the query result so it can be counted and iterated repeatedly.
broken_records = list(broken_records)

In [34]:
# How many rows need to be repaired.
len(broken_records)

27385

In [35]:
# Extract the value from `number`, hash it, store it in `info_name`, then turn `number` into None/NULL
# Do not save this value in a source code repository!
salt = 'REDACTED'.encode()

for rec in broken_records:
    raw_name = rec['number']
    if raw_name is None:
        # Nothing to move for this row. This also makes the cell safe to
        # re-run: after a first pass every `number` is already None, and
        # calling .encode() on None would raise AttributeError.
        continue

    # Salted SHA-256 of the value; str() guards against non-text cells.
    h = hashlib.sha256()
    h.update(str(raw_name).encode())
    h.update(salt)
    rec['info_name'] = h.hexdigest()
    rec['number'] = None

In [38]:
# Write each repaired record back to the scrubbed table, matching on `id`.
for fixed_record in broken_records:
    tab_raw.update(fixed_record, ['id'])

Another scan of the `arabic_values` table shows that `col_null` mostly contains PII - probably patient names. This is a more complicated case because it's not apparent from the bulk context what each of these variables was intended to represent. 

Finding the problems:

```
SELECT full_raw_scrubbed.id,full_raw_scrubbed.a_files_sheets_id, full_raw_scrubbed.col_null, arabic_values.google_translate,full_raw_scrubbed.info_name
FROM full_raw_scrubbed
LEFT OUTER JOIN arabic_values ON full_raw_scrubbed.col_null = arabic_values.arabic
WHERE full_raw_scrubbed.col_null IS NOT NULL
ORDER BY full_raw_scrubbed.id ASC;
```

1. Find records where `col_null` IS NOT NULL and `info_name` IS NULL
2. Delete matching `arabic_values` rows for values in `col_null`
3. Hash `col_null` and move it to `info_name` for those records

Note that these are `files_sheets_id` numbers 6986 and 7414 where `col_null` is `info_name`.


This deletes the `arabic_values` entries:

```
DELETE FROM arabic_values WHERE arabic IN (
    SELECT DISTINCT(col_null) FROM full_raw_scrubbed
    WHERE col_null IS NOT NULL 
    AND info_name IS NULL
);
```

In [41]:
# Rows where `col_null` still holds a value while `info_name` is empty,
# materialized into a list so they can be counted and edited.
broken_records = list(db.query("""
    SELECT * FROM full_raw_scrubbed
    WHERE col_null IS NOT NULL 
    AND info_name IS NULL;
"""))
len(broken_records)

753

In [42]:
# First pass: replace the clear-text `col_null` value with its salted
# SHA-256 digest stored in `info_name`, and blank out `col_null`.
for record in broken_records:
    salted_value = record['col_null'].encode() + salt
    record['info_name'] = hashlib.sha256(salted_value).hexdigest()
    record['col_null'] = None

# Second pass: write the modified records back, matching on `id`.
for record in broken_records:
    tab_raw.update(record, ['id'])

----

It appears that most PII has been cleaned at this point, so create the approximate English table.

In [43]:
# Handle for the table that will receive the English approximation of the scrubbed data.
tab_full_english = db['full_raw_english']

In [44]:
# Create a reference lookup: Arabic string -> its Google translation.
# (This rebinds `arabic_reference`, replacing the per-column bookkeeping dict.)
arabic_reference = {
    entry['arabic']: entry['google_translate']
    for entry in tab_arabic.find()
}

In [47]:
# Copy every scrubbed row into the English table, replacing each value that
# has a Google translation with that translation. Rows are written in
# batches to avoid one INSERT per row.
buffer = []
buffer_max_size = 10000

for row in tab_raw.find():
    new_row = {}
    for key, row_value in row.items():
        # .get() returns None both for values that were never queued for
        # translation (ids, numbers, hashes, NULLs) and for values whose
        # translation is still missing. In either case keep the original
        # value instead of overwriting it with None.
        translated = arabic_reference.get(row_value)
        new_row[key] = row_value if translated is None else translated
    buffer.append(new_row)

    # Flush as soon as the batch reaches the configured size.
    if len(buffer) >= buffer_max_size:
        tab_full_english.insert_many(buffer)
        buffer.clear()

# Clean up dangling rows
if buffer:
    tab_full_english.insert_many(buffer)
    buffer.clear()

Note that this is (literally) an expensive notebook to run. The Google Translate costs are probably on the order of $300 or so. Probably best not to run it again.

In [48]:
# Copy the database over as the template for the next file.
# This Notebook did not include manual editing of the data.

# Do not rerun this cell!
# shutil.copy2('sams_data_phase06.sqlite','sams_data_phase07_template.sqlite')

'sams_data_phase07_template.sqlite'