In [2]:
# Manipulate the file system
import os
import shutil

# Copy dictionaries
import copy

# Regular expressions
import re

# Display errors in realtime
import ipywidgets as widgets

# work with dates
import datetime
import arrow
import time

# For scrubbing PII
import hashlib

# Convert stored string representation of a list to a list
import ast

# Recurse through a directory tree and return file names with glob
import glob

# Decode and re-encode mangled Arabic file names
import codecs

# Connect to a SQLite database in a lazy manner.
import sqlalchemy
import dataset

# Enables opening and reading of Excel files
import openpyxl

# CSV reading and writing (standard-library csv module)
import csv

# Translating variables, sheet names, and workbook names from Arabic
# This is NOT free to use.
from google.cloud import translate

# Set the environment variable for the Google Service Account.
# setdefault keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already exported in the
# environment, so the notebook also runs on machines where the key file lives elsewhere;
# the original local path is only used as the fallback.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    'C:\\Users\\clay\\Documents\\fxb-lcs-2b24f4f8a73a.json',
)

In [2]:
# If there's an existing db for this sheet, delete it
# so that we can copy from the template for a fresh start

try:
    os.remove("sams_data_phase19.sqlite")
    print("Removed template clone sams_data_phase19.sqlite")
except FileNotFoundError:
    # No working copy from a previous run; nothing to remove.
    pass
except OSError as exc:
    # Removal is best effort (the file may be locked by an open connection); report it
    # instead of hiding it, and let the copy below decide whether the run can continue.
    print("Could not remove sams_data_phase19.sqlite: {}".format(exc))

try:
    # Work on a copy so the template is preserved in case there is a problem and it has to be restored
    shutil.copy2("sams_data_phase19_template.sqlite","sams_data_phase19.sqlite")

    print("Created database from template: sams_data_phase19.sqlite")
except OSError as exc:
    # Without a fresh copy the following cells would run against a stale or empty database,
    # so report the failure and stop here rather than continuing silently.
    print("Could not create sams_data_phase19.sqlite from template: {}".format(exc))
    raise

Created database from template: sams_data_phase19.sqlite


In [3]:
# Open the working database file sams_data_phase19.sqlite through dataset.
db = dataset.connect("sqlite:///sams_data_phase19.sqlite")

In [4]:
# Get a reference to the arabic_values table.
# It is read below to decide which columns survive the removal of the flag_ columns.
tab_arabic_values = db['arabic_values']

In [5]:
# Drop the full_raw_flags and full_raw_flags_reduced tables.
# Dropping is best effort: a failure is reported but does not stop the notebook.
for flag_table_name in ('full_raw_flags', 'full_raw_flags_reduced'):
    try:
        db[flag_table_name].drop()
        print("Dropped " + flag_table_name)
    except Exception as exc:
        print("Could not drop " + flag_table_name + ": " + str(exc))

# The flag_ columns are removed by rebuilding the table: copy every non-flag column into a
# new table, drop the original, then rename the copy. (SQLite only gained
# ALTER TABLE ... DROP COLUMN in version 3.35.0.)
# The column names are taken from the keys of the first row of arabic_values.
preserve_fields = [k for k in tab_arabic_values.find_one().keys() if 'flag_' not in k]

# We don't use result but assigning it skips printing some garbage below
result = db.query("""
CREATE TABLE new_arabic_values AS 
    SELECT """ + ",".join(preserve_fields) + """ 
    FROM arabic_values;
""")

# Drop the original arabic_values table
tab_arabic_values.drop()

# Rename new_arabic_values to arabic_values & now we have a table with no flag columns
result = db.query("""
ALTER TABLE new_arabic_values RENAME TO arabic_values;
""")

Dropped full_raw_flags
Dropped full_raw_flags_reduced


In [6]:
# Build the join lookup used to add the Google-translated tokens to arabic_values:
# each arabic_values id maps to the set of token ids linked to it in arabic_values_tokens.

tab_arabic_values_tokens = db['arabic_values_tokens']

join_lookup = {}

for link in tab_arabic_values_tokens.find():
    # setdefault creates the empty set the first time an arabic_values id is seen
    join_lookup.setdefault(link['arabic_values_id'], set()).add(link['token_id'])

In [7]:
# Map each token id to its cleaned Google translation: the comma-separated
# google_translate_feb text split into lower-cased, stripped, non-empty pieces.

tab_arabic_tokens = db['arabic_tokens']
token_lookup = {}

for token_row in tab_arabic_tokens.find():
    raw_translation = token_row['google_translate_feb']

    # Tokens without a translation have nothing to join back and get no entry
    if raw_translation is not None:
        token_lookup[token_row['id']] = [
            piece.lower().strip()
            for piece in raw_translation.split(",")
            if piece.strip()
        ]

In [8]:
# Create the list of records we will update in arabic_values with the joined Google translated tokens.
# Each record is {'id': <arabic_values id>, 'google_tokens_joined': <", "-joined translations>}.
ar_values_to_update = []
for key in join_lookup:
    all_vals = []
    # Visit token ids in ascending order so the joined string is deterministic
    for token_id in sorted(join_lookup[key]):
        # Tokens whose google_translate_feb was NULL have no entry in token_lookup;
        # they contribute nothing. Any other error is no longer silently swallowed.
        all_vals += token_lookup.get(token_id, [])

    # Nothing translated for this value: leave it out of the update list
    if not all_vals:
        continue

    ar_values_to_update.append({'id': key, 'google_tokens_joined': ", ".join(all_vals)})

In [9]:
# Throw away the connection and every table handle, then reconnect, so the handles
# reflect the current schema of the tables.
del db, tab_arabic_values, tab_arabic_tokens, tab_arabic_values_tokens

db = dataset.connect("sqlite:///sams_data_phase19.sqlite")
tab_arabic_values = db['arabic_values']

In [10]:
# Load every row of arabic_values into memory as a list
data = list(tab_arabic_values.all())

In [11]:
# Verify the contents: display the first two rows loaded from arabic_values
data[:2]

[OrderedDict([('id', 1),
              ('arabic', 'قبول عابر'),
              ('google_translate', 'Transient admission'),
              ('human_translate', 'monitoring'),
              ('normalized', None),
              ('appears_in',
               "['acceptance_pattern', 'diagnosis', 'treatment']"),
              ('google_translate_feb', 'Transient admission')]),
 OrderedDict([('id', 2),
              ('arabic', 'حواضن'),
              ('google_translate', 'Cushions'),
              ('human_translate', 'incubators'),
              ('normalized', None),
              ('appears_in',
               "['acceptance_pattern', 'analysis_type', 'diagnosis', 'treatment']"),
              ('google_translate_feb', 'Cushions')])]

In [12]:
# Index the in-memory rows by their 'id' so individual rows can be looked up directly
data_dict = {row['id']: row for row in data}

In [13]:
# Now update the data_dict with the joined tokens
# Try to reproduce the arabic_values table with the new col so we can drop and bulk update
# because it is a lot faster than updating rows individually
# Skipping some PII that don't have records in arabic_values anymore but are still in tokens - will clean up next

for rec in ar_values_to_update:
    target_row = data_dict.get(rec['id'])
    if target_row is None:
        # The id is referenced by the token join table but has no row in arabic_values; skip it.
        continue
    target_row['google_tokens_joined'] = rec['google_tokens_joined']

In [14]:
# Number of rows held in data_dict.
# Check this matches the number of records in the arabic_values table
len(data_dict)

337278

In [15]:
# Drop the original arabic_values table so it can be rebuilt with the google_tokens_joined column.
# From here until the re-insert below, the rows of this working database's arabic_values
# exist only in memory (data / data_dict).
tab_arabic_values.drop()
# Discard the handle to the dropped table
del tab_arabic_values

In [16]:
# Get a new db connection to be safe
del db
db = dataset.connect("sqlite:///sams_data_phase19.sqlite")
# Fresh handle for the arabic_values table that was just dropped; the insert cells below repopulate it
tab_arabic_values = db['arabic_values']

In [17]:
# Look at the first row
# Delete the id column so that dataset will automatically create it with the proper type
# Only the row stored under key 1 loses its 'id'; this raises KeyError if data_dict has no key 1.
del data_dict[1]['id']
data_dict[1]

OrderedDict([('arabic', 'قبول عابر'),
             ('google_translate', 'Transient admission'),
             ('human_translate', 'monitoring'),
             ('normalized', None),
             ('appears_in',
              "['acceptance_pattern', 'diagnosis', 'treatment']"),
             ('google_translate_feb', 'Transient admission'),
             ('google_tokens_joined', 'transient admission')])

In [18]:
# Insert one row to generate the schema, then bulk insert the rest
# ensure=True is passed so that dataset creates the columns from this row's keys
# (see the dataset Table.insert documentation).
tab_arabic_values.insert(data_dict[1],ensure=True)

1

In [19]:
# Verify that the table structure is correct in the database then bulk insert the rest. Delete record 1 from the data_dict first.
# Row 1 was already inserted above, so removing it here keeps it out of the bulk insert built from data_dict.
del data_dict[1]

In [20]:
# Make sure you get a keyerror here
# data_dict[1]

In [21]:
# Collect the remaining rows in ascending id order for the bulk insert.
# Every row gets a 'google_tokens_joined' key (None when no joined tokens were produced for it).
rows_to_insert = []
for row_id in sorted(data_dict):
    row = data_dict[row_id]
    row.setdefault('google_tokens_joined', None)
    rows_to_insert.append(row)

rows_to_insert[:1]

[OrderedDict([('id', 2),
              ('arabic', 'حواضن'),
              ('google_translate', 'Cushions'),
              ('human_translate', 'incubators'),
              ('normalized', None),
              ('appears_in',
               "['acceptance_pattern', 'analysis_type', 'diagnosis', 'treatment']"),
              ('google_translate_feb', 'Cushions'),
              ('google_tokens_joined', 'cushions')])]

In [22]:
# Bulk insert the remaining rows; unlike the first row, these still carry their original 'id' values
tab_arabic_values.insert_many(rows_to_insert)

In [23]:
# Check the number of records in the rebuilt arabic_values table;
# it should equal the len(data_dict) value checked before the table was dropped
tab_arabic_values.count()

337278

In [24]:
# Remove some PII that sneaked through in an unlabeled column: ids 1622-1651 and 1653-1677
# (id 1652 is deliberately not touched).
# Store values here for scrubbing the raw tables
arabic_pii_values = []

for n in list(range(1622,1652)) + list(range(1653,1678)):
    rec = tab_arabic_values.find_one(id=n)
    if rec is None:
        # No arabic_values row with this id: nothing to collect or delete
        continue

    # Remember the Arabic text so the raw tables can be scrubbed, then delete the row.
    # Errors are deliberately not swallowed here: a silently failed delete would leave PII behind.
    arabic_pii_values.append(rec['arabic'])
    tab_arabic_values.delete(id=n)

In [25]:
# Look at the full_raw_scrubbed table for a match to the pii and then scrub if there
# Keep in mind that full_raw_english maintains parity
# Only rows whose col_none is non-NULL and not the '.' placeholder are fetched for comparison.
results = db.query("SELECT id, col_none FROM full_raw_scrubbed WHERE col_none IS NOT NULL AND col_none <> '.';")

In [26]:
# ids of the full_raw_scrubbed rows whose col_none exactly matches one of the removed PII values
full_records_to_scrub = [
    candidate['id']
    for candidate in results
    if candidate['col_none'] in arabic_pii_values
]

In [27]:
# Number of full_raw_scrubbed rows whose col_none matched a PII value
len(full_records_to_scrub)

124

In [28]:
# full_raw_scrubbed and full_raw_english are in parity with each other, so the same
# id / col_none update is applied to both.
tab_raw_ar = db['full_raw_scrubbed']
tab_raw_en = db['full_raw_english']

# Do not save this value in a source code repository!
salt = 'REDACTED'.encode()

for rec_id in full_records_to_scrub:
    pii = tab_raw_ar.find_one(id=rec_id)['col_none']

    # SHA-256 over the PII bytes followed by the salt bytes
    digest = hashlib.sha256(pii.encode() + salt).hexdigest()

    # Replace col_none with the hex digest, matching rows on 'id', in both tables
    update_rec = {'id':rec_id, 'col_none': digest}
    for raw_table in (tab_raw_ar, tab_raw_en):
        raw_table.update(update_rec, ['id'])

In [29]:
# Find the distinct token ids linked to the PII arabic_values ids (1622-1651 and 1653-1677).
arabic_value_pii_ids = list(range(1622,1652)) + list(range(1653,1678))
# Render the ids as a SQL list literal such as "(1622,1623,...)"
arabic_value_pii_ids_str = "(" + ",".join(map(str, arabic_value_pii_ids)) + ")"
pii_token_sql = (
    "SELECT token_id FROM arabic_values_tokens WHERE arabic_values_id IN "
    + arabic_value_pii_ids_str
    + " GROUP BY token_id ORDER BY token_id ASC;"
)
results = db.query(pii_token_sql)

In [30]:
# Collect the token ids returned by the previous query and render them as a SQL list literal
token_ids = [token_row['token_id'] for token_row in results]
token_ids_str = "(" + ",".join(map(str, token_ids)) + ")"

# Fetch the full token rows so they can be inspected to figure out which tokens should be deleted
candidate_tokens_sql = "SELECT * FROM arabic_tokens WHERE id IN " + token_ids_str + " ORDER BY id ASC"
results = db.query(candidate_tokens_sql)

In [31]:
# These will print out PII so they were commented out after examination
# for r in results:
#     print(r['id'], r['translation'], r['google_translate_feb'])

# After investigating results, we want to delete the tokens with id numbers:
# (hand-picked list; it is used below to delete rows from arabic_tokens and arabic_values_tokens)
token_ids_to_remove = [1379, 1383, 1385, 1386, 1387, 1389, 1390, 1391, 1392,
                       1393, 1394, 1395, 1397, 1398, 1399, 1400, 1401, 1402,
                       1403, 1404, 1405, 1406, 1407, 1408, 1409, 1410, 1411,
                       1412, 1413, 1416, 1417, 1418]

In [32]:
# Remove every trace of the PII tokens and of the deleted arabic_values records.
token_ids_to_remove_str = "(" + ",".join(map(str, token_ids_to_remove)) + ")"

delete_statements = (
    # 1. the tokens that correspond to the now-missing arabic_values records
    "DELETE FROM arabic_tokens WHERE id IN " + token_ids_to_remove_str + ";",
    # 2. the join-table entries that reference those tokens
    "DELETE FROM arabic_values_tokens WHERE token_id IN " + token_ids_to_remove_str + ";",
    # 3. the join-table entries that reference the removed arabic_values ids
    "DELETE FROM arabic_values_tokens WHERE arabic_values_id IN " + arabic_value_pii_ids_str + ";",
)

# Run the statements in the order listed; the results are kept only to avoid echoing them
rec, rec2, rec3 = [db.query(statement) for statement in delete_statements]

When the `arabic_values` table was created, values that did not require translation (numbers, English text, etc.) were excluded. The cells below pull them back into the `arabic_values` table so that they can be used for flag generation.

In [33]:
# Pick the full_raw_scrubbed columns to scan, using the keys of one sample row.
# 'id' is skipped, as is any column whose name contains 'info_', 'flag_' or "a_".
# NOTE(review): "a_" is matched anywhere in the name, not only as a prefix - confirm this is intended.
rec = tab_raw_ar.find_one()
excluded_markers = ('info_', 'flag_', "a_")
variables = [
    k for k in rec.keys()
    if k != 'id' and not any(marker in k for marker in excluded_markers)
]

In [34]:
# Set of every 'arabic' value currently stored in arabic_values, for fast membership tests
values_ref = {value_row['arabic'] for value_row in tab_arabic_values.find()}

In [35]:
# Collect every raw cell value that is not yet present in arabic_values.
# NULLs and the '.' placeholder are ignored.
missing_values = set()
for raw_row in tab_raw_ar.find():
    for column in variables:
        cell = raw_row[column]
        if cell is not None and cell != '.' and cell not in values_ref:
            missing_values.add(cell)

len(missing_values)

30502

In [36]:
# Add the values to arabic_values in the `orig_value` column, one new row per missing value,
# in sorted order.
new_recs = [{'orig_value': mv} for mv in sorted(missing_values)]

# Guard against an empty result: new_recs[0] would raise IndexError when nothing is missing
if new_recs:
    # Insert one first to create the new column with dataset
    tab_arabic_values.insert(new_recs[0])

    # Now bulk insert the rest
    tab_arabic_values.insert_many(new_recs[1:])

The database is now ready for flag generation. Keep in mind that the `google_translate_feb` and `google_tokens_joined` fields in the `arabic_values` table can now also be searched for flag terms. You can also search `orig_value`.

In [3]:
# Copy the database to the next template
# Convert this to a translation import notebook and move flag work to subsequent notebook

# Copy the database over as the template for the next file.
# This Notebook did not include manual editing of the data.

# Do not rerun this cell or you will overwrite the next template!
# shutil.copy2('sams_data_phase19.sqlite','sams_data_phase20_template.sqlite')

'sams_data_phase20_template.sqlite'