In [1]:
# Manipulate the file system
import os
import shutil

# Copy dictionaries
import copy

# Regular expressions
import re

# Display errors in realtime
import ipywidgets as widgets

# work with dates
import datetime
import arrow
import time

# For scrubbing PII
import hashlib

# Convert stored string representation of a list to a list
import ast

# Recurse through a directory tree and return file names with glob
import glob

# Decode and re-encode mangled Arabic file names
import codecs

# Connect to a SQLite database in a lazy manner.
import sqlalchemy
import dataset

# Enables opening and reading of Excel files
import openpyxl

# Read the CSV translation file
import csv

# Translating variables, sheet names, and workbook names from Arabic
# This is NOT free to use.
from google.cloud import translate

# Point the Google client library at the service-account key file.
# setdefault() only fills the variable in when it is not already set, so a
# credentials path configured outside the notebook takes precedence and the
# machine-specific location below is just the fallback.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    'C:\\Users\\clay\\Documents\\fxb-lcs-2b24f4f8a73a.json',
)

In [2]:
# If a database left over from an earlier run exists, delete it so that this
# run starts from a fresh copy of the template.
try:
    os.remove("sams_data_phase16.sqlite")
    print("Removed template clone sams_data_phase16.sqlite")
except FileNotFoundError:
    # Nothing to remove: no earlier clone exists.
    pass

# Clone the template into the working database. A failure here is allowed to
# propagate: the later cells read and rewrite tables in this file, so they
# must not run when the copy did not happen.
shutil.copy2("sams_data_phase16_template.sqlite", "sams_data_phase16.sqlite")
print("Created database from template: sams_data_phase16.sqlite")

Removed template clone sams_data_phase16.sqlite
Created database from template: sams_data_phase16.sqlite


In [3]:
# Open the working SQLite file through dataset; `db` is the handle that the
# later cells index by table name (db['arabic_tokens'], ...).
db = dataset.connect("sqlite:///sams_data_phase16.sqlite")

In [4]:
# Import the translation file from Houssam.
translation_file = "split_arabic_phrases_01_Dec_2017_Categorized V4_01-18-2018.csv"

# Every CSV row becomes one dict keyed by the header names.
data = []
# newline='' is how the csv module expects its input file to be opened; without
# it, newlines embedded inside quoted fields are not read back correctly.
with open(translation_file, 'r', encoding='utf-8', newline='') as csvfile:
    csvreader = csv.reader(csvfile)
    header = next(csvreader)
    # The first column is always stored under the key 'id', whatever the
    # file's own header cell contains.
    header[0] = 'id'
    for row in csvreader:
        # zip() stops at the shorter of header/row, so a short row simply
        # lacks the trailing keys.
        data.append(dict(zip(header, row)))
        
# Convert some values to integers and use None for empty strings to save database space.
# Rows whose 'id' or 'occurrences' is missing or not an integer are left out of
# clean_data; they are collected in skipped_rows so the loss is visible.

clean_data = []
skipped_rows = []
for r in data:
    try:
        r['id'] = int(r['id'])
        r['occurrences'] = int(r['occurrences'])
    except (KeyError, TypeError, ValueError):
        skipped_rows.append(r)
        continue

    # Blank out whitespace-only strings. Values under a blank column name are
    # left alone (that column is removed below), and the two integers set
    # above are not strings, so they are skipped by the isinstance check.
    for key, value in list(r.items()):
        if key.strip() == '':
            continue
        if isinstance(value, str) and value.strip() == '':
            r[key] = None

    # This column is not stored in the database.
    r.pop('Number of Characters', None)

    # Sometimes the CSV imports with a blank column on the left
    # that needs to be deleted from the dict created from it
    r.pop('', None)

    clean_data.append(r)

if skipped_rows:
    print("Skipped {} row(s) without an integer id/occurrences".format(len(skipped_rows)))
    
# Replace the arabic_tokens table with the translation data, which includes
# all of the original arabic_tokens data along with additional metadata from Excel

# Refuse to drop the existing table when there is nothing to put in its place;
# otherwise an import that produced no usable rows would silently wipe it.
if not clean_data:
    raise ValueError("clean_data is empty; refusing to drop the arabic_tokens table")

tab_arabic_tokens = db['arabic_tokens']
tab_arabic_tokens.drop()

# Look the table up again after the drop, then load the cleaned rows into it.
tab_arabic_tokens = db['arabic_tokens']
tab_arabic_tokens.insert_many(clean_data)

# Map each arabic_values id to the set of token ids linked to it in the
# arabic_values_tokens join table.
tab_arabic_values_tokens = db['arabic_values_tokens']

join_lookup = {}

for link in tab_arabic_values_tokens.find():
    join_lookup.setdefault(link['arabic_values_id'], set()).add(link['token_id'])
    
# The import file contains errant commas. Each translation is split on commas
# and the empty pieces are discarded to get rid of them, though this may also
# remove commas that belong. The remaining pieces are lower-cased and trimmed.
token_lookup = {}

for token_row in tab_arabic_tokens.find():
    raw_translation = token_row['translation']

    # Tokens without a translation have nothing to join back.
    if raw_translation is not None:
        pieces = []
        for piece in raw_translation.split(","):
            if piece.strip() != '':
                pieces.append(piece.lower().strip())
        token_lookup[token_row['id']] = pieces
    
# Build one update record per arabic_values id: the translations of all of its
# tokens, taken in ascending token-id order and joined into a single string.
ar_values_to_update = []
for key in join_lookup.keys():
    all_vals = []
    for token_rec in sorted(join_lookup[key]):
        # Tokens with no translation are absent from token_lookup and
        # contribute nothing.
        all_vals += token_lookup.get(token_rec, [])

    # Leave the row untouched when none of its tokens has a translation.
    if len(all_vals) == 0:
        continue

    ar_values_to_update.append({'id': key, 'human_translate': ", ".join(all_vals)})
    
# Write the joined translations back to arabic_values, matching rows on 'id'.
tab_arabic_values = db['arabic_values']

for rec in ar_values_to_update:
    try:
        tab_arabic_values.update(rec, ['id'])
    except Exception as err:
        # Keep going so one bad record does not stop the rest, but show which
        # record failed and why. Catching Exception (not a bare except) lets
        # an interrupt from the user still stop the loop.
        print(rec)
        print("  update failed: {!r}".format(err))

In [5]:
# Preview the first five cleaned rows as a sanity check.
clean_data[:5]

[{'Notes': None,
  'Spelling Correction': None,
  'Translator': None,
  'id': 1,
  'occurrences': 4,
  'token': 'قبول عابر',
  'translation': 'Monitoring'},
 {'Notes': None,
  'Spelling Correction': None,
  'Translator': None,
  'id': 2,
  'occurrences': 3,
  'token': 'حواضن',
  'translation': 'incubators'},
 {'Notes': None,
  'Spelling Correction': None,
  'Translator': None,
  'id': 3,
  'occurrences': 35,
  'token': 'استشفاء',
  'translation': 'recovery'},
 {'Notes': None,
  'Spelling Correction': None,
  'Translator': None,
  'id': 4,
  'occurrences': 8,
  'token': 'عناية',
  'translation': 'ICU'},
 {'Notes': None,
  'Spelling Correction': None,
  'Translator': None,
  'id': 5,
  'occurrences': 10,
  'token': 'اختبار الزمرة الدموية',
  'translation': ', blood typing '}]

In [6]:
# TODO: Convert this to a translation import notebook and move flag work to subsequent notebook

# Copy the database over as the template for the next file.
# This Notebook did not include manual editing of the data.

# Do not rerun this cell! copy2 replaces any existing
# sams_data_phase17_template.sqlite with the current phase16 database.
# The call returns the destination path, which is what the cell displays.
shutil.copy2('sams_data_phase16.sqlite','sams_data_phase17_template.sqlite')

'sams_data_phase17_template.sqlite'