In [1]:
# Manipulate the file system
import os
import shutil

# Copy dictionaries
import copy

# Regular expressions
import re

# Display errors in realtime
import ipywidgets as widgets

# work with dates
import datetime
import arrow
import time

# For scrubbing PII
import hashlib

# Convert stored string representation of a list to a list
import ast

# Recurse through a directory tree and return file names with glob
import glob

# Decode and re-encode mangled Arabic file names
import codecs

# Connect to a SQLite database in a lazy manner.
import sqlalchemy
import dataset

# Enables opening and reading of Excel files
import openpyxl

# Read CSV files (used for the translation spreadsheet export)
import csv

# Translating variables, sheet names, and workbook names from Arabic
# This is NOT free to use.
from google.cloud import translate

# Set the environment variable for the Google Service Account.
# setdefault keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already
# present in the environment, so the notebook is not tied to one machine;
# the hard-coded key-file path below is only the fallback.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    'C:\\Users\\clay\\Documents\\fxb-lcs-2b24f4f8a73a.json',
)

In [2]:
# Start from a fresh working database on every run:
# 1. delete the working copy left behind by a previous run (if any), then
# 2. clone the template database into its place.

try:
    os.remove("sams_data_phase08.sqlite")
    print("Removed template clone sams_data_phase08.sqlite")
except FileNotFoundError:
    # Nothing to clean up (e.g. first run).
    pass
except OSError as err:
    # Still best-effort, but report the problem instead of hiding it
    # (for example the file may be held open by another process).
    print("Could not remove sams_data_phase08.sqlite: {}".format(err))

try:
    # Clone the template; the template file itself is left untouched.
    shutil.copy2("sams_data_phase08_template.sqlite","sams_data_phase08.sqlite")

    print("Created database from template: sams_data_phase08.sqlite")
except OSError as err:
    # Do not raise (matches the original best-effort behaviour), but make the
    # failure visible: later cells expect the cloned database to exist.
    print("Could not create sams_data_phase08.sqlite from template: {}".format(err))

Removed template clone sams_data_phase08.sqlite
Created database from template: sams_data_phase08.sqlite


In [3]:
# Translation file from Houssom
# Loads every CSV row into `data` as a dict keyed by the header names.

translation_file = "split_arabic_phrases_03_OCT_2017_Ongoing Tranlation_Original_V2.csv"
data = []
# newline='' is what the csv module expects for files it reads: it lets the
# reader handle line endings itself, so newlines inside quoted fields survive.
with open(translation_file, 'r', encoding='utf-8', newline='') as csvfile:
    csvreader = csv.reader(csvfile)
    header = next(csvreader)
    # Force the first column's name to 'id', whatever the header cell holds
    # (presumably it is blank or carries a BOM in the export -- confirm).
    header[0] = 'id'
    # zip() pairs each cell with its header; surplus cells or headers are dropped.
    data.extend(dict(zip(header, row)) for row in csvreader)

In [5]:
# Convert some values to integers and use None for empty strings to save database space
#
# Each row of `data` is copied before it is changed, so `data` keeps the raw
# CSV strings and this cell gives the same result when it is re-run.

clean_data = []
skipped_rows = 0
for raw_row in data:
    cleaned = dict(raw_row)
    try:
        cleaned['id'] = int(cleaned['id'])
        cleaned['occurrences'] = int(cleaned['occurrences'])
    except (KeyError, TypeError, ValueError):
        # Rows without an integer id and occurrence count are left out,
        # but counted so the loss is visible below.
        skipped_rows += 1
        continue

    # Blank / whitespace-only strings become None (stored as NULL).
    # Only values are reassigned, so iterating over items() here is safe.
    for key, value in cleaned.items():
        if isinstance(value, str) and value.strip() == '':
            cleaned[key] = None

    # Drop the 'Number of Characters' column when present; it is not stored.
    cleaned.pop('Number of Characters', None)
    clean_data.append(cleaned)

if skipped_rows:
    print("Skipped {} row(s) without an integer id/occurrences".format(skipped_rows))

In [21]:
# Open the working copy of the SQLite database (the file cloned from the
# template earlier in this notebook) through the `dataset` library.
db = dataset.connect("sqlite:///sams_data_phase08.sqlite")

In [22]:
# Replace the arabic_tokens table with the translation data, which includes
# all of the original arabic_tokens data along with additional metadata from Excel
# Step 1 of the replacement: drop the existing table. This is destructive --
# the previous contents of arabic_tokens in the working database are gone
# after this cell runs.
tab_arabic_tokens = db['arabic_tokens']
tab_arabic_tokens.drop()

True

In [23]:
# Step 2 of the replacement: fetch a fresh handle for arabic_tokens and load
# every cleaned CSV row into it.
# NOTE(review): presumably `dataset` re-creates the dropped table here and
# derives its columns from the keys of the inserted dicts -- confirm against
# the dataset documentation for the installed version.
tab_arabic_tokens = db['arabic_tokens']
tab_arabic_tokens.insert_many(clean_data)

After a review of the data, the best approach here is probably to take the translated terms for the tokens that were split out of the original Arabic values and join them back together, comma-separated, into the "Human Translation" (`human_translate`) column of the Arabic Values (`arabic_values`) table. Because those rows map back to the raw data, this will give an approximation of how well we can provide a human-translated glimpse of the raw source.

In [24]:
# Handle for the arabic_values_tokens table; its rows are read below through
# their 'arabic_values_id' and 'token_id' fields.
tab_arabic_values_tokens = db['arabic_values_tokens']

In [26]:
# Group the link table by Arabic value: arabic_values_id -> set of token ids.
join_lookup = {}

for link in tab_arabic_values_tokens.find():
    value_id = link['arabic_values_id']
    # setdefault creates the empty set the first time a value id is seen.
    join_lookup.setdefault(value_id, set()).add(link['token_id'])

In [86]:
# Map each translated token id to its list of translation terms
# (comma-split, lower-cased, trimmed, blanks removed).
token_lookup = {}

for token_row in tab_arabic_tokens.find():
    raw_translation = token_row['translation']

    # Tokens without a translation have nothing to join back: no entry at all.
    if raw_translation is not None:
        pieces = raw_translation.split(",")
        token_lookup[token_row['id']] = [
            piece.lower().strip() for piece in pieces if piece.strip() != ''
        ]

In [92]:
# Build one update record per Arabic value: the translation terms of all of
# its tokens, joined with ", " into the 'human_translate' field.
# Values whose tokens have no translation at all produce no record.
ar_values_to_update = []
for key, token_ids in join_lookup.items():
    all_vals = []
    # Visit the token ids in sorted order so the joined string is deterministic.
    for token_rec in sorted(token_ids):
        # Tokens that have no translation are absent from token_lookup;
        # .get() skips them explicitly instead of swallowing every exception.
        all_vals.extend(token_lookup.get(token_rec, ()))
    if not all_vals:
        continue

    ar_values_to_update.append({
        'id': key,
        'human_translate': ", ".join(all_vals),
    })

In [94]:
# Handle for the arabic_values table, the target of the human_translate updates.
tab_arabic_values = db['arabic_values']

In [95]:
# Write each joined translation back to arabic_values. ['id'] names the key
# column used to find the row to update; the remaining field of `rec`
# ('human_translate') is the value written. One update call is issued per record.
for rec in ar_values_to_update:
    tab_arabic_values.update(rec,['id'])

In [96]:
# Copy the database over as the template for the next file.
# This Notebook did not include manual editing of the data.
# shutil.copy2 returns the destination path, which is what the cell displays.

# Do not rerun this cell!
# NOTE(review): presumably because re-running overwrites
# sams_data_phase09_template.sqlite, which may have been changed since -- confirm.
shutil.copy2('sams_data_phase08.sqlite','sams_data_phase09_template.sqlite')

'sams_data_phase09_template.sqlite'