In [1]:
# Manipulate the file system
import os
import shutil

# Copy dictionaries
import copy

# Regular expressions
import re

# Display errors in realtime
import ipywidgets as widgets

# work with dates
import datetime
import arrow
import time

# For scrubbing PII
import hashlib

# Convert stored string representation of a list to a list
import ast

# Recurse through a directory tree and return file names with glob
import glob

# Decode and re-encode mangled Arabic file names
import codecs

# Connect to a SQLite database in a lazy manner.
import sqlalchemy
import dataset

# Enables opening and reading of Excel files
import openpyxl

# Translating variables, sheet names, and workbook names from Arabic
# This is NOT free to use.
from google.cloud import translate

# Point the Google client libraries at the service-account key file.
# setdefault() keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already
# present in the environment, so machines that configure credentials
# externally are not overridden; the hard-coded path is only the fallback.
# NOTE(review): the fallback is an absolute path on one developer's machine --
# consider moving it to a config cell or an untracked settings file.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    'C:\\Users\\clay\\Documents\\fxb-lcs-2b24f4f8a73a.json',
)

In [2]:
# Start every run from a fresh working database: delete the previous clone
# (if there is one), then copy the template over to the working file name.
db_path = "sams_data_phase17a.sqlite"
template_path = "sams_data_phase18_template.sqlite"

try:
    os.remove(db_path)
    print("Removed template clone {}".format(db_path))
except FileNotFoundError:
    # Nothing to delete on a first run; that is expected.
    pass
except OSError as err:
    # For example the file is still open elsewhere. Report it rather than
    # silently continuing with a stale database.
    print("Could not remove {}: {}".format(db_path, err))

try:
    # Create the working database as a copy of the template (copy2 also
    # carries over file metadata such as timestamps).
    shutil.copy2(template_path, db_path)
    print("Created database from template: {}".format(db_path))
except OSError as err:
    # Without the template copy the later cells would operate on a missing or
    # stale database, so make the failure visible.
    print("WARNING: could not create {} from {}: {}".format(db_path, template_path, err))

Removed template clone sams_data_phase17a.sqlite
Created database from template: sams_data_phase17a.sqlite


In [3]:
# Open the working SQLite file through the `dataset` library. `db` is the
# handle the following cells index by table name.
db = dataset.connect("sqlite:///sams_data_phase17a.sqlite")

In [4]:
# Handle for the source table. A later cell iterates its rows and reads each
# row's 'google_translate' text and 'id'.
tab_arabic = db['arabic_values']

In [5]:
# Table that later receives one {"token": ...} row per distinct token.
tab_eng_tokens = db['eng_tokens']
# Join table that later receives {"token_id", "arabic_values_id"} pairs linking
# each token to the arabic_values rows it was extracted from.
tab_arabic_values_eng_tokens = db['arabic_values_eng_tokens']

In [6]:
# Characters that separate tokens: brackets and punctuation, the ASCII digits,
# and a handful of symbols (the last one appears to be the Arabic tatweel /
# elongation mark -- confirm).
delimiters = (
    ("(", ")", ",", "/", ".")
    + tuple("1234567890")
    + ("+", "_", "-", "\\", "=", "ـ")
)
# A single alternation of the escaped delimiters, suitable for re.split().
regexpattern = "|".join(re.escape(delimiter) for delimiter in delimiters)

In [7]:
# Alef normalisation: the tokenising cell replaces each of the three "bad"
# variants below with `good_char` before splitting, so text that differs only
# in the alef form yields the same token.
# Bare alef (the replacement character):
good_char = 'ا'
# Alef with hamza above:
bad_char1 = 'أ'
# Alef with hamza below:
bad_char2 = 'إ'
# Alef with madda:
bad_char3 = 'آ'

In [8]:
# Build the token index:
#   token text -> {"token_id": None (filled in after the tokens are inserted),
#                  "arabic_values_id": [ids of the rows containing the token]}
token_lookup = {}
# Rows whose 'google_translate' value is missing or not a string; they cannot
# be tokenised and are skipped explicitly instead of via a blanket `except`.
skipped_records = 0

for rec in tab_arabic.find():
    translation = rec.get('google_translate')
    if not isinstance(translation, str):
        skipped_records += 1
        continue

    # Collapse the alef variants to the bare alef before splitting.
    for bad_char in (bad_char1, bad_char2, bad_char3):
        translation = translation.replace(bad_char, good_char)

    # Split on the delimiter pattern, trim whitespace and drop empty pieces.
    stripped_pieces = (piece.strip() for piece in re.split(regexpattern, translation))
    tokens = [piece for piece in stripped_pieces if piece != '']

    for token in tokens:
        entry = token_lookup.setdefault(token, {"token_id": None, "arabic_values_id": []})
        # A token repeated inside one row records that row's id once per occurrence.
        entry["arabic_values_id"].append(rec['id'])

if skipped_records:
    print("Skipped {} record(s) without a usable google_translate value".format(skipped_records))
        

In [9]:
# Number of distinct tokens collected.
len(token_lookup)

208011

In [10]:
# One {"token": ...} row per distinct token, in the lookup's iteration order.
tokens_to_insert = [{"token": token_text} for token_text in token_lookup]

In [11]:
# Peek at the first five rows to sanity-check their shape before inserting.
tokens_to_insert[:5]

[{'token': 'Transient admission'},
 {'token': 'Cushions'},
 {'token': 'recovery'},
 {'token': 'Attention'},
 {'token': 'Blood group test'}]

In [12]:
# Bulk-insert the token rows. The next cell reads the table back to learn the
# 'id' assigned to each token.
tab_eng_tokens.insert_many(tokens_to_insert)

In [13]:
# Read the inserted rows back and record each token's database id in the lookup.
# NOTE(review): assumes every row in eng_tokens has a 'token' that is a key of
# token_lookup (i.e. the table held no other rows) -- otherwise this raises
# KeyError; confirm the template ships with an empty eng_tokens table.
for rec in tab_eng_tokens.find():
    entry = token_lookup[rec['token']]
    entry["token_id"] = rec['id']

In [14]:
# Flatten the lookup into join rows: one {"token_id", "arabic_values_id"} pair
# for every (token, source row) occurrence, in lookup order.
join_records = [
    {"token_id": entry["token_id"], "arabic_values_id": arabic_id}
    for entry in token_lookup.values()
    for arabic_id in entry["arabic_values_id"]
]

In [15]:
# Number of token-to-row links about to be written to the join table.
len(join_records)

689501

In [16]:
# Bulk-insert the token <-> arabic_values links into the join table.
tab_arabic_values_eng_tokens.insert_many(join_records)