In [1]:
# Manipulate the file system
import os
import shutil
import datetime
import arrow

import hashlib

# Display errors in realtime
import ipywidgets as widgets
import time
import re

# This used to be a part of dataset but was extracted to its own library
# https://github.com/pudo/datafreeze
from datafreeze import freeze

# Export database table to CSV
import csv

# Copy dictionaries
import copy

# Convert stored string representation of a list to a list
import ast

# Recurse through a directory tree and return file names with glob
import glob

# Decode and re-encode mangled Arabic file names
import codecs

# Connect to a SQLite database in a lazy manner.
import dataset
import sqlalchemy

# Enables opening and reading of Excel files
import openpyxl

# Translating variables, sheet names, and workbook names from Arabic
# This is NOT free to use.
from google.cloud import translate

# Set the environment variable for the Google Service Account
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'C:\\Users\\clay\\Documents\\fxb-lcs-2b24f4f8a73a.json'

In [2]:
turkey_file = "turkey_data/turkey_to_import.xlsx"

In [3]:
#If there's an existing db for this sheet, delete it
#so that we can copy from the template for a fresh start
new_db_name = "sams_data_phase24.sqlite"

try:
    os.remove(new_db_name)
    print("Removed template clone ", sams_data_phase23.sqlite)
except:
    pass

try:
    # Try to preserve a copy in case there is a problem and it has to be restored
    shutil.copy2("sams_data_phase23_output_2018-04-07.sqlite",new_db_name)
    
    print("Created database from template: ", new_db_name)
except:
    pass

Created database from template:  sams_data_phase24.sqlite


In [4]:
db = dataset.connect("sqlite:///" + new_db_name)

The file from Turkey covers many months and facilities. To support importing it to the same schema, I'm creating dummy records in the files table and inserting a few unknown facility codes.

In [5]:
tab_facilities = db['facilities']

In [6]:
tab_locations = db['locations']

In [7]:
country_rec = {"location_name":"Unknown", "level": "country", "parent_id":None}
loc_id = tab_locations.insert(country_rec)
loc_id

114

In [8]:
gov_rec = {"location_name":"Unknown", "level": "governorate", "parent_id":114}
gov_id = tab_locations.insert(gov_rec)

In [9]:
gov_id

115

In [10]:
dis_rec = {"location_name":"Unknown", "level": "district", "parent_id":115}
dis_id = tab_locations.insert(dis_rec)
dis_id

116

In [11]:
subd_rec = {"location_name":"Unknown", "level": "subdistrict", "parent_id":116}
subd_id = tab_locations.insert(subd_rec)
subd_id

117

In [12]:
new_facs = [
    {
        "facilityname": "Administrators",
        "country": "Unknown",
        "governorate":"Unknown",
        "district":"Unknown",
        "subdistrict":"Unknown",
        "country_id":114,
        "governorate_id":115,
        "district_id":116,
        "subdistrict_id":117,
        "needs_review":1,
        "facility_code":"UNK001"
    },
    {
        "facilityname": "Al Zerbeh PHC",
        "country": "Unknown",
        "governorate":"Unknown",
        "district":"Unknown",
        "subdistrict":"Unknown",
        "country_id":114,
        "governorate_id":115,
        "district_id":116,
        "subdistrict_id":117,
        "needs_review":1,
        "facility_code":"UNK002"
    },
    {
        "facilityname": "test",
        "country": "Unknown",
        "governorate":"Unknown",
        "district":"Unknown",
        "subdistrict":"Unknown",
        "country_id":114,
        "governorate_id":115,
        "district_id":116,
        "subdistrict_id":117,
        "needs_review":1,
        "facility_code":"UNK003"
    }
]

for f in new_facs:
    tab_facilities.insert(f)

### Load the data file

In [13]:
wb = openpyxl.load_workbook(turkey_file, read_only=True)

In [14]:
sheet = wb.sheetnames[0]
sheet

'data'

In [15]:
sheet = wb.get_sheet_by_name('data')
header = sheet["A1":"X1"]
header = [c.value for c in header[0]]
header

['num',
 'facility',
 'facility_code',
 'first_name',
 'last_name',
 'age',
 'age_type',
 'date',
 'month',
 'year',
 'sex',
 'idp_host',
 'new_or_followup',
 'disability',
 'nationality',
 'outcome',
 'drugs',
 'injury_cause',
 'injury_new_or_old',
 'icd10_code',
 'sams_diag_code',
 'diagnosis_arabic',
 'diagnosis_english',
 'info_doctor']

Last row of data is 910273

In [16]:
data = sheet["A2":"X910273"]

In [17]:
files_needed = set()

for row in data:
    t = (row[2].value, row[8].value, row[9].value)
    files_needed.add(t)

In [18]:
list(files_needed)[:5]

[('SAMS003', 8, 2017),
 ('SAMS191', 10, 2017),
 ('SAMS049', 11, 2017),
 ('SAMS057', 10, 2017),
 ('SAMS061', 7, 2017)]

In [19]:
fac_lookup = {}
for rec in tab_facilities.find():
    fac_lookup[rec['facility_code']] = rec 

In [20]:
tab_files = db['files']

In [21]:
for rec in files_needed:
    new_rec = {
        "file_name":"Patient records from Jun until Dec 2017, Northern Syria.xlsx",
        "path":turkey_file,
        "country":fac_lookup[rec[0]]['country'],
        "year":rec[2],
        "num_sheets":1,
        "sheet_names":"['data']",
        "facility_id":fac_lookup[rec[0]]['id'],
        "month":rec[1],
        "added":'2018-04-08',
        "info":"DUMMY FILE TURKEY"
    }
    tab_files.insert(new_rec)

### Handle Variables

These are dummy files so we don't really need to track the mapping of them to the files.

In [22]:
tab_vars = db['variables']
var_lookup = {rec['orig']:rec['id'] for rec in tab_vars.find()}

In [23]:
# Note that we create the set() variables two cells above, when looking for headers
variables = set(header)

for v in variables:
    if v not in var_lookup.keys():
        new_var_rec = {'orig':v, 'translation':v, 'normalized': v, 'added':'2018-04-08' }
        tab_vars.insert(new_var_rec)
        print(v)

diagnosis_arabic
facility
sams_diag_code
info_doctor
drugs
nationality
first_name
outcome
num
injury_cause
facility_code
idp_host
icd10_code
last_name
date
new_or_followup
diagnosis_english
injury_new_or_old
year
age_type
month
disability


In [24]:
fix_vars = {
    "first_name":"info_name",
    "last_name":"info_name"    
}

for k,v in fix_vars.items():
    db.query("UPDATE variables SET normalized = '" + v + "' WHERE normalized = '" + k + "';")

In [25]:
var_lookup = {rec['orig']:rec['id'] for rec in tab_vars.find()}

### Turkey Note

We already hold the data in memory at this point so we do not need the files_sheets_id for the dummy records. Will create it later if that turns out to be a problem.

In [26]:
tab_all = db['full_raw_scrubbed']
tab_vars = db['variables']
tab_files_sheets = db['files_sheets']

In [27]:
# We need to create some columns...
raw_columns = [k for k in tab_all.find_one().keys()]
flag_cols = [c for c in raw_columns if 'flag_' in c]
col_names_we_need = sorted(list(set([r['normalized'] for r in tab_vars.find()])))
missing_col_names = set(col_names_we_need) - set(raw_columns)
missing_col_names

{'age_type',
 'diagnosis_arabic',
 'diagnosis_english',
 'disability',
 'drugs',
 'facility',
 'facility_code',
 'icd10_code',
 'idp_host',
 'info_doctor',
 'injury_cause',
 'injury_new_or_old',
 'nationality',
 'num',
 'sams_diag_code'}

In [28]:
# Create the new columns

for c in missing_col_names:
    tab_all.create_column(c, sqlalchemy.String)

In [29]:
# Rename var_lookup to work with code below

# Create an in-memory lookup table for variables
var_lookup = {}
for r in tab_vars.find():
    var_lookup[r['orig']] = r['normalized']

In [30]:
dummy_file_lookup = {}
dummy_files = db.query("SELECT * FROM files WHERE added = '2018-04-08';")
for rec in dummy_files:
    dummy_file_lookup[(rec['facility_id'],rec['year'],rec['month'])] = rec['id']

In [31]:
zipped_data = []
for rec in data:
    values = []
    for v in rec:
        if v.value == 'NULL':
            values.append(None)
        else:
            values.append(v.value)
    
    vdict = dict(zip(header,values))
    
    facility_id = fac_lookup[vdict['facility_code']]['id']
    
    file_id = dummy_file_lookup[(facility_id,str(vdict['year']),vdict['month'])]

    
    del vdict['facility']
    del vdict['facility_code']
    del vdict['year']
    del vdict['month']
    
    new_vdict = {}
    for k in vdict.keys():
        new_vdict[var_lookup[k]] = vdict[k]
    
    new_vdict['a_file_id'] = file_id
    new_vdict['a_files_sheets_id'] = None
    new_vdict['a_sheet_id'] = None
    new_vdict['added'] = '2018-04-08'
    
    zipped_data.append(new_vdict)

### Scrub PII

In [32]:
# Do not save this value in a source code repository!
salt = 'REDACTED'.encode()

In [33]:
fields = [
    "info_name",
    "info_doctor",
    "info_name_author",
    "info_name_caregiver",
    "info_name_facility",
    "info_name_group",
    "info_name_of_coach",
    "info_name_processor",
    "info_name_surgeon",
    "info_phone_skype"
]

In [35]:
for rec in zipped_data:
    for pii_field in fields:
        if pii_field in rec.keys():
            if rec[pii_field] is None or rec[pii_field] == '.':
                continue
            else:
                # Hash the value in the field
                
                h = hashlib.sha256()
                h.update(str(rec[pii_field]).encode())
                h.update(salt)
                rec[pii_field] = str(h.hexdigest())


In [36]:
zipped_data[0]

{'a_file_id': 1199,
 'a_files_sheets_id': None,
 'a_sheet_id': None,
 'added': '2018-04-08',
 'age_type': 'شهر',
 'date': datetime.datetime(2017, 10, 30, 0, 0),
 'diagnosis_arabic': 'متلازامات الصداع الاخرى ',
 'diagnosis_english': 'Other headache syndromes',
 'disability': None,
 'drugs': None,
 'icd10_code': 'G44',
 'idp_host': 'مقيم',
 'info_age': 6,
 'info_doctor': 'b0eeb75eef72114bf0976275a34239726a953db76c1ba33a59c12b1fc29b5065',
 'info_name': 'cde2ad8a09a3ffb969aa3c1522850a78b2389507c067f4787d7b58c6e1eade23',
 'info_sex': 'ذكر',
 'injury_cause': None,
 'injury_new_or_old': None,
 'nationality': 'سوري',
 'new_or_followup': 'جديد',
 'num': 1537919,
 'outcome': 'قبول غير جراحي /   Non-surgical ',
 'sams_diag_code': 'N6C'}

In [37]:
tab_all.insert_many(zipped_data)

In [38]:
# clear some memory

del data
del zipped_data

### Export to Arabic Values

Including English so that we can use code written previously

In [42]:
tab_raw = db['full_raw_scrubbed']
tab_arabic = db['arabic_values']
tab_vars = db['variables']

column_names = db.query("SELECT DISTINCT(normalized) FROM variables;")
column_names = sorted([r['normalized'] for r in column_names])

# We don't want to work with the values in the fields that have been hashed,
# so remove them from the list of variables to query.
fields = [
    "info_name",
    "info_name_author",
    "info_doctor",
    "info_name_caregiver",
    "info_name_facility",
    "info_name_group",
    "info_name_of_coach",
    "info_name_processor",
    "info_name_surgeon",
    "info_phone_skype",
    "date",
    "date_first_exam",
    "death_date",
    "date_admission",
    "num",
    "sams_diag_code",
    "icd10_code",
    "facility",
    "nationality",
    "facility_code",
    "added"
]
column_names = [e for e in column_names if e not in fields]

arabic_lookup = set([r['arabic'] for r in tab_arabic.find()])

In [43]:
max_id_results = db.query("SELECT max(id) FROM arabic_values;")
for r in max_id_results:
    max_id = int(r['max(id)'])

current_id = max_id + 1

In [44]:
buffer = []
for col in column_names:
    col_values = db.query("""
        SELECT DISTINCT([""" + col + """]) 
        FROM full_raw_scrubbed 
        WHERE added = '2018-04-08'
        AND [""" + col + """] IS NOT NULL
        AND [""" + col + """] <> '.'
        AND [""" + col + """] <> '';
        """)
    col_values = [r[col] for r in col_values]

    # Create a table of unique Arabic values
    for v in col_values:
        if v in arabic_lookup:
            continue
        # Skip numbers
        if v.replace(",",".").replace('.','',1).isdigit():
            continue
        else:
            r = {"id":current_id,"arabic":v,"added":'2018-04-08'}
            current_id += 1
            buffer.append(r)
            arabic_lookup.add(v)
            
# 367621

tab_arabic.insert_many(buffer)

### Translate the new values

In [46]:
recs = [r for r in tab_arabic.find(added='2018-04-08')]

In [47]:
translate_client = translate.Client()
target_lang = 'en'

In [48]:
record_counter = widgets.HTML(value="Records: 0",continuous_update=True)
character_counter = widgets.HTML(value="Characters: 0",continuous_update=True)
error_counter = widgets.HTML(value="Errors: 0",continuous_update=True)

In [52]:
character_count = 0
record_count = 0
error_count = 0

print("Total Records",len(recs))

updates = []

for rec in recs:
    try:
        arabic = rec['arabic']
        english = translate_client.translate(arabic, target_language=target_lang)
        english = english['translatedText']
        update_rec = {
            "id":rec['id'],
            "google_translate_feb":english
        }
        updates.append(update_rec)
        
        character_count += len(arabic)
        character_counter.value = "Characters: " + str(character_count)
        record_count += 1
        record_counter.value = "Records: " + str(record_count)
    except:
        error_count += 1
        error_counter.value = "Errors: " + str(error_count)
        time.sleep(10)
        
print("--------------\nDone")
print("Record Count",record_count)
print("Character Count",character_count)

Total Records 41353
--------------
Done
Record Count 41353
Character Count 1210616


In [49]:
record_counter

In [50]:
character_counter

In [51]:
error_counter

In [53]:
updates[0]

{'google_translate_feb': 'Other headaches', 'id': 400448}

In [54]:
for rec in updates:
    tab_arabic.update(rec,['id'])

### Flag Generation

In [55]:
# Get a reference to the arabic_values table
tab_arabic_values = db['arabic_values']

In [56]:
try:
    db['full_raw_flags'].drop()
    print("Dropped full_raw_flags")
except:
    pass

try:
    db['full_raw_flags_reduced'].drop()
    print("Dropped full_raw_flags_reduced").drop()
except:
    pass

# One SQLite limitation is you cannot drop columns, so you have to create a new table and then rename it.
preserve_fields = [k for k in tab_arabic_values.find_one().keys() if 'flag_' not in k]

# We don't use result but assigning it skips printing some garbage below
result = db.query("""
CREATE TABLE new_arabic_values AS 
    SELECT """ + ",".join(preserve_fields) + """ 
    FROM arabic_values;
""")

# Drop the original arabic_values table
tab_arabic_values.drop()

# Rename new_arabic_values to arabic_values & now we have a table with no flag columns
result = db.query("""
ALTER TABLE new_arabic_values RENAME TO arabic_values;
""")

Dropped full_raw_flags
Dropped full_raw_flags_reduced


In [57]:
# Now because we futzed with the arabic_values table, we have to create a new reference to the database
# and to our arabic_values table. The db object stores some schema information that isn't updated with
# our direct query calls above.

del db
del tab_arabic_values

db = dataset.connect("sqlite:///" + new_db_name)
tab_arabic_values = db['arabic_values']

In [58]:
# Now create an in-memory representation of the arabic_values table
# and store it in variable `data`
data = [x for x in tab_arabic_values]

In [59]:
len(data)

412605

In [60]:
# Update this if you want to change what flags you are making on the dataset.
# The logic for creating them is in the following cell.

# Require and flag term
flag_terms = [
    "blunt",
    "explosive",
    "blast",
    "stab",
    "upper extremity",
    "lower extremity",
    "neck",
    "chest",
    "back",
    "spinal",
    "neurologic",
    "nerve",
    "vascular",
    "orthopedic",
    "fracture",
    "suspected",
    "follow-up",
    "complication",
    "history of",
    "traffic accident"
]

# require all terms - not in use at the moment
multiple_flag_terms = [
#     ("burn","fracture")
]

# require any of the terms but name the flag after the first
synonym_flag_terms = [
    ("allergy", "allergic"),
    ("anemia", "thalassemia"),
    ("cancer", "bcc", "leukemia", "lymphoma", "malignancy", "malignant", "scc"),
    ("cardiovascular"," asd "," vsd ","cholesterol","hypercholesterolemia","hyperlipidemia","hypertriglyceridemia","triglycerides","blood pressure"," bp ","high blood pressure","hypertension","acute coronary syndrome","angina","arrhythmia","atrial fibrillation"," avr ","cardiac ischemia","chest pain","clot","clotting","coronary atery","coronary heart disease","coronary ischemia","dvt","endocarditis","heart attack","heart disease","heart failure","heart valve","hf","hypotension","ihd"," mi ","mitral valve prolapse","mvr","myocardial hypoperfusion","myocardial infarction","palpitations","pericarditis","pulmonary embolism","pvd","svt","thromboembolism","thrombophlebitis","thrombosis","vasculitis"),
    ("congenital", "asd", "vsd"),
    ("dehydration", "dehydration", "hypovolemic shock"),
    ("dental complaint", "dental", "gingivitis", " gum ", "odonitis", "teeth", "tooth", "toothache"),
    ("derm", "acne","alopecia","blisters","cellulitis","dermatitis","dermoid","dry skin","eczema","folliculitis","hair loss","inflammatory papules","intertrigo","itch","lice","pruritis","psoriasis","rash","ringworm","scabies","skin disease","skin disorder","skin eruption","skin infection","skin lesion","tinea","warts"),
    ("diabetes","diabetic","DKA","glucose","hyperglycemia","hypoglycemia","sugar"),
    ("endocrine","hyperthyroid","hyperthyroidism","hypocalcemia","hypothyroid","hypothyroidism","parathyroid","thyroid"," TSH "),
    ("infection","conjunctivitis","eye discharge","eye infection","keratoconjunctivitis","ophthalmic infection"),
    ("pain", "corneal inflammation", "eye sensitivity", "keratitis", "pain in the eye"),
    ("fatigue", "exhaustion", "tired", "tiredness"),
    ("fever", "hyperthermia", "temperature"),
    ("constipation", "intestinal stasis"),
    ("shrapnel", "fragments","sliver","splinter"),
    ("musculoskeletal pain","ankylosing spondylitis","arthralgia","Arthritis","back pain","bruise","bruising","chondritis","contusion","costochondritis","disc herniation","disc herniation","discitis","elbow pain","extremity pain","gout","inflammation of the shoulder","joint","knee degeneration","knee inflammation","knee pain","loin pain","low back pain","lumbar pain","musclar pain","Muscle spasm","muscular pain","myalgia","myositis","neck pain","osteoarthritis","osteomyelitis","osteomylitis","plantar fasciitis","polyarthritis","rheumatism","sacroiliitis","spine degeneration","sprain","strain","synovitis","tendinitis","tendonitis","tendonopathy","tmj"),
    ("headache", "head pain"),
    ("stroke","cerebral accident","cerebral hemorrhage","cerebral infarction","cerebral ischemia","cerebrovascular accident"," cva "),
    ("gunshot", " shot "),
    
    # Prior flags, preserved
    ("facial","face"),
    ("pelvic","pelvis"),
    ("head","eye","ear","face","brain","scalp","mouth","nose"),
    ("spine","spinal"),
    ("abdomen","abdominal")
]

# require the first term and the absence of the remaining terms
# name the flag after the first term.
complex_flag_terms = [
    ("urologic","neurologic"),
    ("burn","heartburn"),
    ("trauma", "psychological trauma")
]

# Look for any of the terms in terms_to_find but only apply if terms in terms_to_avoid are absent.
# Check human or google translation (ht, gt)

complex_set_flag_terms = [
    {
        "flag_name": "hyperlipidemia",
        "terms_to_find": ["blood pressure", "bp", "high blood pressure", "hypertension"],
        "terms_to_avoid": ["hypotension"],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "ENT",
        "terms_to_find": ["adenoiditis","ear congestion","ear discharge","ear infection","ear inflammation","eustachian tube infection","mucositis","mumps","nasal congestion","nose congestion","otitis","otorrhea","pharyngitis","throat ache","tonsillitis","tonsils enlargement","cerumen impaction","dysphagia","earache","epistaxis","hearing impairment","hearing loss","nasal obstruction","pain in the ear","pharyngeal pain","pharynx pain","swallowing pain","vestibulitis"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "infection",
        "terms_to_find": ["adenoiditis","ear congestion","ear discharge","ear infection","ear inflammation","eustachian tube infection","laryngitis","mucositis","mumps","nasal congestion","nose congestion","otitis","otorrhea","pharyngitis","rhinitis","sinusitis","throat ache","tonsillitis","tonsils enlargement"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "eye",
        "terms_to_find": ["conjunctivitis","eye discharge","eye infection","keratoconjunctivitis","ophthalmic infection","corneal inflammation","eye sensitivity","keratitis","pain in the eye","blepharitis","cataract","eye redness","eyelid","eye-redness","glaucoma","left eye","my eye","npdr","pterygium","pupil","redness of the eye","retinal","retinopathy","right eye","swelling of the eye","uveitis","vision"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "gi_complaint",
        "terms_to_find": ["abdominal injury","apendicitis","appendicitis","belly pain","bile duct obstruction","bile stones","cholecystitis","colic","colitis","colon spasm","Crohn","duodenal ulcer","enteritis","epigastric pain","flank pain","gallbladder inflammation","gastric pain","gastric ulcer","gastritis","gastroenteritis","gastrointestinal infection","hiatal hernia","ibd","ibs","indigestion","inflammation of the stomach","intestinal pain","intestinal ulcer","pain in the stomach","pancreatitis","peptic ulcer","peritoneal inflammation","peritonitis","sore stomach","stomach hurts","stomach pain","Digestive bleed","Gastric bleeding","Gastric hemorrhage","Gastrointestinal bleeding","hemorrhoids","Ulcer of the colon","Constipation","intestinal stasis","diarrhea","dysentery","food poisoning","giardia","typhoid","cirrhosis","hapatitis","hep a","hep b","hep c","hepatic","jaundice","nausea","vomiting","vomitting","anal fissure","bloating","celiac disease","esophageal reflux","gastroesophageal reflux","gerd","heartburn","inguinal hernia","malabsorption","umbilical fistula","umbilical hernia"],
        "terms_to_avoid": ["renal colic"],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "abdominal_pain",
        "terms_to_find": ["abdominal injury","apendicitis","appendicitis","belly pain","bile duct obstruction","bile stones","cholecystitis","colic","colitis","colon spasm","Crohn","duodenal ulcer","enteritis","epigastric pain","flank pain","gallbladder inflammation","gastric pain","gastric ulcer","gastritis","gastroenteritis","gastrointestinal infection","hiatal hernia","ibd","ibs","indigestion","inflammation of the stomach","intestinal pain","intestinal ulcer","pain in the stomach","pancreatitis","peptic ulcer","peritoneal inflammation","peritonitis","sore stomach","stomach hurts","stomach pain"],
        "terms_to_avoid": ["renal colic"],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "bleed",
        "terms_to_find": ["Digestive bleed","Gastric bleeding","Gastric hemorrhage","Gastrointestinal bleeding","hemorrhoids","Ulcer of the colon"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "diarrhea_dysentery",
        "terms_to_find": ["diarrhea","dysentery","food poisoning","giardia","typhoid"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "liver_dysfunction",
        "terms_to_find": ["cirrhosis","hapatitis","hep a","hep b","hep c","hepatic","jaundice"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "nausea_vomiting",
        "terms_to_find": ["nausea","vomiting","vomitting"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "gu",
        "terms_to_find": ["cystitis","dysuria","epididymitis","genital infection","herpes","orchitis","sexually transmitted infection","urethritis","urinary infection","Urinary tract infection","urogenital infection","UTI","bladder","hematuria","incontinence","pelvic mass","urinary disorder","urinary retention","urinary symptoms","varicocele"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "infection",
        "terms_to_find": ["cystitis","dysuria","epididymitis","genital infection","herpes","orchitis","sexually transmitted infection","urethritis","urinary infection","Urinary tract infection","urogenital infection","UTI"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "gyn_women",
        "terms_to_find": ["breast","endometriosis","fibroids","gynecological","hot flashes","irregular cycle","mastitis","menopause","menstrual","ovarian","ovary","ovulation","reproductive health","uterine","uterus","vagina","vaginal","vaginitis"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "injury",
        "terms_to_find": ["bite","sting","stinging","cut","wound","injury","blast","burn","fracture","gunshot","shot","hemiplegia","paralysis","paraplegia","quadriplegia","fragments","shrapnel","sliver","splinter","traffic accident","abrasion","bruise","bruising","Concussion","contusion","falling","knee rupture","splint","trauma"],
        "terms_to_avoid": ["psychological trauma","heartburn"],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "injury",
        "terms_to_find": ["ulcer"],
        "terms_to_avoid": ["gastric", "stomach", "peptic", "intestinal", "duodenal"],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "injury_neuro",
        "terms_to_find": ["hemiplegia","paralysis","paraplegia","quadriplegia"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "malnutrition",
        "terms_to_find": ["delayed growth","growth delay","growth retardation","short stature","malnutrition"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "growth_delay",
        "terms_to_find": ["delayed growth","growth delay","growth retardation","short stature"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "mental_health",
        "terms_to_find": ["anxiety","bipolar","mental illness","personality disorder","post traumatic syndrome","post-traumatic syndrome","psychiatric","psychological","ptsd","schizophrenia"],
        "terms_to_avoid": [],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "mental_health",
        "terms_to_find": ["depression"],
        "terms_to_avoid": [],
        "check": ["ht"]
    },
    {
        "flag_name": "neuro_complaint",
        "terms_to_find": ["head pain","headache","cerebral accident","cerebral hemorrhage","cerebral infarction","cerebral ischemia","cerebrovascular accident","cva","stroke","benign paroxysmal postitional vertigo","brachial plexus","brain infection","brain tumor","cauda equina","cerebral palsy","cervical root","convulsion","convulsions","dementia","dizziness","encephalitis","epilepsy","epileptic","foot drop","hand drop","meningitis","meningocele","migraine","nerve","neuritis","neurodegenerative","neurological","neuropathy","numbness","nystagmus","polyneuritis","sciatica","seizure","subarachnoid hemorrhage","TIA","tinnitus","Vertigo"],
        "terms_to_avoid": [],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "other_infection",
        "terms_to_find": ["mediterranean fever","mf","abcess","abscess","sepsis","septic shock","bacteremia","brucellosis","chickenpox","diphtheria","finger infection","foot infection","fungal","hand foot","hand infection","hand mouth","hand-foot","hookworm","infection of blood","intestinal worms","leprosy","lymphadenitis","lymphadenopathy","measles","nemotodes","omphalitis","parasite","pinworm","rheumatic fever","rubella","scarlet fever","thrush","toe infection","worms"],
        "terms_to_avoid": [],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "other_infection",
        "terms_to_find": ["leishmania","leishmaniasis"],
        "terms_to_avoid": ["excluding leishmaniasis", "excluding leishmania", "except leishmaniasis"],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "pregnancy",
        "terms_to_find": ["abortion","antenatal","birth","caesarean section","csection","delivery","gestation","miscarriage","placenta","postnatal","postpartum","pregnancy","pregnant","prenatal"],
        "terms_to_avoid": ["not pregnant"],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "renal",
        "terms_to_find": ["hydronephrosis","kidney cysts","kidney failure","kidney stone","nephritis","nephrolithiasis","nephropathy","pyelonephritis","renal calculi","renal calculus","renal failure","renal impairment","renal insufficiency","renal stones"],
        "terms_to_avoid": [],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "respiratory",
        "terms_to_find": ["laryngitis","rhinitis","sinusitis","bronchiolitis","bronchitis","cold","congestion","cough","croup","flu","grippe","influenza","penumonia","pneumonia","pneumonitis","pulmonary infection","respiratory infection","respiratory tract infection","rhinorrhea","running nose","runny nose","tuberculosis","urti","asthma","bronchospasm","COPD","difficulty breathing","dyspnea","emphysema","hemoptysis","lung disease","nebulization","nebulizing","pulmonary disease","pulmonary fibrosis","shortness of breath","sneezing"],
        "terms_to_avoid": [],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "infection",
        "terms_to_find": ["bronchiolitis","bronchitis","cold","congestion","cough","croup","flu","grippe","influenza","penumonia","pneumonia","pneumonitis","pulmonary infection","respiratory infection","respiratory tract infection","rhinorrhea","running nose","runny nose","tuberculosis","urti"],
        "terms_to_avoid": [],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "wound",
        "terms_to_find": ["dressing change"],
        "terms_to_avoid": [],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "animal_insect_bite",
        "terms_to_find": ["bite","sting","stinging"],
        "terms_to_avoid": [],
        "check": ["ht", "gt"]
    }
]

In [61]:
# Store the rows we change here
# so that we can update the table

update_data = []

# Iterate through the in-memory representation
for rec in data:
    # Create a placeholder update record
    update_rec = {'id':rec['id']}
    
    # A flag we'll use to determine whether the record needs to be updated
    update_record = False
    
    # Get the human_translate value from the record
    ht = rec['human_translate']
    
    # If it is not None, then convert it to lowercase
    if ht:
        ht = ht.lower()
    
    # Get the google_translate value and convert it to lowercase
    # We are not currently using this to generate flags so it is commented out
    # but you could substitute it in below or write additional code if you want
    # to use it for flag generation
    gt = rec['google_translate_feb']
    if gt:
        gt = gt.lower()
    
    # Look at google_tokens_joined field
    gtj = rec['google_tokens_joined']
    if gtj:
        gtj = gtj.lower()
    
    
    # Walk through the different flag types from above and check whether the 
    # human_translate value matches for that flag. If so, create the update record
    # for that flag and then mark our update boolean indicator true so that we know
    # to update the appropriate record in the database. All records that will be updated
    # have their update record put into the update_data list.
    for term in flag_terms:
        if (ht and term in ht) or (gt and term in gt) or (gtj and term in gtj):
            update_rec["flag_" + "_".join(term.replace("-","_").split())] = 1
            update_record = True

    for tup in multiple_flag_terms:
        if (ht and all(x in ht for x in tup)) or (gt and all(x in gt for x in tup)) or (gtj and all(x in gtj for x in tup)):
            update_rec["flag_" + "_and_".join(tup)] = 1
            update_record = True

    for tup in synonym_flag_terms:
        if (ht and any(x in ht for x in tup)) or (gt and any(x in gt for x in tup)) or (gtj and any(x in gtj for x in tup)):
            update_rec["flag_" + "_".join(tup[0].split())] = 1
            update_record = True

    for tup in complex_flag_terms:
        if (ht and tup[0] in ht and not any(x in ht for x in tup[1:])) or (gt and tup[0] in gt and not any(x in gt for x in tup[1:])) or (gtj and tup[0] in gtj and not any(x in gtj for x in tup[1:])):
            update_rec["flag_" + tup[0].replace(" ","_").replace("-","_")] = 1
            update_record = True

    # complex_set_flag_terms
    for rule in complex_set_flag_terms:
        flag_name = "flag_" + "_".join(rule['flag_name'].split())

        # Continue because we already set this flag
        if flag_name in update_rec.keys():
            if update_rec[flag_name] == 1:
                continue

        if "ht" in rule['check']:
            if ht and any(x in ht for x in rule["terms_to_find"]) and not any(x in ht for x in rule["terms_to_avoid"]):
                update_rec[flag_name] = 1
                update_record = True
                # We set the flag so stop searching
                continue

        if "gt" in rule['check']:
            if gt and any(x in gt for x in rule["terms_to_find"]) and not any(x in gt for x in rule["terms_to_avoid"]):
                update_rec[flag_name] = 1
                update_record = True
                # We set the flag so stop searching
                continue

            if gtj and any(x in gtj for x in rule["terms_to_find"]) and not any(x in gtj for x in rule["terms_to_avoid"]):
                update_rec[flag_name] = 1
                update_record = True
                # We set the flag so stop searching
                continue
            
    # Handle war-related separately. This very likely can be improved upon
    if ht and 'war-related injury' in ht and 'not war-related injury' not in ht:
        update_rec['flag_conflict_related'] = 1
        update_record = True
    
    # If we created any flags, update_record is true so put this record in the list 
    # of records to update.
    if update_record:
        # Create comprehensive injury flag per Ranya's request
        keys = update_rec.keys()
        if ('flag_injury' in keys and update_rec['flag_injury'] == 1) or ('flag_wound' in keys and update_rec['flag_wound'] == 1):
            update_rec['flag_comprehensive_injury'] = 1
        else:
            update_rec['flag_comprehensive_injury'] = 0
                
        update_data.append(update_rec)

In [62]:
# How many records are we going to update in the arabic_values table?
len(update_data)

123791

In [63]:
# What do the update records look like? 
update_data[-10:]

[{'flag_comprehensive_injury': 0,
  'flag_infection': 1,
  'flag_respiratory': 1,
  'id': 441576},
 {'flag_comprehensive_injury': 0,
  'flag_infection': 1,
  'flag_respiratory': 1,
  'id': 441616},
 {'flag_comprehensive_injury': 0, 'flag_head': 1, 'id': 441624},
 {'flag_comprehensive_injury': 0, 'flag_derm': 1, 'id': 441649},
 {'flag_comprehensive_injury': 0, 'flag_derm': 1, 'id': 441651},
 {'flag_comprehensive_injury': 0,
  'flag_nerve': 1,
  'flag_neuro_complaint': 1,
  'id': 441666},
 {'flag_comprehensive_injury': 0, 'flag_derm': 1, 'id': 441673},
 {'flag_comprehensive_injury': 0, 'flag_other_infection': 1, 'id': 441715},
 {'flag_comprehensive_injury': 0, 'flag_hyperlipidemia': 1, 'id': 441726},
 {'flag_comprehensive_injury': 1, 'flag_injury': 1, 'id': 441781}]

In [64]:
# Update the arabic_values table with the update_records' data
# 1. Create the columns we need
# 2. Bulk update for each column

flag_cols = set()
for rec in update_data:
    for k in rec.keys():
        if k != 'id':
            flag_cols.add(k)
flag_cols = sorted(list(flag_cols))

# The trick here is to get the id from a record in arabic values and update that
# record with a None value for each of these flags - that will cause dataset to generate the columns
ref_rec = tab_arabic_values.find_one()
ref_rec_update = {'id':ref_rec['id']}
for col in flag_cols:
    ref_rec_update[col] = None
tab_arabic_values.update(ref_rec_update, ['id'])

# At this point maybe open DB Browser for SQLite to make sure the columns were created.
# The 1 that prints below is the number of records updated.

1

In [65]:
# Now iterate through the flag cols and create a list of each record that needs to set the value for each
# flag column and then bulk update. It is orders of magnitude faster to do it this way than one by one.

# Note - this is generating and executing some super gnarly long SQL queries with tons of ID numbers

for col in flag_cols:
    recs_to_update = []
    for rec in update_data:
        if col in rec.keys():
            recs_to_update.append(rec['id'])
    recs_to_update = sorted(recs_to_update)

    db.query("""
    UPDATE arabic_values
    SET """ + col + """ = 1 
    WHERE id IN (""" + ",".join([str(a) for a in recs_to_update]) +""");
    """)
    
# After this runs, check in the database against to make sure the flags were properly applied.

In [66]:
# Get a new db connection again in case the schema has changed.
# This probably isn't necessary but is a safety measure.

try:
    del db
    del tab_arabic_values
except:
    pass

db = dataset.connect("sqlite:///" + new_db_name)
tab_arabic_values = db['arabic_values']

In [67]:
# Get a reference to the raw Arabic data table
tab_raw_ar = db['full_raw_scrubbed']

In [68]:
# Get the list of variables used in full_raw_scrubbed and full_raw_english
rec_raw = tab_raw_ar.find_one()
variables = list(rec_raw.keys())
print(",".join(variables))

# Due to previous work, there are flag columns in the full_raw_scrubbed table, but we will ignore them
# because they aren't used in this flag-generation methodology. 

id,a_file_id,a_files_sheets_id,a_sheet_id,acceptance_pattern,analysis,analysis_request,analysis_type,anesthesia_type,assign_method,case,category,center,clinic,clinical_case,col_1,col_2,col_3,col_4,col_5,col_6,col_misc,col_moawak,col_none,col_null,col_to,conflict_related,consultations,daily_number,data_validation,date,date_first_exam,death,death_cause,death_certificate,death_date,death_location,death_time,department,diagnosis,diagnosis_confirmed,discharge,discharge_date,discharge_status,discharged_to,disclaimers,disease,displaced,displacement_duration,dose,drug_class,er,events,exam_type,examination_1,examination_2,examination_3,examination_4,examination_type,facility_type,housing,housing_persons_number,image,image_request,image_type,import_status,info_age,info_card_number,info_card_type,info_care_type,info_geo_address,info_geo_area,info_geo_community,info_geo_country_of_origin,info_geo_district,info_geo_governorate,info_geo_injury_city,info_geo_injury_site,info_geo_injury_state,info_geo

In [69]:
# Create the in-memory arabic_values lookup
# This time, since we created the flags, they'll be in the records

arabic_lookup = {}
arabic_values = [x for x in tab_arabic_values.find()]

for v in arabic_values:
    arabic_lookup[v['arabic']] = v

In [70]:
# Let's test that a value we pull out of the database has a hit in the lookup table.
test_rec = tab_raw_ar.find_one()
diagnosis = test_rec['diagnosis']
print(diagnosis)
print("---------------------- Lookup result below")
print(arabic_lookup[diagnosis])

التهاب مجاري تنفسية سفلى, التهاب قصبات
---------------------- Lookup result below
OrderedDict([('id', 3863), ('arabic', 'التهاب مجاري تنفسية سفلى, التهاب قصبات'), ('google_translate', 'Inflammation of lower respiratory tracts, bronchitis'), ('human_translate', 'bronchiolitis, bronchitis'), ('normalized', None), ('appears_in', "['diagnosis']"), ('google_translate_feb', 'Inflammation of lower respiratory tracts, bronchitis'), ('google_tokens_joined', 'lower respiratory tract infection, bronchitis'), ('orig_value', None), ('added', None), ('flag_ENT', None), ('flag_abdomen', None), ('flag_abdominal_pain', None), ('flag_allergy', None), ('flag_anemia', None), ('flag_animal_insect_bite', None), ('flag_back', None), ('flag_blast', None), ('flag_bleed', None), ('flag_blunt', None), ('flag_burn', None), ('flag_cancer', None), ('flag_cardiovascular', None), ('flag_chest', None), ('flag_complication', None), ('flag_comprehensive_injury', '1'), ('flag_conflict_related', None), ('flag_congenital',

In [71]:
# The insert_many method inserts in chunks of 1000, but this specifies that we don't want
# to start the process until we have this many records to insert.
buffer_size = 50000

flags_to_insert = []

try:
    db['full_raw_flags'].drop()
except:
    pass

tab_raw_flags = db['full_raw_flags']

# Insert a dummy record to create the table
dummy_record = {
    'file_id':None,
    'files_sheets_id':None,
    'sheet_id':None
}

for flag in flag_cols:
    dummy_record[flag] = None
    
tab_raw_flags.insert(dummy_record)
print(tab_raw_flags.count())
tab_raw_flags.delete()


# Iterate through the raw records one by one
for rec in tab_raw_ar.find():
    
    # Include foreign keys that allow us to query against the flag table instead of 
    # joining with the raw data table, which is slow.
    flag_record = {
        'id':rec['id'],
        'file_id':rec['a_file_id'],
        'files_sheets_id':rec['a_files_sheets_id'],
        'sheet_id':rec['a_sheet_id']
    }
    
    # Initialize each flag_record
    for flag in flag_cols:
        flag_record[flag] = None
        
    # Scan the conflict related column for values, but do this before looking at the
    # corresponding Arabic values so that we don't overwrite the Arabic value setting.
    if rec['conflict_related'] is not None:
        if rec['conflict_related'].strip() == 'كبرى' or rec['conflict_related'].strip() =='كبرى':
            flag_record['flag_conflict_related'] = 1
        elif rec['conflict_related'].strip() == 'لا':
            flag_record['flag_conflict_related'] = 0
        else:
            flag_record['flag_conflict_related'] = None
    else:
        flag_record['flag_conflict_related'] = None
        
    # Loop through the variables for each raw data record
    for v in variables:
        # These are obfuscated PII cols, or the flag columns we're ignoring, so skip them
        if 'info_' in v or 'flag_' in v or v == 'id':
            continue
        
        # Get the value in the column
        to_lookup = rec[v]
        
        if to_lookup is None or to_lookup == '.':
            continue
        else:
            
            # We have a legit value, so look it up and grab the flags
            try:
                # There might be a keyerror on the info_ columns' hashed values, etc.
                # I also manually removed some PII from arabic_values, so that might
                # cause an occassional mismatch.
                arabic_values_rec = arabic_lookup[to_lookup]
                for flag in flag_cols:
                    # Should be None if not flagged, so just check for existence
                    if arabic_values_rec[flag]:
                        flag_record[flag] = arabic_values_rec[flag]
            except:
                pass
    
    # Store the record
    flags_to_insert.append(flag_record)

    # Check if we need to insert
    if len(flags_to_insert) > buffer_size:
        tab_raw_flags.insert_many(flags_to_insert)
        
        # Clear the buffer
        flags_to_insert.clear()
        
# We've been through all raw records so make sure the buffer is clear
tab_raw_flags.insert_many(flags_to_insert)
flags_to_insert.clear()

1


The `flag_comprehensive_injury` logic seems possibly incorrect, so run a SQL command to set it as Ranya suggested it be set.

In [76]:
# First set all records to NULL
res = db.query("""
    UPDATE full_raw_flags SET flag_comprehensive_injury = NULL;
""")

In [77]:
# Set it equal to 1 if flag_injury == 1 or flag_wound == 1
res = db.query("""
    UPDATE full_raw_flags SET flag_comprehensive_injury = 1 WHERE flag_injury = 1 OR flag_wound = 1;
""")

## Export Data

In [82]:
res = db.query("""
    UPDATE files SET problem_opening = 0 WHERE problem_opening IS NULL;
""")
res = db.query("""
    UPDATE files SET skipped = 0 WHERE skipped IS NULL;
""")
res = db.query("""
    UPDATE files SET ignore = 0 WHERE ignore IS NULL;
""")

res = db.query("""
    UPDATE files SET processed = 1 WHERE processed IS NULL AND added = '2018-04-08';
""")

In [83]:
# You can change this query to export a different set of data
result = db.query("""
SELECT  files.id as files_id,
        files.year,
        files.month,
        files.year || '-' || files.month || '-01' AS full_date,
        facilities.id AS facility_id,
        facilities.facility_parent_id,
        facilities.facilityname,
        facilities.country,
        facilities.governorate,
        facilities.district,
        facilities.subdistrict,
        facilities.facility_type,
        full_raw_flags.flag_abdomen,
        full_raw_flags.flag_abdominal_pain,
        full_raw_flags.flag_allergy,
        full_raw_flags.flag_anemia,
        full_raw_flags.flag_animal_insect_bite,
        full_raw_flags.flag_back,
        full_raw_flags.flag_blast,
        full_raw_flags.flag_bleed,
        full_raw_flags.flag_blunt,
        full_raw_flags.flag_burn,
        full_raw_flags.flag_cancer,
        full_raw_flags.flag_cardiovascular,
        full_raw_flags.flag_chest,
        full_raw_flags.flag_complication,
        full_raw_flags.flag_conflict_related,
        full_raw_flags.flag_congenital,
        full_raw_flags.flag_constipation,
        full_raw_flags.flag_dehydration,
        full_raw_flags.flag_dental_complaint,
        full_raw_flags.flag_derm,
        full_raw_flags.flag_diabetes,
        full_raw_flags.flag_diarrhea_dysentery,
        full_raw_flags.flag_endocrine,
        full_raw_flags.flag_ENT,
        full_raw_flags.flag_explosive,
        full_raw_flags.flag_eye,
        full_raw_flags.flag_facial,
        full_raw_flags.flag_fatigue,
        full_raw_flags.flag_fever,
        full_raw_flags.flag_follow_up,
        full_raw_flags.flag_fracture,
        full_raw_flags.flag_gi_complaint,
        full_raw_flags.flag_growth_delay,
        full_raw_flags.flag_gu,
        full_raw_flags.flag_gunshot,
        full_raw_flags.flag_gyn_women,
        full_raw_flags.flag_head,
        full_raw_flags.flag_headache,
        full_raw_flags.flag_history_of,
        full_raw_flags.flag_hyperlipidemia,
        full_raw_flags.flag_infection,
        full_raw_flags.flag_injury,
        full_raw_flags.flag_injury_neuro,
        full_raw_flags.flag_liver_dysfunction,
        full_raw_flags.flag_lower_extremity,
        full_raw_flags.flag_malnutrition,
        full_raw_flags.flag_mental_health,
        full_raw_flags.flag_musculoskeletal_pain,
        full_raw_flags.flag_nausea_vomiting,
        full_raw_flags.flag_neck,
        full_raw_flags.flag_nerve,
        full_raw_flags.flag_neuro_complaint,
        full_raw_flags.flag_neurologic,
        full_raw_flags.flag_orthopedic,
        full_raw_flags.flag_other_infection,
        full_raw_flags.flag_pain,
        full_raw_flags.flag_pelvic,
        full_raw_flags.flag_pregnancy,
        full_raw_flags.flag_renal,
        full_raw_flags.flag_respiratory,
        full_raw_flags.flag_shrapnel,
        full_raw_flags.flag_spinal,
        full_raw_flags.flag_spine,
        full_raw_flags.flag_stab,
        full_raw_flags.flag_stroke,
        full_raw_flags.flag_suspected,
        full_raw_flags.flag_traffic_accident,
        full_raw_flags.flag_trauma,
        full_raw_flags.flag_upper_extremity,
        full_raw_flags.flag_urologic,
        full_raw_flags.flag_vascular,
        full_raw_flags.flag_wound,
        full_raw_flags.flag_comprehensive_injury

FROM full_raw_flags
JOIN files on files.id = full_raw_flags.file_id
JOIN facilities on files.facility_id = facilities.id

WHERE files.facility_id IS NOT NULL 
AND files.month IS NOT NULL
AND files.skipped = 0
AND files.ignore = 0;
""")

# This used to be a part of dataset but was extracted to its own library
# https://github.com/pudo/datafreeze
freeze(result, format='csv', filename='full_raw_flags.csv')

### Save DB

In [84]:
# This is optional and will generate a copy of the database that will be gigabytes in size.
shutil.copy2(new_db_name,'sams_data_phase24_output_2018-04-08.sqlite')

'sams_data_phase24_output_2018-04-08.sqlite'