In [1]:
# Manipulate the file system
import os
import shutil

# Copy dictionaries
import copy

# Regular expressions
import re

# Display errors in realtime
import ipywidgets as widgets

# work with dates
import datetime
import arrow
import time

# For scrubbing PII
import hashlib

# Convert stored string representation of a list to a list
import ast

# Recurse through a directory tree and return file names with glob
import glob

# Decode and re-encode mangled Arabic file names
import codecs

# Connect to a SQLite database in a lazy manner.
import sqlalchemy
import dataset

# Enables opening and reading of Excel files
import openpyxl

# Read and write CSV files
import csv

# Translating variables, sheet names, and workbook names from Arabic
# This is NOT free to use.
from google.cloud import translate

# Set the environment variable for the Google Service Account.
# setdefault keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already
# present in the environment, so the notebook also runs on machines where the
# key file lives somewhere other than this hard-coded local path.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    'C:\\Users\\clay\\Documents\\fxb-lcs-2b24f4f8a73a.json'
)

In [2]:
# If there's an existing working db for this phase, delete it
# so that we can copy from the template for a fresh start.

try:
    os.remove("sams_data_phase17.sqlite")
    print("Removed template clone sams_data_phase17.sqlite")
except FileNotFoundError:
    # Nothing to remove on a first run.
    pass
except OSError as err:
    # e.g. the file is still open in another process; report instead of hiding it.
    print("WARNING: could not remove sams_data_phase17.sqlite:", err)

try:
    # Work on a clone so the template itself is never modified by this notebook.
    shutil.copy2("sams_data_phase17_template.sqlite","sams_data_phase17.sqlite")

    print("Created database from template: sams_data_phase17.sqlite")
except OSError as err:
    # Without the clone the following cells would run against a missing or
    # stale database, so make the failure visible.
    print("WARNING: could not create sams_data_phase17.sqlite from template:", err)

Removed template clone sams_data_phase17.sqlite
Created database from template: sams_data_phase17.sqlite


In [3]:
# Connect to the working copy of the database (the SQLite file created from the template).
db = dataset.connect("sqlite:///sams_data_phase17.sqlite")

In [4]:
# Pull every row of the arabic_values table into an in-memory list.
tab_arabic_values = db['arabic_values']
data = list(iter(tab_arabic_values))

In [5]:
# Simple terms: a flag named after the term is set whenever the term
# appears in the translated text.
flag_terms = [
    # kinds and causes of injury
    "injury", "blunt", "trauma", "shrapnel", "traffic",
    "explosive", "blast", "gunshot", "stab", "wound",
    # locations on the body
    "upper extremity", "lower extremity", "neck", "chest", "back", "spinal",
    # clinical categories
    "neurologic", "nerve", "vascular", "orthopedic", "fracture",
    # qualifiers
    "suspected", "follow-up", "complication", "history of",
]

# Require all terms of a tuple - not in use at the moment.
# Example entry: ("burn","fracture")
multiple_flag_terms = []

# Require any one of the terms of a tuple; the flag is named after the first term.
synonym_flag_terms = [
    ("facial", "face"),
    ("pelvic", "pelvis"),
    ("head", "eye", "ear", "face", "brain", "scalp", "mouth", "nose"),
    ("spine", "spinal"),
    ("abdomen", "abdominal"),
]

# Require the first term of a tuple and the absence of all remaining terms;
# the flag is named after the first term.
complex_flag_terms = [
    ("urologic", "neurologic"),
    ("burn", "heartburn"),
    ("extremity", "lower extremity", "upper extremity"),
]

In [6]:
# Testing complex_flag_terms

# test_phrase1 = "i think i have heartburn"
# test_phrase2 = 'i burned my hand'

# if not any(x in test_phrase2 for x in complex_flag_terms[1][1:]):
#     print("add flag")
# else:
#     print("do not add flag")

In [7]:
# Testing multiple_flag_terms

# test_phrase3 = "fracture and burn and diarrhea"
# test_phrase4 = "fracture and diarrhea"

# if all(x in test_phrase4 for x in multiple_flag_terms[0]):
#     print("add flag")
# else:
#     print("do not add flag")

In [8]:
# Scan the human translation of every arabic_values row for the flag terms and
# collect one update record ({'id': ..., 'flag_<name>': 1, ...}) per matching row.
#
# NOTE(review): every test below is a plain substring test on the lower-cased
# text, so a term also matches inside longer words (e.g. "stab" in "stable",
# "ear" in "heart"). Confirm this is acceptable for the analysis.
update_data = []

for rec in data:
    ht = rec['human_translate']

    # Rows without a human translation are never flagged.
    if not ht:
        continue

    ht = ht.lower()
    update_rec = {'id': rec['id']}

    # Simple terms: the flag name is the term with spaces/hyphens turned into "_".
    for term in flag_terms:
        if term in ht:
            update_rec["flag_" + "_".join(term.replace("-","_").split())] = 1

    # All terms of the tuple must be present.
    for tup in multiple_flag_terms:
        if all(x in ht for x in tup):
            update_rec["flag_" + "_and_".join(tup)] = 1

    # Any term of the tuple may be present; the flag is named after the first.
    for tup in synonym_flag_terms:
        if any(x in ht for x in tup):
            update_rec["flag_" + tup[0]] = 1

    # The first term must be present and none of the remaining terms.
    for tup in complex_flag_terms:
        if tup[0] in ht and not any(x in ht for x in tup[1:]):
            update_rec["flag_" + tup[0].replace(" ","_").replace("-","_")] = 1

    # Special case that is not driven by the term lists.
    if 'war-related injury' in ht and 'not war-related injury' not in ht:
        update_rec['flag_conflict_related'] = 1

    # More than just the 'id' key means at least one flag was set.
    if len(update_rec) > 1:
        update_data.append(update_rec)
    

In [9]:
# The number of records in arabic_values that will be updated
# (one entry per row whose human translation matched at least one flag rule)
len(update_data)

33431

In [11]:
# What the update records look like (first 25 only, to keep the output short)
update_data[:25]

[{'flag_abdomen': 1, 'id': 606},
 {'flag_head': 1, 'id': 613},
 {'flag_conflict_related': 1, 'flag_injury': 1, 'id': 614},
 {'flag_suspected': 1, 'id': 619},
 {'flag_head': 1, 'id': 628},
 {'flag_chest': 1, 'id': 636},
 {'flag_head': 1, 'id': 642},
 {'flag_suspected': 1, 'id': 643},
 {'flag_suspected': 1, 'id': 665},
 {'flag_suspected': 1, 'id': 667},
 {'flag_fracture': 1, 'flag_lower_extremity': 1, 'id': 670},
 {'flag_head': 1, 'id': 672},
 {'flag_wound': 1, 'id': 676},
 {'flag_back': 1, 'id': 685},
 {'flag_suspected': 1, 'id': 687},
 {'flag_suspected': 1, 'id': 690},
 {'flag_head': 1, 'flag_injury': 1, 'flag_shrapnel': 1, 'id': 699},
 {'flag_complication': 1, 'flag_lower_extremity': 1, 'id': 700},
 {'flag_injury': 1,
  'flag_neurologic': 1,
  'flag_upper_extremity': 1,
  'id': 701},
 {'flag_head': 1, 'id': 702},
 {'flag_head': 1, 'flag_injury': 1, 'flag_shrapnel': 1, 'id': 703},
 {'flag_head': 1,
  'flag_injury': 1,
  'flag_neurologic': 1,
  'flag_shrapnel': 1,
  'flag_upper_extremit

In [12]:
# Write the collected flags back to the arabic_values table, matching rows on id
for flagged_row in update_data:
    tab_arabic_values.update(flagged_row, ['id'])

----

### Apply flags to raw data

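1. Pseudo-update the first record of each raw table (setting every flag to NULL) to trigger the addition of the flag columns.
2. Compare each value in each column to an in-memory `arabic_values` lookup.
3. Collect the relevant flags for the record in question.
4. Buffer the flag records and insert them into the `full_raw_flags` and `full_raw_flags_reduced` tables whenever the buffer is full.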

In [13]:
# Derive the flag column names from the term lists, using the same naming
# rules that were applied when the flags were set on arabic_values.
flag_names = sorted(
    # simple terms: spaces and hyphens become underscores
    ["flag_" + "_".join(term.replace("-", "_").split()) for term in flag_terms]
    # all-required tuples: terms joined with "_and_"
    + ["flag_" + "_and_".join(group) for group in multiple_flag_terms]
    # synonym tuples: named after the first term
    + ["flag_" + group[0] for group in synonym_flag_terms]
    # complex tuples: named after the first term
    + ["flag_" + group[0].replace(" ", "_").replace("-", "_") for group in complex_flag_terms]
    # the one flag that is not driven by the term lists
    + ["flag_conflict_related"]
)
flag_names

['flag_abdomen',
 'flag_back',
 'flag_blast',
 'flag_blunt',
 'flag_burn',
 'flag_chest',
 'flag_complication',
 'flag_conflict_related',
 'flag_explosive',
 'flag_extremity',
 'flag_facial',
 'flag_follow_up',
 'flag_fracture',
 'flag_gunshot',
 'flag_head',
 'flag_history_of',
 'flag_injury',
 'flag_lower_extremity',
 'flag_neck',
 'flag_nerve',
 'flag_neurologic',
 'flag_orthopedic',
 'flag_pelvic',
 'flag_shrapnel',
 'flag_spinal',
 'flag_spine',
 'flag_stab',
 'flag_suspected',
 'flag_traffic',
 'flag_trauma',
 'flag_upper_extremity',
 'flag_urologic',
 'flag_vascular',
 'flag_wound']

In [14]:
# Table handles for the raw data: the scrubbed Arabic table and its English counterpart
tab_raw_ar = db['full_raw_scrubbed']
tab_raw_en = db['full_raw_english']

In [15]:
# Add flag columns to the raw Arabic data: pseudo-update the first record with
# an empty (None) value for every flag so that the flag columns get added.
first_row_ar = tab_raw_ar.find_one()
null_flags_ar = {'id': first_row_ar['id']}
null_flags_ar.update(dict.fromkeys(flag_names))
tab_raw_ar.update(null_flags_ar, ['id'])

1

In [16]:
# Add flag columns to the raw English data: pseudo-update the first record with
# an empty (None) value for every flag so that the flag columns get added.
first_row_en = tab_raw_en.find_one()
null_flags_en = {'id': first_row_en['id']}
null_flags_en.update(dict.fromkeys(flag_names))
tab_raw_en.update(null_flags_en, ['id'])

1

In [17]:
# Column names used in full_raw_scrubbed and full_raw_english,
# read off the keys of a single record
rec_raw = tab_raw_ar.find_one()
variables = [column for column in rec_raw]

In [18]:
# In-memory arabic_values lookup: Arabic text -> its full arabic_values row.
# If the same Arabic text occurs in several rows, the last row wins.
arabic_values = list(tab_arabic_values.find())
arabic_lookup = {entry['arabic']: entry for entry in arabic_values}

In [19]:
# Columns scanned when building the reduced flag table (full_raw_flags_reduced);
# the full flag table is built from every non-PII column instead.
reduced_variables = ["diagnosis"]

In [20]:
# Build the two per-record flag tables from the raw Arabic data:
#   full_raw_flags          - flags picked up from every non-PII column
#   full_raw_flags_reduced  - flags picked up only from the columns listed
#                             in reduced_variables
buffer_size = 1000

flags_to_insert = []
flags_to_insert_reduced = []

# Start from empty flag tables. Drop by table name rather than through the
# tab_raw_flags handles, which do not exist yet on a fresh kernel.
for table_name in ('full_raw_flags', 'full_raw_flags_reduced'):
    if table_name in db.tables:
        db[table_name].drop()

tab_raw_flags = db['full_raw_flags']
tab_raw_flags_reduced = db['full_raw_flags_reduced']


def _conflict_flag(raw_value):
    """Translate the raw conflict_related cell into 1, 0 or None (unknown)."""
    if raw_value is None:
        return None

    value = raw_value.strip()

    # NOTE(review): the two literals compared here look identical; they are
    # kept exactly as found in case they differ at the code-point level. Confirm.
    if value == 'كبرى' or value == 'كبرى':
        return 1
    if value == 'لا':
        return 0
    return None


def _new_flag_record(rec, conflict_flag):
    """Return an empty flag record for one raw record.

    Includes foreign keys that allow us to query against the flag table instead
    of joining with the raw data table, which is slow. Every flag starts as
    None, except flag_conflict_related, which starts as conflict_flag.
    """
    record = {
        'id': rec['id'],
        'file_id': rec['a_file_id'],
        'files_sheets_id': rec['a_files_sheets_id'],
        'sheet_id': rec['a_sheet_id']
    }
    for flag in flag_names:
        record[flag] = None

    # Set from the raw column first, so that a flag found through the Arabic
    # values lookup can still switch it on afterwards.
    record['flag_conflict_related'] = conflict_flag
    return record


def _apply_lookup_flags(rec, columns, flag_record):
    """Switch on in flag_record every flag set on the arabic_values rows that
    match the values rec holds in the given columns."""
    for col in columns:
        # These are PII cols (or the key), so skip them
        if 'info_' in col or col == 'id':
            continue

        to_lookup = rec[col]
        if to_lookup is None or to_lookup == '.':
            continue

        # Values with no arabic_values entry simply contribute no flags.
        arabic_values_rec = arabic_lookup.get(to_lookup)
        if arabic_values_rec is None:
            continue

        for flag in flag_names:
            # .get(): a flag column missing from arabic_values must not stop
            # the remaining flags from being copied.
            value = arabic_values_rec.get(flag)
            if value == 1:
                flag_record[flag] = value


def _flush(table, buffer):
    """Insert the buffered records into table and empty the buffer."""
    if buffer:
        table.insert_many(buffer)
        buffer.clear()


for rec in tab_raw_ar.find():
    conflict_flag = _conflict_flag(rec['conflict_related'])

    flag_record = _new_flag_record(rec, conflict_flag)
    flag_record_reduced = _new_flag_record(rec, conflict_flag)

    _apply_lookup_flags(rec, variables, flag_record)
    _apply_lookup_flags(rec, reduced_variables, flag_record_reduced)

    # Store each record in its own buffer
    flags_to_insert.append(flag_record)
    flags_to_insert_reduced.append(flag_record_reduced)

    # Insert whenever a buffer is full
    if len(flags_to_insert) >= buffer_size:
        _flush(tab_raw_flags, flags_to_insert)
    if len(flags_to_insert_reduced) >= buffer_size:
        _flush(tab_raw_flags_reduced, flags_to_insert_reduced)

# We've been through all raw records so make sure the buffers are emptied
_flush(tab_raw_flags, flags_to_insert)
_flush(tab_raw_flags_reduced, flags_to_insert_reduced)
            

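Conversion to a format for analysis: flag totals per facility, year, and month. Note that this query uses flag column names (e.g. `flag_diarrhea`, `flag_followup`, `flag_inflammation`, `flag_lower_extrem`, `flag_upper_extrem`, `flag_war_related`) that are not among the flag names generated earlier in this notebook; adjust them to the current names (e.g. `flag_follow_up`, `flag_lower_extremity`, `flag_upper_extremity`, `flag_conflict_related`) before running it against a table built here.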

```
SELECT files.id as files_id,
       files.year,
	   files.month,
       facilities.id AS facility_id,
       facilities.facility_parent_id,
       facilities.facilityname,
	   facilities.country,
	   facilities.governorate,
	   facilities.district,
	   facilities.subdistrict,
	   facilities.facility_type,
	   SUM(full_raw_flags.flag_burn) AS sum_flag_burn,
	   SUM(full_raw_flags.flag_diarrhea) AS sum_flag_diarrhea,
	   SUM(full_raw_flags.flag_followup) AS sum_flag_followup,
	   SUM(full_raw_flags.flag_fracture) AS sum_flag_fracture,
	   SUM(full_raw_flags.flag_inflammation) AS sum_flag_inflammation,
	   SUM(full_raw_flags.flag_injury) AS sum_flag_injury,
	   SUM(full_raw_flags.flag_lower_extrem) AS sum_flag_lower_extrem,
	   SUM(full_raw_flags.flag_upper_extrem) AS sum_flag_upper_extrem,
	   SUM(full_raw_flags.flag_war_related) AS sum_flag_war_related
	   
FROM files
JOIN facilities on files.facility_id = facilities.id
JOIN full_raw_flags on files.id = full_raw_flags.file_id

WHERE files.facility_id IS NOT NULL 
AND files.month IS NOT NULL
AND files.skipped = 0
AND files.ignore = 0

GROUP BY files.year, 
         files.month, 
		 facility_id, 
		 facilities.facility_parent_id, 
		 facilities.facilityname, 
		 facilities.country, 
		 facilities.governorate, 
		 facilities.district, 
		 facilities.subdistrict, 
		 facilities.facility_type

ORDER BY files.year ASC, files.month ASC, facilities.facilityname ASC;
```


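This is just a dump of the table for them: one row per flag record, without aggregation. It uses the same older flag column names as the query above.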

```
SELECT files.id as files_id,
       files.year,
	   files.month,
       facilities.id AS facility_id,
       facilities.facility_parent_id,
       facilities.facilityname,
	   facilities.country,
	   facilities.governorate,
	   facilities.district,
	   facilities.subdistrict,
	   facilities.facility_type,
	   full_raw_flags.flag_burn,
	   full_raw_flags.flag_diarrhea,
	   full_raw_flags.flag_followup,
	   full_raw_flags.flag_fracture,
	   full_raw_flags.flag_inflammation,
	   full_raw_flags.flag_injury,
	   full_raw_flags.flag_lower_extrem,
	   full_raw_flags.flag_upper_extrem,
	   full_raw_flags.flag_war_related
	   
FROM files
JOIN facilities on files.facility_id = facilities.id
JOIN full_raw_flags on files.id = full_raw_flags.file_id

WHERE files.facility_id IS NOT NULL 
AND files.month IS NOT NULL
AND files.skipped = 0
AND files.ignore = 0

ORDER BY files.year ASC, files.month ASC, facilities.facilityname ASC;
```

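More SQL for generating a data set. Note that `flag_brain` and `flag_eye` are not among the flag names generated earlier in this notebook (both terms are synonyms that set `flag_head`), so adjust the column list before running this against a table built here.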

```
SELECT files.id as files_id,
       files.year,
       files.month,
       facilities.id AS facility_id,
       facilities.facility_parent_id,
       facilities.facilityname,
       facilities.country,
       facilities.governorate,
       facilities.district,
       facilities.subdistrict,
       facilities.facility_type,
	   full_raw_flags.flag_abdomen,
	   full_raw_flags.flag_back,
	   full_raw_flags.flag_brain,
	   full_raw_flags.flag_burn,
	   full_raw_flags.flag_chest,
	   full_raw_flags.flag_complication,
	   full_raw_flags.flag_explosive,
	   full_raw_flags.flag_eye,
	   full_raw_flags.flag_facial,
	   full_raw_flags.flag_follow_up,
	   full_raw_flags.flag_fracture,
	   full_raw_flags.flag_gunshot,
	   full_raw_flags.flag_head,
	   full_raw_flags.flag_history_of,
	   full_raw_flags.flag_injury,
	   full_raw_flags.flag_lower_extremity,
	   full_raw_flags.flag_neck,
	   full_raw_flags.flag_nerve,
	   full_raw_flags.flag_neurologic,
	   full_raw_flags.flag_orthopedic,
	   full_raw_flags.flag_pelvic,
	   full_raw_flags.flag_shrapnel,
	   full_raw_flags.flag_spinal,
	   full_raw_flags.flag_suspected,
	   full_raw_flags.flag_traffic,
	   full_raw_flags.flag_trauma,
	   full_raw_flags.flag_upper_extremity,
	   full_raw_flags.flag_urologic,
	   full_raw_flags.flag_vascular
	   
FROM full_raw_flags
JOIN files on files.id = full_raw_flags.file_id
JOIN facilities on files.facility_id = facilities.id

WHERE files.facility_id IS NOT NULL 
AND files.month IS NOT NULL
AND files.skipped = 0
AND files.ignore = 0
```

In [21]:
# Hand the finished database over as the starting template for the next notebook.
# No manual editing of the data took place in this notebook.

# Do not rerun this cell!
shutil.copy2(src='sams_data_phase17.sqlite', dst='sams_data_phase18_template.sqlite')

'sams_data_phase18_template.sqlite'