In [1]:
# Manipulate the file system
import os
import shutil
import datetime
import arrow

# Copy dictionaries
import copy

# Convert stored string representation of a list to a list
import ast

# Recurse through a directory tree and return file names with glob
import glob

# Decode and re-encode mangled Arabic file names
import codecs

# Connect to a SQLite database in a lazy manner.
import dataset
import sqlalchemy

# Enables opening and reading of Excel files
import openpyxl

# Translating variables, sheet names, and workbook names from Arabic
# This is NOT free to use.
from google.cloud import translate

# Set the environment variable for the Google Service Account
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'C:\\Users\\clay\\Documents\\fxb-lcs-2b24f4f8a73a.json'

In [2]:
#If there's an existing db for this sheet, delete it
#so that we can copy from the template for a fresh start

try:
    os.remove("sams_data_phase22.sqlite")
    print("Removed template clone sams_data_phase22.sqlite")
except:
    pass

try:
    # Try to preserve a copy in case there is a problem and it has to be restored
    shutil.copy2("sams_data_phase22_template.sqlite","sams_data_phase22.sqlite")
    
    print("Created database from template: sams_data_phase22.sqlite")
except:
    pass

Removed template clone sams_data_phase22.sqlite
Created database from template: sams_data_phase22.sqlite


In [3]:
db = dataset.connect("sqlite:///sams_data_phase22.sqlite")
tab_files = db['files']

In [4]:
# Get a list of the files to process
file_list = glob.glob("SAMSData_Missing/*.xls*",recursive=True)
len(file_list)

50

### Before we can process the files, we need to have the latest facilities in the database an in a list.

After review with Ranya, most facilities should be accounted for in the current `facilities` table. Edits were made to an export of it and can be found at the following link, but will not be imported for now.

https://docs.google.com/spreadsheets/d/1UoRB3AGCP1Domphie1rWbU2Uq5vSEb9A1JeGFbdGhh8/edit?usp=sharing

In [5]:
# Get a reference to the facilities
tab_facilities = db['facilities']

facility_lookup = {}

for rec in tab_facilities.find():
    facility_lookup[rec['facility_code']] = rec['id']

In [6]:
for f in file_list:
    try:
        wb = openpyxl.load_workbook(f,read_only=True)
        problem = False
        sheets = wb.sheetnames
        num_sheets = len(sheets)
    except:
        print("Unable to load",f)
        problem = True
        sheets = []
        num_sheets = 0
        
    path = f
    filename = f.split("\\")[-1]
    
    # Stub this in later
    country = None
    
    year = filename.split("-")[1]
    month = filename.split("-")[2].split(".")[0]
    facility_code = filename.split("-")[0]
    facility_id = None
    
    try:
        facility_id = facility_lookup[facility_code]
    except:
        pass    
    
    skipped = False
    ignore = False
    
    ungarbled = None
    translation = None
    
    file_rec = {
        "file_name":filename,
        "ungarbled":ungarbled,
        "translation":translation,
        "path":path,
        "country":country,
        "year":year,
        "month":month,
        "num_sheets":num_sheets,
        "sheet_names":str(sheets),
        "info":"",
        "problem_opening":problem,
        "skipped":skipped,
        "ignore":ignore,
        "facility_id":facility_id,
        "added":"2018-04-03",
        "processed":0
    }
    
    # Insert the file record into the database
    tab_files.insert(file_rec)

In [7]:
# All good to this point - now need to break out the files and try to import the data.

### Files from May 2015

Process May 2015 first and mark the files as processed when complete. These files all share a similar schema and should be importable with old code.

In [8]:
new_files = [rec for rec in db.query("SELECT * FROM files WHERE added = '2018-04-03' AND sheet_names NOT LIKE '%M1%' AND sheet_names NOT LIKE '%M3%';")]

tab_sheets = db['sheets']
tab_files_sheets_join = db['files_sheets']

sheets_lookup = {rec['name']:rec['id'] for rec in tab_sheets.find()}

sheet_names_to_skip = [
    "TOTAL",
    "Name",
    "Code",
    "Sheet",
    "Monthly",
    "Injured Info",
    "statstics_weekly", # intentional misspelling
    "Import SheetHIS",
    "statstics_DailyHIS"
]

In [9]:
sheet_set = set()

for rec in new_files:
    sheets = ast.literal_eval(rec['sheet_names'])
    for s in sheets:
        sheet_set.add(s)

In [10]:
# Google Translate client
translate_client = translate.Client()
target_lang = 'en'

In [11]:
for sheet_name in sheet_set:
    if sheet_name not in sheets_lookup.keys():
        new_rec = {"name":sheet_name,"added":"2018-04-03"}
        translation = translate_client.translate(sheet_name, target_language=target_lang)
        translation = translation['translatedText']
        new_rec["translation"] = translation
        new_rec["normalized"] = ""
        skip = False

        if any(skipname in sheet_name for skipname in sheet_names_to_skip):
            skip = True
        new_rec["skip"] = skip
        tab_sheets.insert(new_rec)
        print(new_rec)

{'name': 'statstics_weekly', 'added': '2018-04-03', 'translation': 'statstics_weekly', 'normalized': '', 'skip': True}
{'name': 'lists', 'added': '2018-04-03', 'translation': 'lists', 'normalized': '', 'skip': False}
{'name': 'register', 'added': '2018-04-03', 'translation': 'register', 'normalized': '', 'skip': False}
{'name': 'Import SheetHIS', 'added': '2018-04-03', 'translation': 'Import SheetHIS', 'normalized': '', 'skip': True}
{'name': 'لشمانيا', 'added': '2018-04-03', 'translation': 'Leishmania', 'normalized': '', 'skip': False}
{'name': 'statstics_DailyHIS', 'added': '2018-04-03', 'translation': 'statstics_DailyHIS', 'normalized': '', 'skip': True}
{'name': 'السنية', 'added': '2018-04-03', 'translation': 'Sunni conflict', 'normalized': '', 'skip': False}


We created `sheet` records for sheets that did not previously appear in the table.

Next, create the join records so that we can know what we've processed. These are sheet instances.

In [12]:
sheets_lookup = {rec['name']:rec['id'] for rec in tab_sheets.find()}

join_records = []

for rec in new_files:
    for sheet in ast.literal_eval(rec['sheet_names']):
        sheet_id = sheets_lookup[sheet]
        join_rec = {
            "file_id":rec['id'],
            "sheet_id":sheet_id,
            "header_start":None,
            "header_end":None,
            "header_values":None,
            "added":"2018-04-03"
        }
        join_records.append(join_rec)

# Bulk inserts are faster than individual inserts
tab_files_sheets_join.insert_many(join_records)

Next, locate the headers in the sheets in question. Note that it skips sheets we can't work with.

In [13]:
recs_to_process = db.query("""
SELECT files_sheets.id AS files_sheets_id, files_sheets.file_id, files_sheets.sheet_id, files.path AS file_path,sheets.name AS sheet_name
FROM files_sheets
JOIN files ON files_sheets.file_id = files.id
JOIN sheets ON files_sheets.sheet_id = sheets.id
WHERE sheets.skip = 0
  AND files_sheets.added = '2018-04-03'
ORDER BY file_id, sheet_id;
""")

In [14]:
def headers_from_worksheet(workbook,worksheet_name):
    worksheet = workbook.get_sheet_by_name(worksheet_name)
    winning_row_values = 0
    winning_row_number = None
    
    for row in range(1,21):
        start_range = 'A' + str(row)
        end_range = 'Z' + str(row)
        cells = worksheet[start_range:end_range]
        try:
            values = [c.value for c in cells[0]]
        except:
            continue
            
        filled_cells = 0
        for v in values:
            if v is not None:
                filled_cells += 1
        
        if filled_cells > winning_row_values:
            winning_row_values = filled_cells
            winning_row_number = row
            
    # Some sheets may be blank
    if winning_row_number is None:
        return None,None
            
    winning_start = 'A' + str(winning_row_number)
    winning_end = 'Z' + str(winning_row_number)

    header_cells = worksheet[winning_start:winning_end]
    header_data = [c.value for c in header_cells[0]]
    
    # If we detect a datetime.datetime.object, then we probably
    # want the previous row. Might be a better way to check this
    # TODO: This is a bad idea?
    
    # What's the actual start column of the header?
    start_idx = 0
    determined_start = False
    
    for val in header_data:
        if not determined_start and val is None:
            start_idx += 1
        elif val is not None:
            determined_start = True
            
        if isinstance(val,datetime.datetime):
            winning_row_number -= 1
            
    header_start_letter = letter_lookup[start_idx]
            
    winning_start = header_start_letter + str(winning_row_number)
    winning_end = 'Z' + str(winning_row_number)
    try:
        header_cells = worksheet[winning_start:winning_end]
        header_data = [c.value for c in header_cells[0]]
    except:
        return None,None
    
    end_idx = len(header_data) - 1
    problem = ''
#     print(end_idx)
#     print(header_data)
#     print(header_data[end_idx],"\n")
    
    while header_data[end_idx] is None:
        end_idx -= 1
        if end_idx <= start_idx:
            problem = ' (PROBLEM)'
            break
            
    # Lookup assumes that the header starts with col A, so offset the lookup on the
    # end letter by the start letter index and it will assign the proper letter to the
    # end letter.
    end_letter = letter_lookup[end_idx+start_idx]  
    header_end = end_letter + winning_start[1:] + problem
    header_range = (winning_start,header_end)
    
    # Prune the header_data to get rid of trailing None values
    prune_by = 0
    
    while header_data[prune_by-1] is None:
        prune_by -= 1
        
    try:
        header_data = header_data[:prune_by]
    except:
        pass
    
    return header_range, header_data

In [15]:
letter_lookup = ['A','B','C','D','E','F','G','H','I','J','K',
                 'L','M','N','O','P','Q','R','S','T','U','V',
                 'W','X','Y','Z']

In [16]:
# Rename this ref b/c the old code did
tab_files_sheets = db['files_sheets']

In [17]:
variables = set()

working_file_id = -1
active_file_path = None
active_workbook = None

for rec in recs_to_process:
    
    # This only fires with a new file_id
    if rec['file_id'] > working_file_id:
        working_file_id = rec['file_id']
        active_file_path = rec['file_path']
        try:
            active_workbook = openpyxl.load_workbook(active_file_path,read_only=True,guess_types=False,data_only=True)
        except:
            print("Unable to open",active_file_path)
            active_workbook = None
            active_file_path = None
            working_file_id = -1
            
    # Process the active file
    sheet_name = rec['sheet_name']
    header_range, header_data = headers_from_worksheet(active_workbook,sheet_name)
    
    # Unable to find a header in this sheet. Mark the record
    if header_range is None:
        update_rec = {"id":rec['files_sheets_id'],"header_start":"PROBLEM"}
        tab_files_sheets.update(update_rec,['id'])
        print(rec['files_sheets_id'],"Problem workbook",active_file_path,"--> sheet -->",sheet_name)
        continue
    else:
        header_start = header_range[0]
        header_end = header_range[1]
        
        fixed_header_data = []
        for value in header_data:
            if isinstance(value,datetime.datetime):
                fixed_value = arrow.get(value).format("YYYY-MM-DD")
                fixed_header_data.append(fixed_value)
                variables.add(fixed_value)
            else:
                fixed_header_data.append(value)
                variables.add(value)
        
        update_rec = {
            "id":rec['files_sheets_id'],
            "header_start":header_start,
            "header_end":header_end,
            "header_values":str(fixed_header_data)
        }
        
        tab_files_sheets.update(update_rec,['id'])
        

7599 Problem workbook SAMSData_Missing\SAMS101-2015-09-1.xlsx --> sheet --> week4


In [18]:
# Fix # SAMSData_Missing\SAMS183-2015-02.xlsx register --> A3:AA755
# file_id 1180 files_sheets_id 7703 sheet_id 205

# File edited to address the issues

Now we have to look at the variables that we already have in the variables table and see if any of the ones that we have from these new files are missing.

In [19]:
tab_vars = db['variables']
var_lookup = {rec['orig']:rec['id'] for rec in tab_vars.find()}

In [20]:
# Note that we create the set() variables two cells above, when looking for headers

for v in variables:
    if v not in var_lookup.keys():
        translation = translate_client.translate(v, target_language=target_lang)
        translation = translation['translatedText']
        new_var_rec = {'orig':v, 'translation':translation, 'normalized': 'new_or_followup', 'added':'2018-04-03' }
        tab_vars.insert(new_var_rec)
        print(v,translation)

weekbottom weekbottom
رمز المريض Patient Code
agemonthcat agemonthcat
نازح/ لاجىء/ غيره Displaced / Refugee / Other
جديد/مراجع New / References
Followup Followup
الاحالة Assignment
Sex Sex
Disposition Disposition
التلقيح ضد الحصبة خلال الزيارة Measles vaccination during the visit
التلقيح باللقاح الثلاثي خلال الزيارة Vaccine vaccination during the visit
عمليات Operations
death death
مكان الاقامة Place of residence
ملاريا ( شك ) Malaria (doubt)
classification classification
agebottom agebottom
Agecat Agecat
الوزن the weight
التصنيف الطبي  Medical Classification
عناية Attention
Profile Profile
agemonthb agemonthb
درجة الحرارة temperature
exit exit
التلقيح ضد شلل الاطفال خلال الزيارة Polio vaccination during the visit
الاجراء الجراحي Surgical procedure
yesno yesno
العمر بالاشهر(حديث الولادة) New Age (months of birth)
حالة جديدة/مراجعة New status / revision
العمر بالسنوات New years
intervention intervention


In [21]:
fix_vars = {
    "weekbottom": "info_age",
    "Profile": "profile",
    "agemonthcat": "info_age",
    "Disposition": "disposition",
    "intervention": "treatment",
    "agemonthb": "info_age",
    "death": "death",
    "exit": "discharge",
    "yesno": "yes_no",
    "New / References": "new_or_followup",
    "Attention": "treatment",
    "agebottom": "info_age",
    "Sex": "info_sex",
    "Malaria (doubt)": "malaria",
    "classification": "status",
    "Agecat": "info_age",
    "Operations": "surgery_type",
    "Followup": "new_or_followup"
}

for k,v in fix_vars.items():
    db.query("UPDATE variables SET normalized = '" + v + "' WHERE translation = '" + k + "';")

Manual DB editing done to help consolidate variables.

Note from last running:

Adding جديد/مراجع as id 63 because it's similar.

In [22]:
var_lookup = {rec['orig']:rec['id'] for rec in tab_vars.find()}

In [23]:
tab_files_vars = db['files_variables']
tab_sheets_vars = db['sheets_variables']
tab_files_sheets_vars = db['files_sheets_vars']

In [24]:
files_vars_set = set()
sheets_vars_set = set()
files_sheets_vars_set = set()

recs_to_process = [rec for rec in db.query("SELECT * from files_sheets WHERE added = '2018-04-03';")]

for rec in recs_to_process:
    header_values = rec['header_values']
    if header_values is None:
        continue
    
    header_values = ast.literal_eval(rec['header_values'])
    
    for header in header_values:
        if header is None:
            continue
        try:
            var_id = var_lookup[str(header)]    
            file_id = rec['file_id']
            sheet_id = rec['sheet_id']
            files_sheets_id = rec['id']
        except:
            print("problem with",header)
            continue
        
        files_vars_set.add((file_id,var_id))
        files_sheets_vars_set.add((files_sheets_id,var_id))
        sheets_vars_set.add((sheet_id,var_id))

In [25]:
tab_files_vars_recs = []

for rec_tuple in files_vars_set:
    rec = {"file_id":rec_tuple[0],"var_id":rec_tuple[1]}
    tab_files_vars_recs.append(rec)
    
tab_files_vars.insert_many(tab_files_vars_recs)

In [26]:
tab_sheets_vars_recs = []

for rec_tuple in sheets_vars_set:
    rec = {"sheet_id":rec_tuple[0],"var_id":rec_tuple[1]}
    tab_sheets_vars_recs.append(rec)
    
tab_sheets_vars.insert_many(tab_sheets_vars_recs)

In [27]:
tab_files_sheets_vars_recs = []

for rec_tuple in files_sheets_vars_set:
    rec = {"files_sheets_id":rec_tuple[0],"var_id":rec_tuple[1]}
    tab_files_sheets_vars_recs.append(rec)
    
tab_files_sheets_vars.insert_many(tab_files_sheets_vars_recs)

## On to importing the raw data

This is coming from workbook 4.

In [28]:
sheets_to_process = db.query("""
SELECT files_sheets.id AS files_sheets_id, 
       files_sheets.file_id, 
       files_sheets.sheet_id, 
       files_sheets.header_start, 
       files_sheets.header_end,
       files_sheets.header_values,
       files.path AS file_path,
       sheets.name AS sheet_name
FROM files_sheets
JOIN files ON files_sheets.file_id = files.id
JOIN sheets ON files_sheets.sheet_id = sheets.id
WHERE sheets.skip = 0
  AND files_sheets.added = '2018-04-03'

AND files.ignore = 0
AND files_sheets.header_start IS NOT NULL
AND files_sheets.header_start <> 'PROBLEM'
AND files_sheets.header_end NOT LIKE '%PROBLEM%'
ORDER BY file_id, files_sheets_id;
""")

In [29]:
tab_all = db['full_raw_scrubbed']
tab_vars = db['variables']
tab_files_sheets = db['files_sheets']

In [30]:
# We need to create some columns...
raw_columns = [k for k in tab_all.find_one().keys()]
flag_cols = [c for c in raw_columns if 'flag_' in c]
col_names_we_need = sorted(list(set([r['normalized'] for r in tab_vars.find()])))
missing_col_names = set(col_names_we_need) - set(raw_columns)

In [31]:
# Create the new columns
tab_all.create_column("added", sqlalchemy.String)

for c in missing_col_names:
    tab_all.create_column(c, sqlalchemy.String)

In [32]:
# Rename var_lookup to work with code below

# Create an in-memory lookup table for variables
var_lookup = {}
for r in tab_vars.find():
    var_lookup[r['orig']] = r['normalized']

In [33]:
# Only reopen files when necessary
working_file_id = -1
active_file_path = None
active_workbook = None


for rec in sheets_to_process:
    import_status = ""
    
    if rec['file_id'] > working_file_id:
        working_file_id = rec['file_id']
        active_file_path = rec['file_path']
        try:
            active_workbook = openpyxl.load_workbook(active_file_path,read_only=True,guess_types=False,data_only=True)
        except:
            import_status = "Unable to open file"
            import_status = "imported"
            tab_files_sheets.update({"id":rec['files_sheets_id'],"import_status":import_status},["id"])
            print("Unable to open",active_file_path)
            
            active_workbook = None
            active_file_path = None
            working_file_id = -1
            continue
            
    # Process the active file
    sheet_name = rec['sheet_name']
    header_start = rec['header_start']
    header_end = rec['header_end']
    
    # Unable to find a header in this sheet. Mark the record
    if header_start is None or header_end is None or "PROBLEM" in header_start or "PROBLEM" in header_end:
        import_status = "skipped"
        continue
    else:
        worksheet = active_workbook.get_sheet_by_name(sheet_name)
        last_row = worksheet.max_row
        
        # Sometimes worksheet.max_row doesn't return a value
        if last_row == None:
            import_status = "imported: last row None"
            last_row = 10000
        
        data_start = header_start[0] + str(int(header_start[1:])+1)
        data_end = header_end[0] + str(last_row)
        data_range_string = data_start + ":" + data_end
        
        # These are stored as a list converted to a string. Convert back to a list for enumeration
        header_values = ast.literal_eval(rec['header_values'])
        sheet_data = []
        
        try:
            for datarow in worksheet[data_range_string]:
                record = {}
                for idx,cell in enumerate(datarow):
                    header_val = header_values[idx]

                    # Get the normalized value
                    header_val = var_lookup[header_val]

                    cell_value = cell.value                

                    try:
                        cell_value = cell_value.strip()
                    except:
                        pass

                    # Cannot write datetime objects to the database
                    # Unless they are first converted to strings

                    if isinstance(cell_value,datetime.datetime):
                        cell_value = openpyxl.utils.datetime.datetime_to_W3CDTF(cell_value)
                    elif isinstance(cell_value,datetime.time):
                        cell_value = str(cell_value)
                    elif cell_value is not None:
                        cell_value = str(cell_value)

                    # There's already a value in the field and it should be a string.
                    # If it is our representation of none, replace it
                    if header_val in record.keys():
                        if record[header_val] == '.' or record[header_val] is None:
                            record[header_val] = cell_value
                        elif cell_value is not None:
                            record[header_val] = record[header_val] + ", " + cell_value
                    else:

                        # Blank strings instead of NULL will help us know which fields were available for the record
                        if cell_value is None:
                            record[header_val] = '.'
                        else:
                            record[header_val] = cell_value

                sheet_data.append(record)

            # Remove from sheet_data blank rows
            rich_sheet_data = []
            for staged in sheet_data:
                working_copy = copy.deepcopy(staged)
                try:
                    # Put "passover columns" here. They will be removed from the record
                    # before it is evaluated as "empty." Number is a good example because there are sheets
                    # where somebody dragged numbers down a column in preparation for a lot of data but 
                    # never actually used all of the numbered rows
                    del working_copy['number']
                except:
                    pass

                if all((x == None or x == '.' or x == '') for x in list(working_copy.values())):
                    continue
                else:
                    staged['a_file_id'] = rec['file_id']
                    staged['a_files_sheets_id'] = rec['files_sheets_id']
                    staged['a_sheet_id'] = rec['sheet_id']
                    staged['added'] = '2018-04-03'

                    rich_sheet_data.append(staged)

            # Try to perform a bulk insert
            tab_all.insert_many(rich_sheet_data)

            # Update the status of the worksheet
            import_status = "imported"
            tab_files_sheets.update({"id":rec['files_sheets_id'],"import_status":import_status},["id"])
            
        except Exception as ex:
            
            print("\n--------------------------------------------------------------------------")
            print(ex)
            print("Failure")
            print("file_id",rec['file_id'],"files_sheets_id",rec['files_sheets_id'],"sheet_id",rec['sheet_id'])
            print(active_file_path,sheet_name)
            print("Header range:",header_start,header_end)
            print("Data range:",data_start,data_end)
            print("data_range_string",data_range_string)
            
            tab_files_sheets.update({"id":rec['files_sheets_id'],"import_status":import_status},["id"])

### Next

- scrub PII in situ
- export to arabic_values
- translate
- flag
- generate flagged dataset

### Scrub PII

In [34]:
import hashlib

In [35]:
to_update = {}

# Do not save this value in a source code repository!
salt = 'REDACTED'.encode()

In [36]:
fields = [
    "info_name",
    "info_name_author",
    "info_name_caregiver",
    "info_name_facility",
    "info_name_group",
    "info_name_of_coach",
    "info_name_processor",
    "info_name_surgeon",
    "info_phone_skype"
]

In [37]:
for rec in tab_all.find(added='2018-04-03'):
    for pii_field in fields:
        if rec[pii_field] is None or rec[pii_field] == '.':
            continue
        else:
            # Hash the value in the field
            h = hashlib.sha256()
            h.update(rec[pii_field].encode())
            h.update(salt)
            
            if rec['id'] not in to_update.keys():
                to_update[rec['id']] = {'id':rec['id']}

            to_update[rec['id']][pii_field] = h.hexdigest()

In [38]:
for k in to_update.keys():
    tab_all.update(to_update[k],['id'])

### Export to arabic_values and translate

Be careful here because arabic_values no longer auto-increments the id, so it has to be set manually.

In [39]:
tab_raw = db['full_raw_scrubbed']
tab_arabic = db['arabic_values']
tab_vars = db['variables']

column_names = db.query("SELECT DISTINCT(normalized) FROM variables;")
column_names = sorted([r['normalized'] for r in column_names])

# We don't want to work with the values in the fields that have been hashed,
# so remove them from the list of variables to query.
fields = [
    "info_name",
    "info_name_author",
    "info_name_caregiver",
    "info_name_facility",
    "info_name_group",
    "info_name_of_coach",
    "info_name_processor",
    "info_name_surgeon",
    "info_phone_skype",
    "date",
    "date_first_exam",
    "death_date"
]
column_names = [e for e in column_names if e not in fields]

arabic_lookup = set([r['arabic'] for r in tab_arabic.find()])

In [40]:
max_id_results = db.query("SELECT max(id) FROM arabic_values;")
for r in max_id_results:
    max_id = int(r['max(id)'])

current_id = max_id + 1

In [41]:
buffer = []
for col in column_names:
    col_values = db.query("""
        SELECT DISTINCT([""" + col + """]) 
        FROM full_raw_scrubbed 
        WHERE added = '2018-04-03'
        AND [""" + col + """] IS NOT NULL
        AND [""" + col + """] <> '.'
        AND [""" + col + """] <> '';
        """)
    col_values = [r[col] for r in col_values]

    # Create a table of unique Arabic values
    for v in col_values:
        if v in arabic_lookup:
            continue
        # Skip numbers
        if v.replace(",",".").replace('.','',1).isdigit():
            continue
        else:
            r = {"id":current_id,"arabic":v,"added":'2018-04-03'}
            current_id += 1
            buffer.append(r)
            arabic_lookup.add(v)
            
# 367621

tab_arabic.insert_many(buffer)

In [42]:
# Display errors in realtime
import ipywidgets as widgets
import time

In [43]:
record_counter = widgets.HTML(value="Records: 0",continuous_update=True)
character_counter = widgets.HTML(value="Characters: 0",continuous_update=True)
error_counter = widgets.HTML(value="Errors: 0",continuous_update=True)
latest_translation = widgets.HTML(value="Latest: -", continuous_update=True)

In [44]:
record_counter

In [45]:
character_counter

In [46]:
error_counter

In [47]:
latest_translation

In [48]:
character_count = 0
record_count = 0
error_count = 0

translate_client = translate.Client()
target_lang = 'en'

recordset = [r for r in tab_arabic.find(added='2018-04-03')]
print("recordset length",len(recordset))

update_records = []

for row in recordset:
    arabic_string = row['arabic']
    arabic_string.replace("_"," ").replace("\n"," ").replace("  "," ").replace("\t"," ").strip()
    try:
        english = translate_client.translate(arabic_string, target_language=target_lang)
        english = english['translatedText']
        new_rec = {
            "id":row['id'],
            "google_translate_feb":english
        }

        update_records.append(new_rec)

        character_count += len(row['arabic'])
        character_counter.value = "Characters: " + str(character_count)
        record_count += 1
        record_counter.value = "Records: " + str(record_count)
        latest_translation.value = "Latest: " + english
        
        del new_rec

    except:
        error_count += 1
        error_counter.value = "Errors: " + str(error_count)
        time.sleep(10)
        
print("--------------\nDone")
print("Record Count",record_count)
print("Character Count",character_count)

recordset length 3562
--------------
Done
Record Count 3562
Character Count 86171


In [50]:
for rec in update_records:
    tab_arabic.update(rec,['id'])

### Tokenize & translate missing tokens

In [52]:
import re

In [53]:
tab_tokens = db['arabic_tokens']
tab_arabic_values_tokens = db['arabic_values_tokens']

good_char = 'ا'
bad_char1 = 'أ'
bad_char2 = 'إ'
bad_char3 = 'آ'

delimiters = "(", ")", ",", "/", ".", "1","2","3","4","5","6","7","8","9","0","+","_","-","\\","=","ـ"
regexpattern = "|".join(map(re.escape,delimiters))

In [54]:
token_lookup = {}

recordset = [r for r in tab_arabic.find(added='2018-04-03')]

for rec in recordset:
    tokens = rec['arabic']
    tokens = tokens.replace(bad_char1,good_char)
    tokens = tokens.replace(bad_char2,good_char)
    tokens = tokens.replace(bad_char3,good_char)
    
    tokens = [t.strip() for t in re.split(regexpattern,tokens) if t.strip() != '']
    
    for token in tokens:
        if token not in token_lookup.keys():
            token_lookup[token] = {"token_id":None, "arabic_values_id":[]}
        token_lookup[token]["arabic_values_id"].append(rec['id'])
        

In [55]:
len(token_lookup.keys())

2976

In [56]:
existing_token_lookup = {}

for rec in tab_tokens.find():
    existing_token_lookup[rec['token']] = rec['id']

In [64]:
existing_toks = list(existing_token_lookup.keys())

for tok in token_lookup.keys():
    if tok in existing_toks:
        token_lookup[tok]['token_id'] = existing_token_lookup[tok]

In [70]:
new_tokens = []

for k,v in token_lookup.items():
    if v['token_id'] is None:
        new_tok = {"token":k, "added":'2018-04-03'}
        new_tokens.append(new_tok)

In [75]:
tab_tokens.insert_many(new_tokens)

In [76]:
existing_token_lookup = {}

for rec in tab_tokens.find():
    existing_token_lookup[rec['token']] = rec['id']

In [77]:
for tok in token_lookup.keys():
    token_lookup[tok]['token_id'] = existing_token_lookup[tok]

In [79]:
token_lookup['susp']

{'arabic_values_id': [396790, 396791, 396792, 396793, 396794, 396795],
 'token_id': 153469}

In [80]:
join_records = []
for token in token_lookup.keys():
    for orig_id in token_lookup[token]["arabic_values_id"]:
        join_records.append({"token_id":token_lookup[token]["token_id"],"arabic_values_id":orig_id, "added":'2018-04-03'})

In [81]:
for j in join_records:
    tab_arabic_values_tokens.insert(j)

In [83]:
# Now translate the new tokens

translate_client = translate.Client()
target_lang = 'en'

updates = []

tokens_to_translate = list(tab_tokens.find(added='2018-04-03'))

for rec in tokens_to_translate:
    arabic = rec['token']
    english = translate_client.translate(arabic, target_language=target_lang)
    english = english['translatedText']
    update_rec = {
        "id":rec['id'],
        "google_translate_feb":english,
        "translation":english
    }

    updates.append(update_rec)

In [87]:
for rec in updates:
    tab_tokens.update(rec,['id'])

In [105]:
# Now join the translated tokens back to the arabic_values table for flag generation

join_lookup = {}

for record in tab_arabic_values_tokens.find():
    if record['arabic_values_id'] not in join_lookup.keys():
        join_lookup[record['arabic_values_id']] = set()
    join_lookup[record['arabic_values_id']].add(record['token_id'])

In [106]:
token_lookup = {}

# Not new, actually...
new_tokens = list(tab_tokens.find())

for record in new_tokens:
    translation = record['translation']
    
    # Nothing to join back
    if translation is None:
        continue
    else:
        translation = [x.lower().strip() for x in translation.split(",") if x.strip() != '']
        
    token_lookup[record['id']] = translation

In [111]:
join_lookup[397106]

KeyError: 397106

In [112]:
tab_arabic_values = db['arabic_values']

ar_values_to_update = []

new_ar_val_ids = []

for rec in tab_arabic_values.find(added='2018-04-03'):
    new_ar_val_ids.append(rec['id'])

errors = 0
    
for key in new_ar_val_ids:
    update_rec = {'id':key}
    all_vals = []
    try:
        for token_rec in sorted(list(join_lookup[key])):
            try:
                all_vals += token_lookup[token_rec]
            except:
                pass
        if len(all_vals) == 0:
            continue

        hum_trans = ", ".join(all_vals)
        update_rec['google_tokens_joined'] = hum_trans
        ar_values_to_update.append(update_rec)
    except Exception:
        errors += 1
        pass
    
errors

2

In [113]:
len(ar_values_to_update)

3560

In [115]:
ar_values_to_update[50:60]

[{'google_tokens_joined': 'another reason', 'id': 396835},
 {'google_tokens_joined': 'unknown reason', 'id': 396836},
 {'google_tokens_joined': 'injuries اذيات ورضوض', 'id': 396837},
 {'google_tokens_joined': 'hypertension', 'id': 396838},
 {'google_tokens_joined': 'watery diarrhoea water desalination', 'id': 396839},
 {'google_tokens_joined': 'lrti respiratory infection', 'id': 396840},
 {'google_tokens_joined': 'امراض جلدية, باستثناء الليشمانيا, skin disease, not leishmaniasis',
  'id': 396841},
 {'google_tokens_joined': 'peptic ulcer', 'id': 396842},
 {'google_tokens_joined': 'other diseases', 'id': 396843},
 {'google_tokens_joined': 'urinary tract infection, urinary tract infection, dws',
  'id': 396844}]

In [116]:
for rec in ar_values_to_update:
    tab_arabic_values.update(rec,['id'])

### Flag generation

Only need to generate flags for the new arabic_values records & then generate the new flags for the flag dataset.

In [143]:
# Get a reference to the arabic_values table
tab_arabic_values = db['arabic_values']

In [144]:
try:
    db['full_raw_flags'].drop()
    print("Dropped full_raw_flags")
except:
    pass

try:
    db['full_raw_flags_reduced'].drop()
    print("Dropped full_raw_flags_reduced").drop()
except:
    pass

# One SQLite limitation is you cannot drop columns, so you have to create a new table and then rename it.
preserve_fields = [k for k in tab_arabic_values.find_one().keys() if 'flag_' not in k]

# We don't use result but assigning it skips printing some garbage below
result = db.query("""
CREATE TABLE new_arabic_values AS 
    SELECT """ + ",".join(preserve_fields) + """ 
    FROM arabic_values;
""")

# Drop the original arabic_values table
tab_arabic_values.drop()

# Rename new_arabic_values to arabic_values & now we have a table with no flag columns
result = db.query("""
ALTER TABLE new_arabic_values RENAME TO arabic_values;
""")

Dropped full_raw_flags
Dropped full_raw_flags_reduced


In [145]:
# Now because we futzed with the arabic_values table, we have to create a new reference to the database
# and to our arabic_values table. The db object stores some schema information that isn't updated with
# our direct query calls above.

del db
del tab_arabic_values

db = dataset.connect("sqlite:///sams_data_phase22.sqlite")
tab_arabic_values = db['arabic_values']

In [146]:
# Now create an in-memory representation of the arabic_values table
# and store it in variable `data`
data = [x for x in tab_arabic_values]

In [147]:
# Update this if you want to change what flags you are making on the dataset.
# The logic for creating them is in the following cell.

# Require and flag term
flag_terms = [
    "blunt",
    "explosive",
    "blast",
    "stab",
    "upper extremity",
    "lower extremity",
    "neck",
    "chest",
    "back",
    "spinal",
    "neurologic",
    "nerve",
    "vascular",
    "orthopedic",
    "fracture",
    "suspected",
    "follow-up",
    "complication",
    "history of",
    "traffic accident"
]

# require all terms - not in use at the moment
multiple_flag_terms = [
#     ("burn","fracture")
]

# require any of the terms but name the flag after the first
synonym_flag_terms = [
    ("allergy", "allergic"),
    ("anemia", "thalassemia"),
    ("cancer", "bcc", "leukemia", "lymphoma", "malignancy", "malignant", "scc"),
    ("cardiovascular"," asd "," vsd ","cholesterol","hypercholesterolemia","hyperlipidemia","hypertriglyceridemia","triglycerides","blood pressure"," bp ","high blood pressure","hypertension","acute coronary syndrome","angina","arrhythmia","atrial fibrillation"," avr ","cardiac ischemia","chest pain","clot","clotting","coronary atery","coronary heart disease","coronary ischemia","dvt","endocarditis","heart attack","heart disease","heart failure","heart valve","hf","hypotension","ihd"," mi ","mitral valve prolapse","mvr","myocardial hypoperfusion","myocardial infarction","palpitations","pericarditis","pulmonary embolism","pvd","svt","thromboembolism","thrombophlebitis","thrombosis","vasculitis"),
    ("congenital", "asd", "vsd"),
    ("dehydration", "dehydration", "hypovolemic shock"),
    ("dental complaint", "dental", "gingivitis", " gum ", "odonitis", "teeth", "tooth", "toothache"),
    ("derm", "acne","alopecia","blisters","cellulitis","dermatitis","dermoid","dry skin","eczema","folliculitis","hair loss","inflammatory papules","intertrigo","itch","lice","pruritis","psoriasis","rash","ringworm","scabies","skin disease","skin disorder","skin eruption","skin infection","skin lesion","tinea","warts"),
    ("diabetes","diabetic","DKA","glucose","hyperglycemia","hypoglycemia","sugar"),
    ("endocrine","hyperthyroid","hyperthyroidism","hypocalcemia","hypothyroid","hypothyroidism","parathyroid","thyroid"," TSH "),
    ("infection","conjunctivitis","eye discharge","eye infection","keratoconjunctivitis","ophthalmic infection"),
    ("pain", "corneal inflammation", "eye sensitivity", "keratitis", "pain in the eye"),
    ("fatigue", "exhaustion", "tired", "tiredness"),
    ("fever", "hyperthermia", "temperature"),
    ("constipation", "intestinal stasis"),
    ("shrapnel", "fragments","sliver","splinter"),
    ("musculoskeletal pain","ankylosing spondylitis","arthralgia","Arthritis","back pain","bruise","bruising","chondritis","contusion","costochondritis","disc herniation","disc herniation","discitis","elbow pain","extremity pain","gout","inflammation of the shoulder","joint","knee degeneration","knee inflammation","knee pain","loin pain","low back pain","lumbar pain","musclar pain","Muscle spasm","muscular pain","myalgia","myositis","neck pain","osteoarthritis","osteomyelitis","osteomylitis","plantar fasciitis","polyarthritis","rheumatism","sacroiliitis","spine degeneration","sprain","strain","synovitis","tendinitis","tendonitis","tendonopathy","tmj"),
    ("headache", "head pain"),
    ("stroke","cerebral accident","cerebral hemorrhage","cerebral infarction","cerebral ischemia","cerebrovascular accident"," cva "),
    ("gunshot", " shot "),
    
    # Prior flags, preserved
    ("facial","face"),
    ("pelvic","pelvis"),
    ("head","eye","ear","face","brain","scalp","mouth","nose"),
    ("spine","spinal"),
    ("abdomen","abdominal")
]

# require the first term and the absence of the remaining terms
# name the flag after the first term.
complex_flag_terms = [
    ("urologic","neurologic"),
    ("burn","heartburn"),
    ("trauma", "psychological trauma")
]

# Look for any of the terms in terms_to_find but only apply if terms in terms_to_avoid are absent.
# Check human or google translation (ht, gt)

complex_set_flag_terms = [
    {
        "flag_name": "hyperlipidemia",
        "terms_to_find": ["blood pressure", "bp", "high blood pressure", "hypertension"],
        "terms_to_avoid": ["hypotension"],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "ENT",
        "terms_to_find": ["adenoiditis","ear congestion","ear discharge","ear infection","ear inflammation","eustachian tube infection","mucositis","mumps","nasal congestion","nose congestion","otitis","otorrhea","pharyngitis","throat ache","tonsillitis","tonsils enlargement","cerumen impaction","dysphagia","earache","epistaxis","hearing impairment","hearing loss","nasal obstruction","pain in the ear","pharyngeal pain","pharynx pain","swallowing pain","vestibulitis"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "infection",
        "terms_to_find": ["adenoiditis","ear congestion","ear discharge","ear infection","ear inflammation","eustachian tube infection","laryngitis","mucositis","mumps","nasal congestion","nose congestion","otitis","otorrhea","pharyngitis","rhinitis","sinusitis","throat ache","tonsillitis","tonsils enlargement"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "eye",
        "terms_to_find": ["conjunctivitis","eye discharge","eye infection","keratoconjunctivitis","ophthalmic infection","corneal inflammation","eye sensitivity","keratitis","pain in the eye","blepharitis","cataract","eye redness","eyelid","eye-redness","glaucoma","left eye","my eye","npdr","pterygium","pupil","redness of the eye","retinal","retinopathy","right eye","swelling of the eye","uveitis","vision"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "gi_complaint",
        "terms_to_find": ["abdominal injury","apendicitis","appendicitis","belly pain","bile duct obstruction","bile stones","cholecystitis","colic","colitis","colon spasm","Crohn","duodenal ulcer","enteritis","epigastric pain","flank pain","gallbladder inflammation","gastric pain","gastric ulcer","gastritis","gastroenteritis","gastrointestinal infection","hiatal hernia","ibd","ibs","indigestion","inflammation of the stomach","intestinal pain","intestinal ulcer","pain in the stomach","pancreatitis","peptic ulcer","peritoneal inflammation","peritonitis","sore stomach","stomach hurts","stomach pain","Digestive bleed","Gastric bleeding","Gastric hemorrhage","Gastrointestinal bleeding","hemorrhoids","Ulcer of the colon","Constipation","intestinal stasis","diarrhea","dysentery","food poisoning","giardia","typhoid","cirrhosis","hapatitis","hep a","hep b","hep c","hepatic","jaundice","nausea","vomiting","vomitting","anal fissure","bloating","celiac disease","esophageal reflux","gastroesophageal reflux","gerd","heartburn","inguinal hernia","malabsorption","umbilical fistula","umbilical hernia"],
        "terms_to_avoid": ["renal colic"],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "abdominal_pain",
        "terms_to_find": ["abdominal injury","apendicitis","appendicitis","belly pain","bile duct obstruction","bile stones","cholecystitis","colic","colitis","colon spasm","Crohn","duodenal ulcer","enteritis","epigastric pain","flank pain","gallbladder inflammation","gastric pain","gastric ulcer","gastritis","gastroenteritis","gastrointestinal infection","hiatal hernia","ibd","ibs","indigestion","inflammation of the stomach","intestinal pain","intestinal ulcer","pain in the stomach","pancreatitis","peptic ulcer","peritoneal inflammation","peritonitis","sore stomach","stomach hurts","stomach pain"],
        "terms_to_avoid": ["renal colic"],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "bleed",
        "terms_to_find": ["Digestive bleed","Gastric bleeding","Gastric hemorrhage","Gastrointestinal bleeding","hemorrhoids","Ulcer of the colon"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "diarrhea_dysentery",
        "terms_to_find": ["diarrhea","dysentery","food poisoning","giardia","typhoid"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "liver_dysfunction",
        "terms_to_find": ["cirrhosis","hapatitis","hep a","hep b","hep c","hepatic","jaundice"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "nausea_vomiting",
        "terms_to_find": ["nausea","vomiting","vomitting"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "gu",
        "terms_to_find": ["cystitis","dysuria","epididymitis","genital infection","herpes","orchitis","sexually transmitted infection","urethritis","urinary infection","Urinary tract infection","urogenital infection","UTI","bladder","hematuria","incontinence","pelvic mass","urinary disorder","urinary retention","urinary symptoms","varicocele"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "infection",
        "terms_to_find": ["cystitis","dysuria","epididymitis","genital infection","herpes","orchitis","sexually transmitted infection","urethritis","urinary infection","Urinary tract infection","urogenital infection","UTI"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "gyn_women",
        "terms_to_find": ["breast","endometriosis","fibroids","gynecological","hot flashes","irregular cycle","mastitis","menopause","menstrual","ovarian","ovary","ovulation","reproductive health","uterine","uterus","vagina","vaginal","vaginitis"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "injury",
        "terms_to_find": ["bite","sting","stinging","cut","wound","injury","blast","burn","fracture","gunshot","shot","hemiplegia","paralysis","paraplegia","quadriplegia","fragments","shrapnel","sliver","splinter","traffic accident","abrasion","bruise","bruising","Concussion","contusion","falling","knee rupture","splint","trauma"],
        "terms_to_avoid": ["psychological trauma","heartburn"],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "injury",
        "terms_to_find": ["ulcer"],
        "terms_to_avoid": ["gastric", "stomach", "peptic", "intestinal", "duodenal"],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "injury_neuro",
        "terms_to_find": ["hemiplegia","paralysis","paraplegia","quadriplegia"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "malnutrition",
        "terms_to_find": ["delayed growth","growth delay","growth retardation","short stature","malnutrition"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "growth_delay",
        "terms_to_find": ["delayed growth","growth delay","growth retardation","short stature"],
        "terms_to_avoid": [],
        "check": ["ht","gt"]
    },
    {
        "flag_name": "mental_health",
        "terms_to_find": ["anxiety","bipolar","mental illness","personality disorder","post traumatic syndrome","post-traumatic syndrome","psychiatric","psychological","ptsd","schizophrenia"],
        "terms_to_avoid": [],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "mental_health",
        "terms_to_find": ["depression"],
        "terms_to_avoid": [],
        "check": ["ht"]
    },
    {
        "flag_name": "neuro_complaint",
        "terms_to_find": ["head pain","headache","cerebral accident","cerebral hemorrhage","cerebral infarction","cerebral ischemia","cerebrovascular accident","cva","stroke","benign paroxysmal postitional vertigo","brachial plexus","brain infection","brain tumor","cauda equina","cerebral palsy","cervical root","convulsion","convulsions","dementia","dizziness","encephalitis","epilepsy","epileptic","foot drop","hand drop","meningitis","meningocele","migraine","nerve","neuritis","neurodegenerative","neurological","neuropathy","numbness","nystagmus","polyneuritis","sciatica","seizure","subarachnoid hemorrhage","TIA","tinnitus","Vertigo"],
        "terms_to_avoid": [],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "other_infection",
        "terms_to_find": ["mediterranean fever","mf","abcess","abscess","sepsis","septic shock","bacteremia","brucellosis","chickenpox","diphtheria","finger infection","foot infection","fungal","hand foot","hand infection","hand mouth","hand-foot","hookworm","infection of blood","intestinal worms","leprosy","lymphadenitis","lymphadenopathy","measles","nemotodes","omphalitis","parasite","pinworm","rheumatic fever","rubella","scarlet fever","thrush","toe infection","worms"],
        "terms_to_avoid": [],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "other_infection",
        "terms_to_find": ["leishmania","leishmaniasis"],
        "terms_to_avoid": ["excluding leishmaniasis", "excluding leishmania", "except leishmaniasis"],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "pregnancy",
        "terms_to_find": ["abortion","antenatal","birth","caesarean section","csection","delivery","gestation","miscarriage","placenta","postnatal","postpartum","pregnancy","pregnant","prenatal"],
        "terms_to_avoid": ["not pregnant"],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "renal",
        "terms_to_find": ["hydronephrosis","kidney cysts","kidney failure","kidney stone","nephritis","nephrolithiasis","nephropathy","pyelonephritis","renal calculi","renal calculus","renal failure","renal impairment","renal insufficiency","renal stones"],
        "terms_to_avoid": [],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "respiratory",
        "terms_to_find": ["laryngitis","rhinitis","sinusitis","bronchiolitis","bronchitis","cold","congestion","cough","croup","flu","grippe","influenza","penumonia","pneumonia","pneumonitis","pulmonary infection","respiratory infection","respiratory tract infection","rhinorrhea","running nose","runny nose","tuberculosis","urti","asthma","bronchospasm","COPD","difficulty breathing","dyspnea","emphysema","hemoptysis","lung disease","nebulization","nebulizing","pulmonary disease","pulmonary fibrosis","shortness of breath","sneezing"],
        "terms_to_avoid": [],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "infection",
        "terms_to_find": ["bronchiolitis","bronchitis","cold","congestion","cough","croup","flu","grippe","influenza","penumonia","pneumonia","pneumonitis","pulmonary infection","respiratory infection","respiratory tract infection","rhinorrhea","running nose","runny nose","tuberculosis","urti"],
        "terms_to_avoid": [],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "wound",
        "terms_to_find": ["dressing change"],
        "terms_to_avoid": [],
        "check": ["ht", "gt"]
    },
    {
        "flag_name": "animal_insect_bite",
        "terms_to_find": ["bite","sting","stinging"],
        "terms_to_avoid": [],
        "check": ["ht", "gt"]
    }
]

In [148]:
# Store the rows we change here
# so that we can update the table

update_data = []

# Iterate through the in-memory representation
for rec in data:
    # Create a placeholder update record
    update_rec = {'id':rec['id']}
    
    # A flag we'll use to determine whether the record needs to be updated
    update_record = False
    
    # Get the human_translate value from the record
    ht = rec['human_translate']
    
    # If it is not None, then convert it to lowercase
    if ht:
        ht = ht.lower()
    
    # Get the google_translate value and convert it to lowercase
    # We are not currently using this to generate flags so it is commented out
    # but you could substitute it in below or write additional code if you want
    # to use it for flag generation
    gt = rec['google_translate_feb']
    if gt:
        gt = gt.lower()
    
    # Look at google_tokens_joined field
    gtj = rec['google_tokens_joined']
    if gtj:
        gtj = gtj.lower()
    
    
    # Walk through the different flag types from above and check whether the 
    # human_translate value matches for that flag. If so, create the update record
    # for that flag and then mark our update boolean indicator true so that we know
    # to update the appropriate record in the database. All records that will be updated
    # have their update record put into the update_data list.
    for term in flag_terms:
        if (ht and term in ht) or (gt and term in gt) or (gtj and term in gtj):
            update_rec["flag_" + "_".join(term.replace("-","_").split())] = 1
            update_record = True

    for tup in multiple_flag_terms:
        if (ht and all(x in ht for x in tup)) or (gt and all(x in gt for x in tup)) or (gtj and all(x in gtj for x in tup)):
            update_rec["flag_" + "_and_".join(tup)] = 1
            update_record = True

    for tup in synonym_flag_terms:
        if (ht and any(x in ht for x in tup)) or (gt and any(x in gt for x in tup)) or (gtj and any(x in gtj for x in tup)):
            update_rec["flag_" + "_".join(tup[0].split())] = 1
            update_record = True

    for tup in complex_flag_terms:
        if (ht and tup[0] in ht and not any(x in ht for x in tup[1:])) or (gt and tup[0] in gt and not any(x in gt for x in tup[1:])) or (gtj and tup[0] in gtj and not any(x in gtj for x in tup[1:])):
            update_rec["flag_" + tup[0].replace(" ","_").replace("-","_")] = 1
            update_record = True

    # complex_set_flag_terms
    for rule in complex_set_flag_terms:
        flag_name = "flag_" + "_".join(rule['flag_name'].split())

        # Continue because we already set this flag
        if flag_name in update_rec.keys():
            if update_rec[flag_name] == 1:
                continue

        if "ht" in rule['check']:
            if ht and any(x in ht for x in rule["terms_to_find"]) and not any(x in ht for x in rule["terms_to_avoid"]):
                update_rec[flag_name] = 1
                update_record = True
                # We set the flag so stop searching
                continue

        if "gt" in rule['check']:
            if gt and any(x in gt for x in rule["terms_to_find"]) and not any(x in gt for x in rule["terms_to_avoid"]):
                update_rec[flag_name] = 1
                update_record = True
                # We set the flag so stop searching
                continue

            if gtj and any(x in gtj for x in rule["terms_to_find"]) and not any(x in gtj for x in rule["terms_to_avoid"]):
                update_rec[flag_name] = 1
                update_record = True
                # We set the flag so stop searching
                continue
            
    # Handle war-related separately. This very likely can be improved upon
    if ht and 'war-related injury' in ht and 'not war-related injury' not in ht:
        update_rec['flag_conflict_related'] = 1
        update_record = True
    
    # If we created any flags, update_record is true so put this record in the list 
    # of records to update.
    if update_record:
        # Create comprehensive injury flag per Ranya's request
        keys = update_rec.keys()
        if ('flag_injury' in keys and update_rec['flag_injury'] == 1) or ('flag_wound' in keys and update_rec['flag_wound'] == 1):
            update_rec['flag_comprehensive_injury'] = 1
        else:
            update_rec['flag_comprehensive_injury'] = 0
                
        update_data.append(update_rec)

In [149]:
# How many records are we going to update in the arabic_values table?
len(update_data)

119891

In [150]:
# What do the update records look like? 
update_data[-10:]

[{'flag_comprehensive_injury': 0, 'flag_head': 1, 'id': 400108},
 {'flag_comprehensive_injury': 0, 'flag_pregnancy': 1, 'id': 400115},
 {'flag_comprehensive_injury': 0, 'flag_follow_up': 1, 'id': 400117},
 {'flag_comprehensive_injury': 0, 'flag_follow_up': 1, 'id': 400118},
 {'flag_comprehensive_injury': 0, 'flag_follow_up': 1, 'id': 400119},
 {'flag_comprehensive_injury': 0, 'flag_derm': 1, 'id': 400169},
 {'flag_comprehensive_injury': 0, 'flag_derm': 1, 'id': 400171},
 {'flag_comprehensive_injury': 0, 'flag_neck': 1, 'id': 400229},
 {'flag_comprehensive_injury': 0,
  'flag_infection': 1,
  'flag_respiratory': 1,
  'id': 400269},
 {'flag_comprehensive_injury': 0, 'flag_derm': 1, 'id': 400307}]

In [151]:
# Update the arabic_values table with the update_records' data
# 1. Create the columns we need
# 2. Bulk update for each column

flag_cols = set()
for rec in update_data:
    for k in rec.keys():
        if k != 'id':
            flag_cols.add(k)
flag_cols = sorted(list(flag_cols))

# The trick here is to get the id from a record in arabic values and update that
# record with a None value for each of these flags - that will cause dataset to generate the columns
ref_rec = tab_arabic_values.find_one()
ref_rec_update = {'id':ref_rec['id']}
for col in flag_cols:
    ref_rec_update[col] = None
tab_arabic_values.update(ref_rec_update, ['id'])

# At this point maybe open DB Browser for SQLite to make sure the columns were created.
# The 1 that prints below is the number of records updated.

1

In [152]:
# Now iterate through the flag cols and create a list of each record that needs to set the value for each
# flag column and then bulk update. It is orders of magnitude faster to do it this way than one by one.

# Note - this is generating and executing some super gnarly long SQL queries with tons of ID numbers

for col in flag_cols:
    recs_to_update = []
    for rec in update_data:
        if col in rec.keys():
            recs_to_update.append(rec['id'])
    recs_to_update = sorted(recs_to_update)

    db.query("""
    UPDATE arabic_values
    SET """ + col + """ = 1 
    WHERE id IN (""" + ",".join([str(a) for a in recs_to_update]) +""");
    """)
    
# After this runs, check in the database against to make sure the flags were properly applied.

In [153]:
# Get a new db connection again in case the schema has changed.
# This probably isn't necessary but is a safety measure.

try:
    del db
    del tab_arabic_values
except:
    pass

db = dataset.connect("sqlite:///sams_data_phase22.sqlite")
tab_arabic_values = db['arabic_values']

In [154]:
# Get a reference to the raw Arabic data table
tab_raw_ar = db['full_raw_scrubbed']

In [155]:
# Get the list of variables used in full_raw_scrubbed and full_raw_english
rec_raw = tab_raw_ar.find_one()
variables = list(rec_raw.keys())
print(",".join(variables))

# Due to previous work, there are flag columns in the full_raw_scrubbed table, but we will ignore them
# because they aren't used in this flag-generation methodology. 

id,a_file_id,a_files_sheets_id,a_sheet_id,acceptance_pattern,analysis,analysis_request,analysis_type,anesthesia_type,assign_method,case,category,center,clinic,clinical_case,col_1,col_2,col_3,col_4,col_5,col_6,col_misc,col_moawak,col_none,col_null,col_to,conflict_related,consultations,daily_number,data_validation,date,date_first_exam,death,death_cause,death_certificate,death_date,death_location,death_time,department,diagnosis,diagnosis_confirmed,discharge,discharge_date,discharge_status,discharged_to,disclaimers,disease,displaced,displacement_duration,dose,drug_class,er,events,exam_type,examination_1,examination_2,examination_3,examination_4,examination_type,facility_type,housing,housing_persons_number,image,image_request,image_type,import_status,info_age,info_card_number,info_card_type,info_care_type,info_geo_address,info_geo_area,info_geo_community,info_geo_country_of_origin,info_geo_district,info_geo_governorate,info_geo_injury_city,info_geo_injury_site,info_geo_injury_state,info_geo

In [156]:
# Create the in-memory arabic_values lookup
# This time, since we created the flags, they'll be in the records

arabic_lookup = {}
arabic_values = [x for x in tab_arabic_values.find()]

for v in arabic_values:
    arabic_lookup[v['arabic']] = v

In [157]:
# Let's test that a value we pull out of the database has a hit in the lookup table.
test_rec = tab_raw_ar.find_one()
diagnosis = test_rec['diagnosis']
print(diagnosis)
print("---------------------- Lookup result below")
print(arabic_lookup[diagnosis])

التهاب مجاري تنفسية سفلى, التهاب قصبات
---------------------- Lookup result below
OrderedDict([('id', 3863), ('arabic', 'التهاب مجاري تنفسية سفلى, التهاب قصبات'), ('google_translate', 'Inflammation of lower respiratory tracts, bronchitis'), ('human_translate', 'bronchiolitis, bronchitis'), ('normalized', None), ('appears_in', "['diagnosis']"), ('google_translate_feb', 'Inflammation of lower respiratory tracts, bronchitis'), ('google_tokens_joined', 'lower respiratory tract infection, bronchitis'), ('orig_value', None), ('added', None), ('flag_ENT', None), ('flag_abdomen', None), ('flag_abdominal_pain', None), ('flag_allergy', None), ('flag_anemia', None), ('flag_animal_insect_bite', None), ('flag_back', None), ('flag_blast', None), ('flag_bleed', None), ('flag_blunt', None), ('flag_burn', None), ('flag_cancer', None), ('flag_cardiovascular', None), ('flag_chest', None), ('flag_complication', None), ('flag_comprehensive_injury', '1'), ('flag_conflict_related', None), ('flag_congenital',

#### The following takes a while to run...

In [158]:
# The insert_many method inserts in chunks of 1000, but this specifies that we don't want
# to start the process until we have this many records to insert.
buffer_size = 50000

flags_to_insert = []

try:
    db['full_raw_flags'].drop()
except:
    pass

tab_raw_flags = db['full_raw_flags']

# Insert a dummy record to create the table
dummy_record = {
    'file_id':None,
    'files_sheets_id':None,
    'sheet_id':None
}

for flag in flag_cols:
    dummy_record[flag] = None
    
tab_raw_flags.insert(dummy_record)
print(tab_raw_flags.count())
tab_raw_flags.delete()


# Iterate through the raw records one by one
for rec in tab_raw_ar.find():
    
    # Include foreign keys that allow us to query against the flag table instead of 
    # joining with the raw data table, which is slow.
    flag_record = {
        'id':rec['id'],
        'file_id':rec['a_file_id'],
        'files_sheets_id':rec['a_files_sheets_id'],
        'sheet_id':rec['a_sheet_id']
    }
    
    # Initialize each flag_record
    for flag in flag_cols:
        flag_record[flag] = None
        
    # Scan the conflict related column for values, but do this before looking at the
    # corresponding Arabic values so that we don't overwrite the Arabic value setting.
    if rec['conflict_related'] is not None:
        if rec['conflict_related'].strip() == 'كبرى' or rec['conflict_related'].strip() =='كبرى':
            flag_record['flag_conflict_related'] = 1
        elif rec['conflict_related'].strip() == 'لا':
            flag_record['flag_conflict_related'] = 0
        else:
            flag_record['flag_conflict_related'] = None
    else:
        flag_record['flag_conflict_related'] = None
        
    # Loop through the variables for each raw data record
    for v in variables:
        # These are obfuscated PII cols, or the flag columns we're ignoring, so skip them
        if 'info_' in v or 'flag_' in v or v == 'id':
            continue
        
        # Get the value in the column
        to_lookup = rec[v]
        
        if to_lookup is None or to_lookup == '.':
            continue
        else:
            
            # We have a legit value, so look it up and grab the flags
            try:
                # There might be a keyerror on the info_ columns' hashed values, etc.
                # I also manually removed some PII from arabic_values, so that might
                # cause an occassional mismatch.
                arabic_values_rec = arabic_lookup[to_lookup]
                for flag in flag_cols:
                    # Should be None if not flagged, so just check for existence
                    if arabic_values_rec[flag]:
                        flag_record[flag] = arabic_values_rec[flag]
            except:
                pass
    
    # Store the record
    flags_to_insert.append(flag_record)

    # Check if we need to insert
    if len(flags_to_insert) > buffer_size:
        tab_raw_flags.insert_many(flags_to_insert)
        
        # Clear the buffer
        flags_to_insert.clear()
        
# We've been through all raw records so make sure the buffer is clear
tab_raw_flags.insert_many(flags_to_insert)
flags_to_insert.clear()

1


In [159]:
# We no longer have use for full_raw_english, so drop it

tab_eng = db['full_raw_english']
tab_eng.drop()

In [160]:
# Guarantee that we've cleaned up

# Fix a few minor spelling errors in the facilities table

result = db.query("""
UPDATE facilities SET district = 'Idlib' WHERE district = 'Idleb';
""")

result = db.query("""
UPDATE facilities SET subdistrict = 'Idlib' WHERE subdistrict = 'Idleb';
""")

result = db.query("""
UPDATE facilities SET district = 'Jisr Ash Shugar' WHERE district = 'Jisr-Ash-Shugur';
""")

result = db.query("""
UPDATE facilities SET subdistrict = 'Jisr Ash Shugar' WHERE subdistrict = 'Jisr-Ash-Shugur';
""")

result = db.query("""
UPDATE facilities SET district = 'Daraa' WHERE district = "Dar'a";
""")

result = db.query("""
UPDATE facilities SET subdistrict = 'Daraa' WHERE subdistrict = "Dar'a";
""")

result = db.query("""
UPDATE facilities SET district = 'Al Mara' WHERE district = "Al Ma'ra";
""")

In [161]:
# Update to location information match on facility_code
new_data = [
    ("SAMS002","Abu Fadel","Syria","Daraa","Daraa","Dael "),
    ("SAMS010","Al Ehsan","Syria","Daraa","Daraa","Kherbet Ghazala"),
    ("SAM011","Al Ehsan Clinic","Syria","Daraa","Daraa","Kherbet Ghazala"),
    ("SAMS010","Al Ehsan RH","Syria","Daraa","Daraa","Kherbet Ghazala"),
    ("SAMS013","Al Hara","Syria","Daraa","As-Sanamayn","As-Sanamayn"),
    ("SAMS015","Al Herak","Syria","Daraa","Izra","Herak"),
    ("SAMS015","Al Herak RH","Syria","Daraa","Izra","Herak"),
    ("SAMS017","Al Jeza","Syria","Daraa","Daraa","Jizeh"),
    ("SAMS017","Al Jeza Clinic","Syria","Daraa","Daraa","Jizeh"),
    ("SAMS024","Al Msaifra","Syria","Daraa","Daraa","msaifra"),
    ("SAMS027","Al Noor","Syria","Daraa","Daraa","Daraa"),
    ("SAMS030","Al Rafed","Syria","Quneitra","Quneitra","Al-Khashniyyeh"),
    ("SAMS030","Al Rafed RH","Syria","Quneitra","Quneitra","Al-Khashniyyeh"),
    ("SAMS030","Al Rafed SAMS Clinic","Lebanon","Quneitra","Quneitra","Al-Khashniyyeh"),
    ("SAMS032","Al Redwan","Syria","Daraa","Izraa","Jasim"),
    ("SAMS032","Al Redwan Clinic","Syria","Daraa","Izraa","Jasim"),
    ("SAMS035","Al Salam Hospital","Syria","Daraa","Daraa","Daraa"),
    ("SAMS036","Al Salam Midwife","Syria","Daraa","Daraa","Daraa"),
    ("SAMS037","Al Yadudeh","Syria","Daraa","Daraa","Mzeireb"),
    ("SAMS038","Al Yaman","Syria","Rural Damascus","Rural Damascus","Kafr Batna"),
    ("SAMS060","Ankhal","Syria","Daraa","As-Sanamayn","As-Sanamayn"),
    ("SAMS062","Artificial Limbs Center (Farha)","Syria","Rural Damascus","Rural Damascus","Kafr Batna"),
    ("SAMS068","Beer Ajam","Syria","Quneitra","Quneitra","Quneitra"),
    ("SAMS073","Douma Dialysis","Syria","Rural Damascus","Duma","Duma"),
    ("SAMS074","Douma FH","Syria","Rural Damascus","Duma","Duma"),
    ("SAMS075","Douma OBGYN","Syria","Rural Damascus","Duma","Duma"),
    ("SAMS076","East Ghouta ICU","Syria","Rural Damascus","Rural Damascus","Kafr Batna"),
    ("SAMS077","Eissa Ajaj","Syria","Daraa","Daraa","Daraa"),
    ("SAMS079","Erbin FH","Syria","Rural Damascus","Rural Damascus","Rural Damascus"),
    ("SAMS087","Hit Med Point","Syria","Daraa","Daraa","Ash-Shajara"),
    ("SAMS090","Ibta RH","Syria","Daraa","Daraa","Dael"),
    ("SAMS097","Ishraqat Amal PSS","Syria","Rural Damascus","Rural Damascus","Rural Damascus"),
    ("SAMS101","Jassim","Syria","Daraa","Izra","Jasim"),
    ("SAMS102","Jbt Khashab","Syria","Quneitra","Quneitra","Khan Arnaba"),
    ("SAMS105","Jlein Med Point","Syria","Daraa","Daraa","Mzeireb"),
    ("SAMS106","Jobar Med Center","Syria","Rural Damascus","Rural Damascus","Kafr Batna"),
    ("SAMS142","Maaraba","Syria","Daraa","Daraa","Busra Esh-Sham"),
    ("SAMS143","Maaraba RH","Syria","Daraa","Daraa","Busra Esh-Sham"),
    ("SAMS153","Muwafeq Dakhl Alla ","Syria","Daraa","Izra","Tassil"),
    ("SAMS154","Muwafeq Dakhl Alla Clinic","Syria","Daraa","Izra","Tassil"),
    ("SAMS155","Nabd Horan ","Syria","Daraa","Daraa","Dael "),
    ("SAMS156","Nabd Horan Clinic","Syria","Daraa","Daraa","Dael "),
    ("SAMS159","Nawa","Syria","Daraa","Izra","Nawa"),
    ("SAMS160","Nawa Clinic","Syria","Daraa","Izra","Nawa"),
    ("SAMS161","Neonatal ICU-hamoria","Syria","Rural Damascus","Rural Damascus","Kafr Batna"),
    ("SAMS162","Nuaima","Syria","Daraa","Daraa","Daraa"),
    ("SAMS173","Rawan Birth Center","Syria","Daraa","Daraa","Ash-Shajara"),
    ("SAMS175","Sahm Al Jolan Clinic","Syria","Daraa","Daraa","Ash-Shajara"),
    ("SAMS176","Saida","Syria","Daraa","Daraa","Sayda"),
    ("SAMS177","Saida PSS","Syria","Daraa","Daraa","Sayda"),
    ("SAMS185","Sham ","Syria","Rural Damascus","Rural Damascus","Arbin"),
    ("SAMS186","Shifa","Jordan","Rural Damascus","Duma","Duma"),
    ("SAMS187","Shifa Mobile hospital","Jordan","Rural Damascus","Duma","Duma"),
    ("SAMS194","Shohadaa Horan","Syria","Daraa","Daraa","Daraa"),
    ("SAMS192","Tafas Clinic","Syria","Daraa","Daraa","Mzeireb"),
    ("SAMS193","Tafas PSS","Syria","Daraa","Daraa","Mzeireb"),
    ("SAMS195","Tal Shehab","Syria","Daraa","Daraa","Mzeireb"),
    ("SAMS198","Tassil RH","Syria","Daraa","Izra","Tassil"),
    ("SAMS201","Um Walad","Syria","Daraa","Daraa","Mseifra"),
    ("SAMS203","Wadi Al Yarmouk","Syria","Daraa","Daraa","Ash-Shajara"),
    ("SAMS204","Wadi Al Yarmouk FH","Syria","Daraa","Daraa","Ash-Shajara"),
    ("SAMS205","White Hands PSS","Syria","Daraa","Daraa","Mzeireb"),
    ("SAMS208","Zayzun","Syria","Daraa","Daraa","Mzeireb"),
    ("SAMS209","Zayzun","Syria","Daraa","Daraa","Mzeireb")
]

def make_query_string(t, field, index):
    s = "UPDATE facilities SET " + field + " ='" + t[index] + "' WHERE facility_code = '" + t[0] + "';"
    return s

query_queue = []

for facility in new_data:
    query_queue.append(make_query_string(facility, "country", 2))
    query_queue.append(make_query_string(facility, "governorate", 3))
    query_queue.append(make_query_string(facility, "district", 4))
    query_queue.append(make_query_string(facility, "subdistrict", 5))
    
for query in query_queue:
    result = db.query(query)

In [162]:
query_queue = [
    "UPDATE files SET facility_id = 124 WHERE id IN (594, 599, 747, 947);",
    "UPDATE files SET facility_id = 71  WHERE id IN (42);",
    "UPDATE files SET facility_id = 312 WHERE id IN (627, 1023);",
    "UPDATE files SET facility_id = 119 WHERE id IN (635, 796, 884, 956, 1102);",
    "UPDATE files SET facility_id = 88  WHERE id IN (668, 1039, 1041, 1042);",
    "UPDATE files SET facility_id = 146 WHERE id IN (719, 874, 940);",
    "UPDATE files SET facility_id = 118 WHERE id IN (806, 891, 964, 1034);",
    "UPDATE files SET facility_id = 92  WHERE id IN (834);",
    "UPDATE files SET facility_id = 129 WHERE id IN (1061, 1120);",
    "UPDATE files SET facility_id = 3   WHERE id IN (981);"
]

for query in query_queue:
    result = db.query(query)

### Export full flags table to CSV

In [163]:
# This used to be a part of dataset but was extracted to its own library
# https://github.com/pudo/datafreeze
from datafreeze import freeze

# Export database table to CSV
import csv

In [164]:
# You can change this query to export a different set of data
result = db.query("""
SELECT  files.id as files_id,
        files.year,
        files.month,
        files.year || '-' || files.month || '-01' AS full_date,
        facilities.id AS facility_id,
        facilities.facility_parent_id,
        facilities.facilityname,
        facilities.country,
        facilities.governorate,
        facilities.district,
        facilities.subdistrict,
        facilities.facility_type,
        full_raw_flags.flag_abdomen,
        full_raw_flags.flag_abdominal_pain,
        full_raw_flags.flag_allergy,
        full_raw_flags.flag_anemia,
        full_raw_flags.flag_animal_insect_bite,
        full_raw_flags.flag_back,
        full_raw_flags.flag_blast,
        full_raw_flags.flag_bleed,
        full_raw_flags.flag_blunt,
        full_raw_flags.flag_burn,
        full_raw_flags.flag_cancer,
        full_raw_flags.flag_cardiovascular,
        full_raw_flags.flag_chest,
        full_raw_flags.flag_complication,
        full_raw_flags.flag_conflict_related,
        full_raw_flags.flag_congenital,
        full_raw_flags.flag_constipation,
        full_raw_flags.flag_dehydration,
        full_raw_flags.flag_dental_complaint,
        full_raw_flags.flag_derm,
        full_raw_flags.flag_diabetes,
        full_raw_flags.flag_diarrhea_dysentery,
        full_raw_flags.flag_endocrine,
        full_raw_flags.flag_ENT,
        full_raw_flags.flag_explosive,
        full_raw_flags.flag_eye,
        full_raw_flags.flag_facial,
        full_raw_flags.flag_fatigue,
        full_raw_flags.flag_fever,
        full_raw_flags.flag_follow_up,
        full_raw_flags.flag_fracture,
        full_raw_flags.flag_gi_complaint,
        full_raw_flags.flag_growth_delay,
        full_raw_flags.flag_gu,
        full_raw_flags.flag_gunshot,
        full_raw_flags.flag_gyn_women,
        full_raw_flags.flag_head,
        full_raw_flags.flag_headache,
        full_raw_flags.flag_history_of,
        full_raw_flags.flag_hyperlipidemia,
        full_raw_flags.flag_infection,
        full_raw_flags.flag_injury,
        full_raw_flags.flag_injury_neuro,
        full_raw_flags.flag_liver_dysfunction,
        full_raw_flags.flag_lower_extremity,
        full_raw_flags.flag_malnutrition,
        full_raw_flags.flag_mental_health,
        full_raw_flags.flag_musculoskeletal_pain,
        full_raw_flags.flag_nausea_vomiting,
        full_raw_flags.flag_neck,
        full_raw_flags.flag_nerve,
        full_raw_flags.flag_neuro_complaint,
        full_raw_flags.flag_neurologic,
        full_raw_flags.flag_orthopedic,
        full_raw_flags.flag_other_infection,
        full_raw_flags.flag_pain,
        full_raw_flags.flag_pelvic,
        full_raw_flags.flag_pregnancy,
        full_raw_flags.flag_renal,
        full_raw_flags.flag_respiratory,
        full_raw_flags.flag_shrapnel,
        full_raw_flags.flag_spinal,
        full_raw_flags.flag_spine,
        full_raw_flags.flag_stab,
        full_raw_flags.flag_stroke,
        full_raw_flags.flag_suspected,
        full_raw_flags.flag_traffic_accident,
        full_raw_flags.flag_trauma,
        full_raw_flags.flag_upper_extremity,
        full_raw_flags.flag_urologic,
        full_raw_flags.flag_vascular,
        full_raw_flags.flag_wound,
        full_raw_flags.flag_comprehensive_injury

FROM full_raw_flags
JOIN files on files.id = full_raw_flags.file_id
JOIN facilities on files.facility_id = facilities.id

WHERE files.facility_id IS NOT NULL 
AND files.month IS NOT NULL
AND files.skipped = 0
AND files.ignore = 0;
""")

# This used to be a part of dataset but was extracted to its own library
# https://github.com/pudo/datafreeze
freeze(result, format='csv', filename='full_raw_flags.csv')

In [165]:
# This is optional and will generate a copy of the database that will be gigabytes in size.
shutil.copy2("sams_data_phase22.sqlite",'sams_data_phase22_output_2018-04-05.sqlite')

'sams_data_phase22_output_2018-04-05.sqlite'