### Fix file issues and re-export data

There were a few files with missing facility_id and other fields. This notebook fixes the issues and re-exports the data for Ranya.

In [6]:
# For copying files and working with file directories
import os
import shutil

# Regular expressions
# You can use these for pattern matching if you're so inclined
import re

# Connect to a SQLite database in a lazy manner.
import dataset

# This used to be a part of dataset but was extracted to its own library
# https://github.com/pudo/datafreeze
from datafreeze import freeze

# Export database table to CSV
import csv

In [8]:
# Working-copy setup.
#
# `new_db_name` is the database this notebook works on. Any previous working
# copy is deleted and a fresh one is created from the template, so the
# original template database is never modified by the cells below.

# Name format: whatever_name_you_want.sqlite
original_db_name = "sams_data_phase21_template.sqlite"
new_db_name = "sams_data_phase21.sqlite"

# Remove a stale working copy if one exists. A missing file is expected on a
# first run; any other failure (e.g. the file is locked or not writable) is
# allowed to propagate, because continuing would silently reuse the old copy.
try:
    os.remove(new_db_name)
    print("Removed", new_db_name)
except FileNotFoundError:
    pass

# Create the working copy from the template. If this fails, stop here:
# connecting to a missing SQLite file would create an empty database and the
# later cells would fail with a much less obvious error.
try:
    shutil.copy2(original_db_name, new_db_name)
except OSError as copy_error:
    print("Could not create", new_db_name, "from template:", original_db_name, "-", copy_error)
    raise

print("Created", new_db_name,"from template:", original_db_name)

# All operations will be on the new database, not the original source one

Created sams_data_phase21.sqlite from template: sams_data_phase21_template.sqlite


In [9]:
# Open the working copy (never the template) through dataset
database_url = "sqlite:///" + new_db_name
db = dataset.connect(database_url)

In [10]:
# Get a reference to the arabic_values table
# NOTE(review): this handle is not used by any later cell shown in this
# notebook -- confirm whether it is still needed.
tab_arabic_values = db['arabic_values']

In [11]:
# Corrections for rows in the files table: each statement assigns a
# facility_id to a single file, identified by its id.
query_queue = [
    "UPDATE files SET facility_id = 168 WHERE id IN (266);",
    "UPDATE files SET facility_id = 54 WHERE id IN (270);",
    "UPDATE files SET facility_id = 129 WHERE id IN (994);",
    "UPDATE files SET facility_id = 310 WHERE id IN (259);",
]

# Run all corrections inside one explicit transaction. The `with db:` block
# commits when every statement succeeds and rolls back if any of them raises,
# so the database is never left with only some of the fixes applied and the
# writes do not depend on implicit autocommit.
with db:
    for query in query_queue:
        result = db.query(query)

### Export full flags table, full_raw_flags, to CSV

This is a big data set, so the export might take a few minutes. The resulting CSV will be ~160 MB.

You should be able to pull this into Python, R, Tableau, etc. for analysis. It probably has too many records to open in Excel.

In [12]:
# Build the export: one row per full_raw_flags record, with the file's
# year/month and the facility's descriptive columns attached.
# You can change this query to export a different set of data.
#
# - full_raw_flags is joined to files on file_id, and files to facilities on
#   facility_id.
# - Although both joins are LEFT OUTER, the WHERE clause filters on files
#   columns, so flag rows without a matching file are dropped.
# - Only files with a facility_id and a month, and with skipped = 0 and
#   ignore = 0, are exported.
# - full_date is year, '-', month and '-01' concatenated as text.
#   NOTE(review): if month is stored without a leading zero this yields values
#   like 2017-3-01 rather than 2017-03-01 -- confirm before parsing as a date.
result = db.query("""
SELECT  files.id as files_id,
        files.year,
        files.month,
        files.year || '-' || files.month || '-01' AS full_date,
        facilities.id AS facility_id,
        facilities.facility_parent_id,
        facilities.facilityname,
        facilities.country,
        facilities.governorate,
        facilities.district,
        facilities.subdistrict,
        facilities.facility_type,
        full_raw_flags.flag_abdomen,
        full_raw_flags.flag_abdominal_pain,
        full_raw_flags.flag_allergy,
        full_raw_flags.flag_anemia,
        full_raw_flags.flag_animal_insect_bite,
        full_raw_flags.flag_back,
        full_raw_flags.flag_blast,
        full_raw_flags.flag_bleed,
        full_raw_flags.flag_blunt,
        full_raw_flags.flag_burn,
        full_raw_flags.flag_cancer,
        full_raw_flags.flag_cardiovascular,
        full_raw_flags.flag_chest,
        full_raw_flags.flag_complication,
        full_raw_flags.flag_conflict_related,
        full_raw_flags.flag_congenital,
        full_raw_flags.flag_constipation,
        full_raw_flags.flag_dehydration,
        full_raw_flags.flag_dental_complaint,
        full_raw_flags.flag_derm,
        full_raw_flags.flag_diabetes,
        full_raw_flags.flag_diarrhea_dysentery,
        full_raw_flags.flag_endocrine,
        full_raw_flags.flag_ENT,
        full_raw_flags.flag_explosive,
        full_raw_flags.flag_eye,
        full_raw_flags.flag_facial,
        full_raw_flags.flag_fatigue,
        full_raw_flags.flag_fever,
        full_raw_flags.flag_follow_up,
        full_raw_flags.flag_fracture,
        full_raw_flags.flag_gi_complaint,
        full_raw_flags.flag_growth_delay,
        full_raw_flags.flag_gu,
        full_raw_flags.flag_gunshot,
        full_raw_flags.flag_gyn_women,
        full_raw_flags.flag_head,
        full_raw_flags.flag_headache,
        full_raw_flags.flag_history_of,
        full_raw_flags.flag_hyperlipidemia,
        full_raw_flags.flag_infection,
        full_raw_flags.flag_injury,
        full_raw_flags.flag_injury_neuro,
        full_raw_flags.flag_liver_dysfunction,
        full_raw_flags.flag_lower_extremity,
        full_raw_flags.flag_malnutrition,
        full_raw_flags.flag_mental_health,
        full_raw_flags.flag_musculoskeletal_pain,
        full_raw_flags.flag_nausea_vomiting,
        full_raw_flags.flag_neck,
        full_raw_flags.flag_nerve,
        full_raw_flags.flag_neuro_complaint,
        full_raw_flags.flag_neurologic,
        full_raw_flags.flag_orthopedic,
        full_raw_flags.flag_other_infection,
        full_raw_flags.flag_pain,
        full_raw_flags.flag_pelvic,
        full_raw_flags.flag_pregnancy,
        full_raw_flags.flag_renal,
        full_raw_flags.flag_respiratory,
        full_raw_flags.flag_shrapnel,
        full_raw_flags.flag_spinal,
        full_raw_flags.flag_spine,
        full_raw_flags.flag_stab,
        full_raw_flags.flag_stroke,
        full_raw_flags.flag_suspected,
        full_raw_flags.flag_traffic_accident,
        full_raw_flags.flag_trauma,
        full_raw_flags.flag_upper_extremity,
        full_raw_flags.flag_urologic,
        full_raw_flags.flag_vascular,
        full_raw_flags.flag_wound,
        full_raw_flags.flag_comprehensive_injury

FROM full_raw_flags
LEFT OUTER JOIN files on files.id = full_raw_flags.file_id
LEFT OUTER JOIN facilities on files.facility_id = facilities.id

WHERE files.facility_id IS NOT NULL 
AND files.month IS NOT NULL
AND files.skipped = 0
AND files.ignore = 0;
""")

# Write the query result to a CSV file with datafreeze, which was split out
# of dataset into its own library: https://github.com/pudo/datafreeze
export_options = {
    'format': 'csv',
    'filename': '2018-03-14_full_raw_flags.csv',
}
freeze(result, **export_options)

### Back up the derived SQLite database

In [13]:
# Optional step: snapshot the corrected working database under a new template
# name. Expect the copy to be gigabytes in size.
backup_db_name = 'sams_data_phase22_template.sqlite'
shutil.copy2(new_db_name, backup_db_name)

'sams_data_phase22_template.sqlite'