In [1]:
import logging
import pandas as pd
from pandas import Series
from iso3166 import countries
from datetime import datetime
from csv import QUOTE_NONNUMERIC
from unicef_schools_attribute_cleaning.pandas.dataframe_cleaner import dataframe_cleaner

# Configure logging at INFO level so that INFO messages (for example the
# progress logs emitted while dataframe_cleaner runs) are shown in cell output.
logging.basicConfig(level=logging.INFO)

In [2]:
# Load the raw 2019 ASC schools CSV into a DataFrame.
# low_memory=False makes pandas read the file in one pass instead of in
# chunks, which avoids mixed-dtype inference on this very wide file.
raw_csv_path = '../../data/UNICE_schools_raw_2020_Jun/2019_ASC_schools.csv'
src_df = pd.read_csv(raw_csv_path, low_memory=False)
src_df

Unnamed: 0,duration,enumerator,new_school,idregion,iddistrict,idcouncil,idchiefdom,idward,idsection,idtown,...,geopoint2longitude,geopoint2altitude,geopoint2accuracy,comment_op,formdef_version,key,submissiondate,starttime,endtime,date
0,1740,Morison Mohamed Momoh,School on the list,EASTERN,KAILAHUN,KAILAHUN DISTRICT,DEA,17,SEINGA,SIENGA,...,-10.62374,258.7,5.0,,1.906140e+09,uuid:5cb166c9-05bf-4b8a-94e9-6566c8ee4fcb,6/29/2019 0:25,6/24/2019 9:37,6/24/2019 10:06,6/24/2019 9:37
1,4737,Morison Mohamed Momoh,New School with Paper Questionnaire,EASTERN,KAILAHUN,KAILAHUN DISTRICT,DEA,15,BAIWALA,BAIWALA,...,,,,,1.906140e+09,uuid:ca4fb19e-bed3-4a94-a0d4-c55b69f5d7c9,6/29/2019 0:25,6/24/2019 11:04,6/25/2019 22:08,6/25/2019 11:04
2,2748,Morison Mohamed Momoh,School on the list,EASTERN,KAILAHUN,KAILAHUN DISTRICT,DEA,17,SIENGA,GBAHAMA,...,-10.67104,248.3,5.0,,1.906140e+09,uuid:2872c576-f82e-4896-925a-ad22d1af54f4,6/21/2019 22:50,6/19/2019 12:39,6/19/2019 13:30,6/19/2019 12:40
3,1730,Morison Mohamed Momoh,School on the list,EASTERN,KAILAHUN,KAILAHUN DISTRICT,DEA,17,BAIWALA,BAIWALLA,...,-10.64993,216.9,5.0,,1.906140e+09,uuid:4b782dd1-9796-4ee0-8f9f-e3902045efdb,6/29/2019 0:25,6/25/2019 10:06,6/25/2019 10:35,6/25/2019 10:06
4,3143,Morison Mohamed Momoh,School on the list,EASTERN,KAILAHUN,KAILAHUN DISTRICT,DEA,17,SIENGA,TAKPOIMA,...,-10.66492,240.3,5.0,,1.906140e+09,uuid:7ff82373-9bbb-4c48-9150-f6081d8d326b,6/21/2019 22:50,6/19/2019 14:36,6/19/2019 15:32,6/19/2019 14:36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11183,,,,,,,,,,,...,,,,,,,,,,
11184,,,,,,,,,,,...,,,,,,,,,,
11185,,,,,,,,,,,...,,,,,,,,,,
11186,,,,,,,,,,,...,,,,,,,,,,


In [3]:
# ^ Problem 1: the source data has 3764 columns, which makes it difficult to map columns correctly onto the UNICEF schema.
# The fuzzy matching in dataframe_cleaner will pick up some of them.

In [4]:
# Problem 2: the 'date' values are not in ISO 8601 format (they look like
# '6/24/2019 9:37'), and missing entries are float NaN rather than strings,
# so the column cannot be parsed automatically without an explicit format.
src_df['date']

0         6/24/2019 9:37
1        6/25/2019 11:04
2        6/19/2019 12:40
3        6/25/2019 10:06
4        6/19/2019 14:36
              ...       
11183                NaN
11184                NaN
11185                NaN
11186                NaN
11187                NaN
Name: date, Length: 11188, dtype: object

In [5]:
def fix_date(value):
    """Parse one raw 'date' cell (e.g. ``'6/24/2019 9:37'``) into a ``datetime``.

    Parameters
    ----------
    value : object
        Raw cell value. Strings are parsed with the ``'%m/%d/%Y %H:%M'``
        format after surrounding whitespace is removed. Anything that is not
        a string (for example the float NaN pandas uses for missing cells)
        is treated as missing.

    Returns
    -------
    datetime.datetime or None
        The parsed timestamp, or ``None`` when the value is missing or blank.

    Raises
    ------
    ValueError
        If a non-blank string does not match the expected format.
    """
    if not isinstance(value, str):
        return None
    text = value.strip()
    if not text:
        # A blank cell carries no date; treat it like a missing value instead
        # of letting strptime raise on the empty string.
        return None
    return datetime.strptime(text, '%m/%d/%Y %H:%M')

In [6]:
# Build the preprocessed frame as a NEW DataFrame instead of aliasing src_df.
# Assigning into an alias would overwrite src_df['date'] in place, so running
# this cell a second time would feed already-parsed timestamps (not strings)
# to fix_date and silently turn every date into NaT. With assign(), src_df
# keeps the raw strings and this cell is safe to re-run.
preprocess_df = src_df.assign(date=src_df['date'].apply(fix_date))
preprocess_df['date']

0       2019-06-24 09:37:00
1       2019-06-25 11:04:00
2       2019-06-19 12:40:00
3       2019-06-25 10:06:00
4       2019-06-19 14:36:00
                ...        
11183                   NaT
11184                   NaT
11185                   NaT
11186                   NaT
11187                   NaT
Name: date, Length: 11188, dtype: datetime64[ns]

In [7]:
# Run dataframe_cleaner on the preprocessed frame for Sierra Leone ('SL').
# The keyword arguments are gathered in one mapping so the settings used for
# this run can be read at a glance.
country = countries.get('SL')
cleaner_kwargs = {
    'dataframe': preprocess_df,
    'country': country,
    'is_private': True,
    'provider': "ASC",
    'provider_is_private': True,
}
df = dataframe_cleaner(**cleaner_kwargs)
df

INFO:unicef_schools_attribute_cleaning.pandas.dataframe_cleaner:copying dataframe...
INFO:unicef_schools_attribute_cleaning.pandas.dataframe_cleaner:standardizing column names...
INFO:unicef_schools_attribute_cleaning.pandas.standardize_column_names:uuid column not found, generating uuid4
INFO:unicef_schools_attribute_cleaning.pandas.standardize_column_names:renaming columns: {'computers_number': 'num_computers',
 'geopointaltitude': 'altitude',
 'geopointlatitude': 'lat',
 'geopointlongitude': 'lon',
 'idschool_name': 'name',
 'respondent_name': 'person_contact',
 'sch_email': 'email',
 'sch_type': 'educ_level',
 'verify_district': 'admin3',
 'water_source': 'water'}
INFO:unicef_schools_attribute_cleaning.pandas.standardize_column_names:adding 31 columns from schema: ['admin0',
 'admin1',
 'admin2',
 'admin4',
 'admin_code',
 'admin_id',
 'address',
 'address2',
 'phone_number',
 'postal_code',
 'gps_confidence',
 'num_students',
 'num_teachers',
 'connectivity',
 'type_connectivity',

Unnamed: 0,country_code,admin0,admin1,admin2,admin3,admin4,admin_code,admin_id,name,address,...,description,last_update,tower_dist,tower_type_service,tower_type,tower_code,tower_latitude,tower_longitude,is_private,uuid
0,SL,Sierra Leone,Eastern,Kailahun,Dia,,SLE.1.1.1_1,"3.6,SLE,GID_3=SLE.1.1.1_1",ALQUDUS ISLAMIC PRIMARY SCHOOL (SEINGA SECTI...,,...,,,,,,,,,True,757e76de-dfe6-49ca-8b11-2a9d190398f3
1,SL,Sierra Leone,Eastern,Kailahun,Dia,,SLE.1.1.1_1,"3.6,SLE,GID_3=SLE.1.1.1_1",BADRUDEEN ISLAMIC PRIMARY SCHOOL,,...,,,,,,,,,True,3727fe76-c67f-48c3-b65d-657616a9f27f
2,SL,Sierra Leone,Eastern,Kailahun,Malema,,SLE.1.1.7_1,"3.6,SLE,GID_3=SLE.1.1.7_1","COMMUNITY PRIMARY SCHOOL (SIENGA SECTION, GBA...",,...,,,,,,,,,True,99fc22de-eeb7-46f1-b7c8-9849a7c09c35
3,SL,Sierra Leone,Eastern,Kailahun,Dia,,SLE.1.1.1_1,"3.6,SLE,GID_3=SLE.1.1.1_1",EARLY CHILDHOOD CARE AND DEVELOPMENT CENTRE (...,,...,,,,,,,,,True,24e02753-e447-406c-9182-fbf96c52fe83
4,SL,Sierra Leone,Eastern,Kailahun,Malema,,SLE.1.1.7_1,"3.6,SLE,GID_3=SLE.1.1.7_1",KAILAHUN DISTRICT EDUCATION COUNCIL PRIMARY...,,...,,,,,,,,,True,891a61d9-5331-4cb4-b211-eb147b173163
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11175,SL,Sierra Leone,Western,Western Urban,Freetown1,,SLE.4.2.1_1,"3.6,SLE,GID_3=SLE.4.2.1_1",WILBERFORCE PRIMARY MUNICIPAL SCHOOL - INFANT...,,...,,,,,,,,,True,e90c3b58-7775-4e93-b741-9aac7200a2ae
11176,SL,Sierra Leone,Western,Western Urban,Freetown1,,SLE.4.2.1_1,"3.6,SLE,GID_3=SLE.4.2.1_1",WILBERFORCE PRIMARY MUNICIPAL SCHOOL INFANTS ...,,...,,,,,,,,,True,1a48af34-46e7-4f03-9c7b-4d9beaa9ccc9
11177,SL,Sierra Leone,Western,Western Urban,Freetown1,,SLE.4.2.1_1,"3.6,SLE,GID_3=SLE.4.2.1_1",WONDERLAND NURSERY AND PREPARATORY SCHOOL (MU...,,...,,,,,,,,,True,7d71201e-0f2f-4c25-9f3d-24c5621d8a0d
11178,SL,Sierra Leone,Western,Western Urban,Freetown1,,SLE.4.2.1_1,"3.6,SLE,GID_3=SLE.4.2.1_1",WONDERS INTERNATIONAL ACADEMY PRESCHOOL (HIL...,,...,,,,,,,,,True,a960f6d2-50d1-4f71-b069-d385aafaf3a4


In [8]:
# Write the cleaned frame to CSV: the index column is omitted and every
# non-numeric field is quoted.
df.to_csv(
    path_or_buf='sierra_leone_cleaned.csv',
    index=False,
    quoting=QUOTE_NONNUMERIC,
)

In [9]:
# Open the exported CSV in the system's default application
# (LibreOffice, Excel, or another spreadsheet tool) for a visual check.
# NOTE(review): the `open` shell command is presumably the macOS launcher;
# on other platforms this line likely needs a different command - confirm.
!open sierra_leone_cleaned.csv