In [1]:
import logging
import pandas as pd
from pandas import Series
from iso3166 import countries
from csv import QUOTE_NONNUMERIC
from unicef_schools_attribute_cleaning.pandas.dataframe_cleaner import dataframe_cleaner

# Configure the root logger at INFO level so that the INFO progress messages
# emitted by dataframe_cleaner show up in the cell output.
logging.basicConfig(level=logging.INFO)

In [2]:
# Load the raw Rwandan schools CSV (GIS coordinates plus 4G coverage status).
# low_memory=False parses the file in a single pass instead of in chunks, which
# avoids mixed-type inference within a column.
# NOTE(review): the directory is spelled 'UNICE_schools_raw_2020_Jun' -- confirm
# this matches the on-disk name before "fixing" it.
src_df = pd.read_csv('../../data/UNICE_schools_raw_2020_Jun/rwandan_schools_gis_data_4g_coverage_status.csv', low_memory=False)
# Bare last expression: render the frame as the cell output.
src_df

Unnamed: 0,NO,PROVINCE,DISTRICT,SECTOR,CELL,VILLAGE,School Name,Settings in School,PointID (GPS),Degree,Minute,Seconds,Degree_2,Minute_2,Seconds_2,Description,Latitude,Longitude,4G Status
0,1,City of Kigali,Nyarugenge,Gitega,Akabahizi,Iterambere,E.M CYAPEPE,Preprimary,049,1,56.0,41.5,30,3.0,7.8,School Center,-1.9448611111111112,30.05216666666667,covered
1,2,City of Kigali,Nyarugenge,Gitega,Akabahizi,Gihanga,E.M ITETERO,Preprimary,056,1,56.0,56.1,30,3.0,12.7,School Center,-1.9489166666666666,30.053527777777777,covered
2,3,City of Kigali,Nyarugenge,Gitega,Akabahizi,Iterambere,G.S CYAHAFI,9 YBE,048,1,56.0,41.0,30,3.0,12.0,NEAR THE ENTRANCE,-1.9447222222222222,30.053333333333335,covered
3,4,City of Kigali,Nyarugenge,Gitega,Kinyange,Uburezi,E.P GITEGA,Primary,051,1,57.0,35.2,30,3.0,22.7,HEAD TEACHER OFFICE,-1.9597777777777776,30.056305555555557,covered
4,5,City of Kigali,Nyarugenge,Gitega,Kora,Kivumu,ETO MUHAZI I,Secondary A Level,047,1,57.0,9.8,30,3.0,26.7,HEAD TEACHER OFFICE,-1.952722222222222,30.05741666666667,covered
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4229,4230,Northern,Burera,Rwerere,Gacundura,Gacundura,E.P GACUNDURA,Primary,64,1,30.0,9.3,29,51.0,22.9,FLAG POLE,-1.5025833333333334,29.856361111111113,covered
4230,4231,Northern,Burera,Rwerere,Gashoro,Rwerere,G.S CYAPA,12 YBE,68,1,29.0,50.7,29,51.0,47.4,School Center,-1.4974166666666666,29.863166666666668,covered
4231,4232,Northern,Burera,Rwerere,Gashoro,Cyapa,E.M CYAPA,Preprimary,070,1,29.0,52.0,29,51.0,47.8,School Center,-1.4977777777777779,29.86327777777778,covered
4232,4233,Northern,Burera,Rwerere,Ruconsho,Ruconsho,E.P RWERERE,Primary,66,1,31.0,47.0,29,52.0,55.0,FLAG POLE,-1.5297222222222224,29.881944444444446,covered


In [3]:
# Problem 1: the 'Settings in School' column holds 87 unique free-text strings.
# This is a problem because the database expects an enumerated "EducationLevel"
# type (primary, secondary) for this field.
# However, dataframe_cleaner's fuzzy enum matching will translate many of them.
src_df['Settings in School'].unique()

array(['Preprimary', '9 YBE', 'Primary', 'Secondary A Level', '12 YBE',
       'Preprimary, 12 YBE', 'Preprimary, Primary ', 'Preprimary, 9 YBE',
       'VCT', 'Secondary (O&A) Level', 'Secondary', 'Polytechnic',
       'University', 'University ', 'VTC', 'Preprimary, Primary',
       'Pre Primary only', nan, 'College', 'Secondary (O&A) Levels',
       'VTC/TSS ', 'Technical', 'People with Disabilities', 'TVT', 'TSS',
       'TVET', "Secondary A' level", 'Secondary (O&A) level',
       'Secondary A level', ' TSS & VTC', 'Secondary  A Level',
       'Primary ', 'Pre primary, Primary', 'Secondary (O&A)Level',
       'Secondary (O&A)Levels', 'Preprimary ', 'TSS, VTC',
       'Secondary ALevel', 'Secondary O Level', 'Prepramary', 'TSS &VTC',
       'Secondary (O&A) Level ', 'Secondary, Short Courses', 'Institute',
       'COLLEGE', 'TECHNICAL', '12YBE', '9YBE', 'School of Nursing',
       'Preprimary , 12 YBE', "Secondary A'level", 'Preprimary , 9 YBE',
       'Pre primary', 'Secondary(O&A

In [5]:
# Problem 2: the connectivity type (4G) is encoded in the column name
# ('4G Status'), not in the field values.
# dataframe_cleaner does not support this layout, so the column has to be
# pre-processed manually with pandas first.
src_df['4G Status'].unique()

array(['covered', 'Not covered'], dtype=object)

In [6]:
# Manual pandas workaround for Problem 2: a row-wise function that moves the 4G status out of the column name and into field values.
def _4g_updater(row: Series) -> Series:
    """Translate a row's '4G Status' text into explicit connectivity fields.

    The source data encodes the connectivity type (4G) in the column name
    rather than in the values, so this helper copies the row and adds three
    fields derived from the status text:

    * ``connectivity``       -- True when covered, False when not covered
    * ``type_connectivity``  -- '4G' when covered, otherwise None
    * ``tower_type_service`` -- '4G' when covered, otherwise None

    Matching ignores case and surrounding whitespace. A missing (NaN/None),
    non-string or unrecognised status sets all three fields to None instead
    of raising or omitting the fields, so every returned row carries the same
    set of labels.

    Args:
        row: One record of the source frame; must contain a '4G Status' label.

    Returns:
        A new Series holding the original values plus the three fields above.
        The input row is not modified.

    Raises:
        KeyError: If the row has no '4G Status' label.
    """
    raw_status = row['4G Status']
    # Only strings can be normalised: an empty CSV cell arrives as a float NaN,
    # which has no .lower() and would otherwise raise AttributeError.
    status = raw_status.strip().lower() if isinstance(raw_status, str) else None

    if status == "covered":
        connectivity, service = True, '4G'
    elif status == "not covered":
        connectivity, service = False, None
    else:
        # Unknown coverage: keep the labels present but leave the values empty.
        connectivity, service = None, None

    # Work on an explicit copy so the caller's row is never touched.
    result = row.copy()
    result['connectivity'] = connectivity
    result['type_connectivity'] = service
    result['tower_type_service'] = service
    return result

# Run the updater once per row; axis='columns' is the spelled-out form of axis=1.
preprocessed_df = src_df.apply(_4g_updater, axis='columns')
# Preview only the source column and the three derived fields to check the mapping.
preprocessed_df.loc[:, ['4G Status', 'connectivity', 'type_connectivity', 'tower_type_service']]

Unnamed: 0,4G Status,connectivity,type_connectivity,tower_type_service
0,covered,True,4G,4G
1,covered,True,4G,4G
2,covered,True,4G,4G
3,covered,True,4G,4G
4,covered,True,4G,4G
...,...,...,...,...
4229,covered,True,4G,4G
4230,covered,True,4G,4G
4231,covered,True,4G,4G
4232,covered,True,4G,4G


In [7]:
# Run dataframe_cleaner on the pre-processed frame.
# Per its INFO log below, the cleaner copies the frame, standardizes column
# names (alias and fuzzy matching), generates a uuid column when none is found,
# and adds the schema columns that are missing from the input.
country = countries.get('RW')  # iso3166 lookup keyed by 'RW' (Rwanda)
df = dataframe_cleaner(
    dataframe=preprocessed_df, 
    country=country,
    # NOTE(review): the exact meaning of the three arguments below is defined
    # by dataframe_cleaner and is not visible in this notebook -- confirm that
    # these values are right for this dataset.
    is_private=True,
    provider="unicef",
    provider_is_private=True
)

INFO:unicef_schools_attribute_cleaning.pandas.dataframe_cleaner:copying dataframe...
INFO:unicef_schools_attribute_cleaning.pandas.dataframe_cleaner:standardizing column names...
INFO:unicef_schools_attribute_cleaning.pandas.standardize_column_names:uuid column not found, generating uuid4
INFO:unicef_schools_attribute_cleaning.pandas.standardize_column_names:renaming columns: {'DISTRICT': ('admin3', 'alias exact match'),
 'Description': ('description', 'fuzzy match 100%'),
 'Latitude': ('lat', 'alias exact match'),
 'Longitude': ('lon', 'alias exact match'),
 'PROVINCE': ('admin2', 'alias exact match'),
 'School Name': ('name', 'fuzzy match 90%'),
 'Settings in School': ('educ_level', 'alias exact match')}
INFO:unicef_schools_attribute_cleaning.pandas.standardize_column_names:adding 33 columns from schema: ['admin0',
 'admin1',
 'admin4',
 'admin_code',
 'admin_id',
 'address',
 'address2',
 'phone_number',
 'person_contact',
 'email',
 'postal_code',
 'altitude',
 'gps_confidence',
 '

In [8]:
# Write the cleaned frame to 'rwanda_cleaned.csv' (relative to the notebook's
# working directory). QUOTE_NONNUMERIC quotes every non-numeric field and
# index=False leaves the row index out of the file.
df.to_csv('rwanda_cleaned.csv', quoting=QUOTE_NONNUMERIC, index=False)

In [9]:
# Open the exported CSV in a spreadsheet application (LibreOffice, Excel, other).
# NOTE(review): `!` hands the line to the system shell, and `open` is presumably
# the macOS launcher -- other platforms would need a different command
# (e.g. xdg-open on Linux); confirm before sharing.
!open rwanda_cleaned.csv