In [1]:
import logging
import pandas as pd

from unicef_schools_attribute_cleaning.utils.standardize_column_names import standardize_column_names
from unicef_schools_attribute_cleaning.models.school_pandas_filter import school_pandas_filter

# Emit INFO-level log records so messages logged by the helpers show up in the notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# NOTE(review): df_src does not appear to be used later in this notebook -- confirm
# before removing this read.
df_src = pd.read_csv(
    '../../data/UNICE_schools_raw_2020_Jun/HN-UNICEF-0-government-0.csv',
    low_memory=False,
)

# Load the uuid-tagged Honduras file and stamp the same country code and
# provider attributes onto every row.
df = pd.read_csv(
    '../../data/unicef_fixed/honduras_original_uuid.csv',
    low_memory=False,
).assign(
    country_code='HN',
    is_private=True,
    provider='devseed',
    provider_is_private=True,
)

df

Unnamed: 0,admin2,admin3,school_id,name,educ_level,environment,address,lon,lat,admin4,speed_connectivity,uuid,country_code,is_private,provider,provider_is_private
0,Atlántida,La Ceiba,10100001,ESCUELA GUADALUPE DE QUEZADA,Básica,Urbano,"BARRIO EL CENTRO, AVENIDA SAN ISIDRO",-91.48873,0.00001,,,fc3f885a-42c3-41d7-89a3-0a5749744e02,HN,True,devseed,True
1,Atlántida,La Ceiba,10100002,JOSE CECILIO DEL VALLE,Básica,Urbano,BELLA VISTA,-86.78323,15.77059,,,cd2daa21-3eb6-4c74-ae0d-1bcd2da4160c,HN,True,devseed,True
2,Atlántida,La Ceiba,10100004,AUGUSTO C COELLO,Básica,Urbano,B INGLES,-86.79567,15.78602,LA CEIBA,0.0,854f1c6b-439e-481a-b64b-b13613d2bea0,HN,True,devseed,True
3,Atlántida,La Ceiba,10100005,JOSE TRINIDAD CABAÑAS,Básica,Urbano,LA MERCED,-86.78438,15.77758,LA CEIBA,0.0,339924bb-9877-4355-a924-464473041ed6,HN,True,devseed,True
4,Atlántida,La Ceiba,10100006,GUSTAVO A CASTAÑEDA,Básica,Urbano,B MEJIA,-86.79670,15.77411,LA CEIBA,0.0,5058346b-e49e-4bb4-be97-d3bc17ed2449,HN,True,devseed,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17587,,,80100113,,,,,,,DISTRITO CENTRAL,4.0,6189ce1a-1a58-4daa-a03d-d738aab7e8b2,HN,True,devseed,True
17588,,,80100203,,,,,,,DISTRITO CENTRAL,4.0,2b702549-49b5-467e-9ec7-e75084544016,HN,True,devseed,True
17589,,,150300138,,,,,,,CATACAMAS,4.0,fb643ef4-1c83-440f-b243-5220dbaef926,HN,True,devseed,True
17590,,,150300038,,,,,,,LAS MESETAS,4.0,f8a00e76-70ec-4ed8-ae00-961ce84519aa,HN,True,devseed,True


In [2]:
# Standardize the column names of df in place using the project helper.
# Per the log output of this cell, the helper adds the schema columns that are
# missing from the frame and removes columns that are not in the School schema.
# NOTE(review): that log shows 'school_id' being removed -- confirm the identifier
# is not needed downstream, or map it to a schema column before this step.
standardize_column_names(df, inplace=True)
df

INFO:root:adding 33 columns from schema: ['admin0',
 'admin1',
 'admin_code',
 'admin_id',
 'address2',
 'phone_number',
 'person_contact',
 'email',
 'postal_code',
 'altitude',
 'gps_confidence',
 'date',
 'num_students',
 'num_teachers',
 'connectivity',
 'type_connectivity',
 'latency_connectivity',
 'availability_connectivity',
 'num_computers',
 'type_school',
 'num_classrooms',
 'num_sections',
 'water',
 'electricity',
 'num_latrines',
 'description',
 'last_update',
 'tower_dist',
 'tower_type_service',
 'tower_type',
 'tower_code',
 'tower_latitude',
 'tower_longitude']
INFO:root:removing columns: ['school_id'] (not in School schema)


Unnamed: 0,admin2,admin3,name,educ_level,environment,address,lon,lat,admin4,speed_connectivity,...,electricity,num_latrines,description,last_update,tower_dist,tower_type_service,tower_type,tower_code,tower_latitude,tower_longitude
0,Atlántida,La Ceiba,ESCUELA GUADALUPE DE QUEZADA,Básica,Urbano,"BARRIO EL CENTRO, AVENIDA SAN ISIDRO",-91.48873,0.00001,,,...,,,,,,,,,,
1,Atlántida,La Ceiba,JOSE CECILIO DEL VALLE,Básica,Urbano,BELLA VISTA,-86.78323,15.77059,,,...,,,,,,,,,,
2,Atlántida,La Ceiba,AUGUSTO C COELLO,Básica,Urbano,B INGLES,-86.79567,15.78602,LA CEIBA,0.0,...,,,,,,,,,,
3,Atlántida,La Ceiba,JOSE TRINIDAD CABAÑAS,Básica,Urbano,LA MERCED,-86.78438,15.77758,LA CEIBA,0.0,...,,,,,,,,,,
4,Atlántida,La Ceiba,GUSTAVO A CASTAÑEDA,Básica,Urbano,B MEJIA,-86.79670,15.77411,LA CEIBA,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17587,,,,,,,,,DISTRITO CENTRAL,4.0,...,,,,,,,,,,
17588,,,,,,,,,DISTRITO CENTRAL,4.0,...,,,,,,,,,,
17589,,,,,,,,,CATACAMAS,4.0,...,,,,,,,,,,
17590,,,,,,,,,LAS MESETAS,4.0,...,,,,,,,,,,


In [None]:
# Optional speedup: lat and lon are required and known to be numeric, so rows that
# cannot satisfy that are discarded here with pandas before validation.
# This also suppresses many validation warnings about lat/long.

# Keep a row only when both coordinates parse as numbers; entries such as
# '(blank)' are coerced to NaN and dropped.
df = df[
    pd.to_numeric(df['lat'], errors='coerce').notna()
    & pd.to_numeric(df['lon'], errors='coerce').notna()
]
df

In [None]:
# Run the School validator over the frame, one row at a time.
schools = df.apply(school_pandas_filter, axis='columns')
schools

speed_connectivity
  ensure this value is greater than or equal to 0 (type=value_error.number.not_ge; limit_value=0)
environment
  value is not a valid dict (type=type_error.dict)
speed_connectivity
  ensure this value is greater than or equal to 0 (type=value_error.number.not_ge; limit_value=0)
environment
  value is not a valid dict (type=type_error.dict)
environment
  value is not a valid dict (type=type_error.dict)
environment
  value is not a valid dict (type=type_error.dict)
environment
  value is not a valid dict (type=type_error.dict)
environment
  value is not a valid dict (type=type_error.dict)
speed_connectivity
  ensure this value is greater than or equal to 0 (type=value_error.number.not_ge; limit_value=0)
environment
  value is not a valid dict (type=type_error.dict)
speed_connectivity
  ensure this value is greater than or equal to 0 (type=value_error.number.not_ge; limit_value=0)
environment
  value is not a valid dict (type=type_error.dict)
environment
  value is not a

In [18]:
# Discard the None values left by the school_pandas_filter apply:
# only rows whose uuid is set are kept.
schools = schools.loc[schools['uuid'].notna()]
schools

Unnamed: 0,country_code,admin0,admin1,admin2,admin3,admin4,admin_code,admin_id,name,address,...,description,last_update,tower_dist,tower_type_service,tower_type,tower_code,tower_latitude,tower_longitude,is_private,uuid
0,ZW,,,Harare,,,,,ADMIRAL TAIT,,...,1001,,,,,,,,True,2566e8ff-59dc-4b10-8669-31e890102528
1,ZW,,,Harare,,,,,ALEXANDRA PARK,,...,1002,,,,,,,,True,6a4a2d83-e080-4725-88fa-4773ebbd32bc
2,ZW,,,Harare,,,,,ALFRED BEIT PRIMARY,,...,1003,,,,,,,,True,2418eac4-de3a-426a-90ba-6fb62bfb1d08
3,ZW,,,Harare,,,,,ARDBENNIE,,...,1004,,,,,,,,True,a586e318-b564-4985-8f58-d1961e9f58f8
4,ZW,,,Harare,,,,,AVONDALE,,...,1005,,,,,,,,True,4d0f6f3e-f14c-4328-b4c0-ca9ce03ce403
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8266,ZW,,,Midlands,,,,,ORTONS DRIFT,,...,17949,,,,,,,,True,52e143ea-978e-4fe2-ab57-c526c1f8b5a3
8267,ZW,,,Midlands,,,,,UTAH,,...,17951,,,,,,,,True,986060f3-1fab-4cda-bb14-304b82ba421a
8268,ZW,,,Midlands,,,,,MAYWOOD,,...,17981,,,,,,,,True,d58c5813-2739-4249-8663-c20a0c8b01e9
8269,ZW,,,Midlands,,,,,MELROSE,,...,17982,,,,,,,,True,5faa1276-d78c-4273-b332-98a9bc057657


In [19]:
# Show a subset of the columns of the filtered schools frame for a quick visual check.
schools.loc[
    :,
    [
        'name',
        'admin2',
        'lat',
        'lon',
        'num_students',
        'type_connectivity',
        'educ_level',
    ],
]

Unnamed: 0,name,admin2,lat,lon,num_students,type_connectivity,educ_level
0,ADMIRAL TAIT,Harare,-17.82644,31.08971,2390.0,DSL,Primary
1,ALEXANDRA PARK,Harare,-17.79152,31.05566,983.0,,Primary
2,ALFRED BEIT PRIMARY,Harare,-17.78811,31.00001,1784.0,DSL,Primary
3,ARDBENNIE,Harare,-17.87671,31.03505,1240.0,DSL,Primary
4,AVONDALE,Harare,-17.79174,31.03407,1708.0,Fiber,Primary
...,...,...,...,...,...,...,...
8266,ORTONS DRIFT,Midlands,-19.14955,30.65332,332.0,,Primary
8267,UTAH,Midlands,-19.21671,30.30106,250.0,,Primary
8268,MAYWOOD,Midlands,-18.84981,29.66975,34.0,,Primary
8269,MELROSE,Midlands,-19.13653,29.73772,156.0,,Primary
