In [25]:
import logging
import os
from os import chdir

import numpy as np
import pandas as pd
from pandas import Series
from pydantic import ValidationError

from unicef_schools_attribute_cleaning.models.School import School
from unicef_schools_attribute_cleaning.utils.standardize_column_names import standardize_column_names
from unicef_schools_attribute_cleaning.utils.none_words import none_words

# Root of the repository checkout. The default is the original author's local
# path; set the UNICEF_SCHOOLS_REPO environment variable to run this notebook
# from a different checkout without editing the cell.
REPO_ROOT = os.environ.get(
    'UNICEF_SCHOOLS_REPO',
    '/Users/alex/repos/unicef-schools-attribute-cleaning',
)
# The relative data path below is resolved against the repository root.
chdir(REPO_ROOT)
df = pd.read_csv('./data/unicef_fixed/zimbabwe_original_uuid.csv', low_memory=False)

# use the uuid as primary key/index
# df.set_index('uuid', inplace=True)
# df

# apply country code, provider to every row
# (each assignment broadcasts one constant value to the whole column)
df['country_code'] = 'ZW'
df['owner'] = 'UNICEF'
df['is_private'] = True
df['provider'] = 'unknown'
df['provider_is_private'] = True

# Project helper, called with inplace=True so `df` itself is modified.
# NOTE(review): presumably renames recognised columns to the School model's
# field names -- confirm against the helper's implementation.
standardize_column_names(df, inplace=True)
df



Unnamed: 0,Schoolnumber,name,SchoolLevel,admin2,lat,lon,Tot_comp_Student,Enrolment,_tot_func_computers,type_connectivity,uuid,country_code,owner,is_private,provider,provider_is_private
0,1001,ADMIRAL TAIT,Primary,Harare,-17.82644,31.08971,27.0,2390,35,ADSL,2566e8ff-59dc-4b10-8669-31e890102528,ZW,UNICEF,True,unknown,True
1,1002,ALEXANDRA PARK,Primary,Harare,-17.79152,31.05566,26.0,983,50,,6a4a2d83-e080-4725-88fa-4773ebbd32bc,ZW,UNICEF,True,unknown,True
2,1003,ALFRED BEIT PRIMARY,Primary,Harare,-17.78811,31.00001,6.0,1784,9,ADSL,2418eac4-de3a-426a-90ba-6fb62bfb1d08,ZW,UNICEF,True,unknown,True
3,1004,ARDBENNIE,Primary,Harare,-17.87671,31.03505,25.0,1240,29,ADSL,a586e318-b564-4985-8f58-d1961e9f58f8,ZW,UNICEF,True,unknown,True
4,1005,AVONDALE,Primary,Harare,-17.79174,31.03407,221.0,1708,309,Fibre Optic,4d0f6f3e-f14c-4328-b4c0-ca9ce03ce403,ZW,UNICEF,True,unknown,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9620,1618,SUNNY DAY CHRISTIAN,Primary,Harare,(blank),(blank),12.0,154,14,Fibre Optic,5c335fc2-c8be-42d9-9570-85a78d59e21b,ZW,UNICEF,True,unknown,True
9621,1950,PAGOMO PRIMARY,Primary,Manicaland,(blank),(blank),10.0,62,10,,c02963fa-5587-42b8-aacf-5d7ca490546a,ZW,UNICEF,True,unknown,True
9622,4571,ZIMBABWE NATIONAL DEFENCE COLLEGE,Primary,Mashonaland Central,(blank),(blank),,316,2,Fibre Optic,54ed8226-e1d6-4d5d-8675-6dc2208f1025,ZW,UNICEF,True,unknown,True
9623,14816,WASHINGTON HILLS HIGH SCHOOL,Secondary,Mashonaland West,(blank),(blank),30.0,331,33,Fibre Optic,a291c700-6b39-493a-a3e3-4d6687484b85,ZW,UNICEF,True,unknown,True


In [29]:
# we know the lat,lon columns are numeric and are required, so preprocess with pandas
# this gets rid of values like '(blank)' etc.: anything that cannot be parsed
# as a number is coerced to NaN and the row is dropped by the boolean mask.
for required_column in ('lat', 'lon'):
    df = df[pd.to_numeric(df[required_column], errors='coerce').notnull()]

# Tower coordinates get the same row filter, but only when the dataset has
# those columns. The mask must be boolean (`.notnull()`); indexing with the raw
# numeric Series would be interpreted as a column-label lookup instead.
# NOTE(review): this drops rows whose tower coordinates are missing or
# non-numeric -- confirm that is intended, since the School model output shows
# tower_latitude/tower_longitude defaulting to None.
for optional_column in ('tower_latitude', 'tower_longitude'):
    if optional_column in df:
        df = df[pd.to_numeric(df[optional_column], errors='coerce').notnull()]


# connectivity_values = df2['type_connectivity'].unique()
# logging.warning(connectivity_values)


def series_to_school(row: Series) -> School:
    """Validate one DataFrame row as a ``School`` model.

    The row is converted to a plain dict and handed to ``School.parse_obj``.

    Returns the parsed ``School`` on success. If validation fails, the row and
    the validation error are each logged at WARNING level and the sentinel
    ``False`` is returned instead (despite the ``School`` return annotation),
    so callers must filter those entries out.
    """
    record = row.to_dict()
    try:
        school = School.parse_obj(record)
    except ValidationError as validation_error:
        # Log the offending row first, then the reason it was rejected.
        for detail in (row, validation_error):
            logging.warning(detail)
        return False
    return school



# Run the validator once per row (axis='columns' is the named form of axis=1);
# each entry of the result is whatever series_to_school returned for that row.
res = df.apply(series_to_school, axis='columns')
res

name                                         IMPALI PRIMARY
SchoolLevel                                         Primary
admin2                                             Midlands
lat                                                       0
lon                                                       0
Tot_comp_Student                                         56
Enrolment                                               628
_tot_func_computers                                      74
type_connectivity                               Fibre Optic
uuid                   550f2b18-da8a-49bd-a0b5-5405a2c5e386
country_code                                             ZW
owner                                                UNICEF
is_private                                             True
provider                                            unknown
provider_is_private                                    True
Name: 8523, dtype: object
__root__
  invalid lat,lon of 0,0 for name: IMPALI PRIMARY, uuid: 550f2b18

0       country_code='ZW' admin0=None admin1=None admi...
1       country_code='ZW' admin0=None admin1=None admi...
2       country_code='ZW' admin0=None admin1=None admi...
3       country_code='ZW' admin0=None admin1=None admi...
4       country_code='ZW' admin0=None admin1=None admi...
                              ...                        
8269    country_code='ZW' admin0=None admin1=None admi...
8523                                                False
8739    country_code='ZW' admin0=None admin1=None admi...
8756                                                False
8757                                                False
Length: 7922, dtype: object

In [36]:
# filter out the False values from the series_to_school apply
# The mask uses an identity check (`is not False`) rather than `!= False`, so it
# does not depend on how the model objects implement equality or on NumPy's
# element-wise comparison of object arrays.
is_valid_school = res.map(lambda item: item is not False).astype(bool)
schools = res[is_valid_school]
schools

0       country_code='ZW' admin0=None admin1=None admi...
1       country_code='ZW' admin0=None admin1=None admi...
2       country_code='ZW' admin0=None admin1=None admi...
3       country_code='ZW' admin0=None admin1=None admi...
4       country_code='ZW' admin0=None admin1=None admi...
                              ...                        
8266    country_code='ZW' admin0=None admin1=None admi...
8267    country_code='ZW' admin0=None admin1=None admi...
8268    country_code='ZW' admin0=None admin1=None admi...
8269    country_code='ZW' admin0=None admin1=None admi...
8739    country_code='ZW' admin0=None admin1=None admi...
Length: 7919, dtype: object

In [40]:
# Each remaining entry is a validated Pydantic School model; show the first
# one (selected by position, not by index label) as a sample.
schools.iloc[0]

School(country_code='ZW', admin0=None, admin1=None, admin2='Harare', admin3=None, admin4=None, admin_code=None, admin_id=None, name='ADMIRAL TAIT', address=None, address2=None, phone_number=None, person_contact=None, email=None, postal_code=None, lon=31.08971, lat=-17.82644, altitude=None, gps_confidence=None, date=None, num_students=None, num_teachers=None, connectivity=None, type_connectivity=<Connectivity.dsl: 'DSL'>, speed_connectivity=None, latency_connectivity=None, availability_connectivity=None, num_computers=None, type_school=None, educ_level=None, environment=None, num_classrooms=None, num_sections=None, water=None, electricity=None, num_latrines=None, provider='unknown', description=None, last_update=None, tower_dist=None, tower_type_service=None, tower_type=None, tower_code=None, tower_latitude=None, tower_longitude=None, owner='UNICEF', is_private=True, provider_is_private=True, uuid=UUID('2566e8ff-59dc-4b10-8669-31e890102528'))