In [54]:
import logging
import pandas as pd
from pandas import Series
from unicef_schools_attribute_cleaning.models.School import School
from unicef_schools_attribute_cleaning.utils.series_to_school import series_to_school
from unicef_schools_attribute_cleaning.utils.standardize_column_names import standardize_column_names

src_data_dir='../../data/UNICE_schools_raw_2020_Jun'
fixed_data_dir='../../data/unicef_fixed'

In [59]:
# load devseed source + uuid
df_kenya_liquid_uuid = pd.read_csv(f'{fixed_data_dir}/kenya_liquid_original_uuid.csv', low_memory=False)
# use the uuid as primary key/index
df_kenya_liquid_uuid.set_index('uuid', inplace=True)
df_kenya_liquid_uuid

Unnamed: 0_level_0,Name of School,Province,District,County,Latitude,Longitude,Distance(Km)
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
801dbf2c-c795-4e4d-8e1f-726fbf583c20,SOUTHEND ACADEMY,WESTERN,BUNGOMA,BUNGOMA,0.562360,34.561880,0.000038
0c9a96ac-3c87-4961-b1ee-0f930b4f1a0a,JOY SHIRU PRI SCH,NAIROBI,NAIROBI,NAIROBI,-1.265244,36.750676,0.000144
302367ed-7a50-4069-aa91-7a6a70a779e3,ST EDWARDS HIGH SCH,NAIROBI,STAREHE,,-1.272549,36.822084,0.000158
51baf53c-1aa1-4105-914f-a0cfca56aefb,NAIVASHA PRI BOARDING,RIFT VALLEY,NAKURU,NAKURU,-0.704300,36.435070,0.000195
7d46aabd-4b84-4dc5-85d1-2b4b3aac49ad,MLOLONGO PRI SCH,EASTERN,MACHAKOS,MACHAKOS,-1.388401,36.935328,0.000218
...,...,...,...,...,...,...,...
c84f54bd-6a9a-4e9a-8cf2-c5dd4b372f01,DAVA INTEGRATED PRI SCH,NORTH EASTERN,MANDERA,MANDERA,3.953030,41.860900,401.815377
b65c396f-2777-4ccc-880f-ecc6d699fb57,DAVA SPECIAL UNIT,NORTH EASTERN,MANDERA,MANDERA,3.953030,41.860900,401.815377
e81a8321-55e8-4efd-a955-1e4649c6f3c6,BORDER VIEW ACADEMY PRI SCH,NORTH EASTERN,MANDERA,MANDERA,3.942400,41.867270,402.275520
41111108-34ef-42ae-84df-e21fe2338eff,BURUBURU PRI SCH,NORTH EASTERN,MANDERA,MANDERA,3.947800,41.868850,402.566618


In [60]:
# load devseed fixed
df_kenya_liquid_fixed = pd.read_csv(f'{fixed_data_dir}/kenya_liquid_fixed.csv', low_memory=False)
df_kenya_liquid_fixed

# use the uuid as primary key/index
df_kenya_liquid_fixed.set_index('uuid', inplace=True)
df_kenya_liquid_fixed

Unnamed: 0_level_0,name,admin2,admin3,admin4,lat,lon
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
801dbf2c-c795-4e4d-8e1f-726fbf583c20,Southend academy,WESTERN,BUNGOMA,BUNGOMA,0.562360,34.561880
0c9a96ac-3c87-4961-b1ee-0f930b4f1a0a,Joy shiru pri sch,NAIROBI,NAIROBI,NAIROBI,-1.265244,36.750676
302367ed-7a50-4069-aa91-7a6a70a779e3,St edwards high sch,NAIROBI,STAREHE,,-1.272549,36.822084
51baf53c-1aa1-4105-914f-a0cfca56aefb,Naivasha pri boarding,RIFT VALLEY,NAKURU,NAKURU,-0.704300,36.435070
7d46aabd-4b84-4dc5-85d1-2b4b3aac49ad,Mlolongo pri sch,EASTERN,MACHAKOS,MACHAKOS,-1.388401,36.935328
...,...,...,...,...,...,...
c84f54bd-6a9a-4e9a-8cf2-c5dd4b372f01,Dava integrated pri sch,NORTH EASTERN,MANDERA,MANDERA,3.953030,41.860900
b65c396f-2777-4ccc-880f-ecc6d699fb57,Dava special unit,NORTH EASTERN,MANDERA,MANDERA,3.953030,41.860900
e81a8321-55e8-4efd-a955-1e4649c6f3c6,Border view academy pri sch,NORTH EASTERN,MANDERA,MANDERA,3.942400,41.867270
41111108-34ef-42ae-84df-e21fe2338eff,Buruburu pri sch,NORTH EASTERN,MANDERA,MANDERA,3.947800,41.868850


In [62]:
# join the dataframes and look for which columns changed
joined = df_kenya_liquid_fixed.join(df_kenya_liquid_uuid, lsuffix='_fixed')
joined

Unnamed: 0_level_0,name,admin2,admin3,admin4,lat,lon,Name of School,Province,District,County,Latitude,Longitude,Distance(Km)
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
801dbf2c-c795-4e4d-8e1f-726fbf583c20,Southend academy,WESTERN,BUNGOMA,BUNGOMA,0.562360,34.561880,SOUTHEND ACADEMY,WESTERN,BUNGOMA,BUNGOMA,0.562360,34.561880,0.000038
0c9a96ac-3c87-4961-b1ee-0f930b4f1a0a,Joy shiru pri sch,NAIROBI,NAIROBI,NAIROBI,-1.265244,36.750676,JOY SHIRU PRI SCH,NAIROBI,NAIROBI,NAIROBI,-1.265244,36.750676,0.000144
302367ed-7a50-4069-aa91-7a6a70a779e3,St edwards high sch,NAIROBI,STAREHE,,-1.272549,36.822084,ST EDWARDS HIGH SCH,NAIROBI,STAREHE,,-1.272549,36.822084,0.000158
51baf53c-1aa1-4105-914f-a0cfca56aefb,Naivasha pri boarding,RIFT VALLEY,NAKURU,NAKURU,-0.704300,36.435070,NAIVASHA PRI BOARDING,RIFT VALLEY,NAKURU,NAKURU,-0.704300,36.435070,0.000195
7d46aabd-4b84-4dc5-85d1-2b4b3aac49ad,Mlolongo pri sch,EASTERN,MACHAKOS,MACHAKOS,-1.388401,36.935328,MLOLONGO PRI SCH,EASTERN,MACHAKOS,MACHAKOS,-1.388401,36.935328,0.000218
...,...,...,...,...,...,...,...,...,...,...,...,...,...
c84f54bd-6a9a-4e9a-8cf2-c5dd4b372f01,Dava integrated pri sch,NORTH EASTERN,MANDERA,MANDERA,3.953030,41.860900,DAVA INTEGRATED PRI SCH,NORTH EASTERN,MANDERA,MANDERA,3.953030,41.860900,401.815377
b65c396f-2777-4ccc-880f-ecc6d699fb57,Dava special unit,NORTH EASTERN,MANDERA,MANDERA,3.953030,41.860900,DAVA SPECIAL UNIT,NORTH EASTERN,MANDERA,MANDERA,3.953030,41.860900,401.815377
e81a8321-55e8-4efd-a955-1e4649c6f3c6,Border view academy pri sch,NORTH EASTERN,MANDERA,MANDERA,3.942400,41.867270,BORDER VIEW ACADEMY PRI SCH,NORTH EASTERN,MANDERA,MANDERA,3.942400,41.867270,402.275520
41111108-34ef-42ae-84df-e21fe2338eff,Buruburu pri sch,NORTH EASTERN,MANDERA,MANDERA,3.947800,41.868850,BURUBURU PRI SCH,NORTH EASTERN,MANDERA,MANDERA,3.947800,41.868850,402.566618


In [66]:
changed_name = joined['Name of School'] != joined['name']
changed_name

uuid
801dbf2c-c795-4e4d-8e1f-726fbf583c20    True
0c9a96ac-3c87-4961-b1ee-0f930b4f1a0a    True
302367ed-7a50-4069-aa91-7a6a70a779e3    True
51baf53c-1aa1-4105-914f-a0cfca56aefb    True
7d46aabd-4b84-4dc5-85d1-2b4b3aac49ad    True
                                        ... 
c84f54bd-6a9a-4e9a-8cf2-c5dd4b372f01    True
b65c396f-2777-4ccc-880f-ecc6d699fb57    True
e81a8321-55e8-4efd-a955-1e4649c6f3c6    True
41111108-34ef-42ae-84df-e21fe2338eff    True
2c3dfd34-7471-437f-9276-ebdaf6ae324b    True
Length: 32485, dtype: bool

In [73]:
changed_admin4 = joined['admin4'] != joined['County']
changed_admin4
joined[changed_admin4]

Unnamed: 0_level_0,name,admin2,admin3,admin4,lat,lon,Name of School,Province,District,County,Latitude,Longitude,Distance(Km)
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
302367ed-7a50-4069-aa91-7a6a70a779e3,St edwards high sch,NAIROBI,STAREHE,,-1.272549,36.822084,ST EDWARDS HIGH SCH,NAIROBI,STAREHE,,-1.272549,36.822084,0.000158
b654ffaa-4d9d-4270-b06d-136068d5c233,Mlolongo sec sch,EASTERN,MACHAKOS,,-1.388401,36.935328,MLOLONGO SEC SCH,EASTERN,MACHAKOS,,-1.388401,36.935328,0.000218
a154c134-c745-4300-87c2-af082062e3d8,Babadogo sec sch,NAIROBI,KASARANI,,-1.240739,36.885685,BABADOGO SEC SCH,NAIROBI,KASARANI,,-1.240739,36.885685,0.000306
8ec98299-b6d9-4584-a0bd-5e9529bdc010,Naromoru techn sec,CENTRAL,NYERI,,-0.163380,37.021336,NAROMORU TECHN SEC,CENTRAL,NYERI,,-0.163380,37.021336,0.000459
f3a3f5e4-36ad-4b94-9156-807bc20520cf,Namanga girls high sch,RIFT VALLEY,KAJIADO,,-2.526830,36.825240,NAMANGA GIRLS HIGH SCH,RIFT VALLEY,KAJIADO,,-2.526830,36.825240,0.000676
...,...,...,...,...,...,...,...,...,...,...,...,...,...
c1f7b92c-00d8-454b-a84b-b3041ea30767,Mandera intergrated academy,NORTH EASTERN,MANDERA,,3.924680,41.826960,MANDERA INTERGRATED ACADEMY,NORTH EASTERN,MANDERA,,3.924680,41.826960,397.489907
9703cef8-e3f1-4cf6-aae0-4da96a0ecbfc,Border point sec sch,NORTH EASTERN,MANDERA,,3.940660,41.838250,BORDER POINT SEC SCH,NORTH EASTERN,MANDERA,,3.940660,41.838250,399.070894
5a5fc442-453d-45ff-895e-3bfaab28222f,Moi girls' sec sch,NORTH EASTERN,MANDERA,,3.932490,41.841540,MOI GIRLS' SEC SCH,NORTH EASTERN,MANDERA,,3.932490,41.841540,399.250995
b27c93ce-3be1-41ed-b092-ba2476edc482,Mandera sec sch,NORTH EASTERN,MANDERA,,3.940590,41.842970,MANDERA SEC SCH,NORTH EASTERN,MANDERA,,3.940590,41.842970,399.584313


In [74]:
changed_admin3 = joined['admin3'] != joined['District']
changed_admin3
joined[changed_admin3]

Unnamed: 0_level_0,name,admin2,admin3,admin4,lat,lon,Name of School,Province,District,County,Latitude,Longitude,Distance(Km)
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1


In [76]:
changed_admin2 = joined['admin2'] != joined['Province']
changed_admin2
joined[changed_admin2]

Unnamed: 0_level_0,name,admin2,admin3,admin4,lat,lon,Name of School,Province,District,County,Latitude,Longitude,Distance(Km)
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1


In [77]:
## -> some fields were changed in addition to the lon, lat. checking with @rub21 about which fields to merge in from the *_fixed files.