## This notebook extracts data from regions.yaml and writes it to a CSV file (regions_new.csv)
Make sure regions.yaml is in the current working directory before running the cells below.

imports

In [1]:
import pandas as pd
import numpy as np
import yaml

Define a recursive function that extracts the data from the nested dictionary tree.

In [2]:
def extract_data(region_dict, names=None, kind=None,
                 population=None, gleam_id=None,
                 iana=None, iso_alpha_3=None,
                 country_code=None, iso_alpha_3_track=None,
                 lat=None, lon=None, other_names=None):
    '''
    Recurse through the nested region dictionaries and flatten them
    into parallel lists, one entry per region in depth-first order.

    Parameters
    ----------
    region_dict : dict
        A region node. Must contain 'names' (non-empty sequence) and
        'kind'. May contain 'iso_alpha_3', 'gleam_id', 'iana',
        'population', 'lat', 'lon' and 'subregions' (a sequence of
        further region nodes).
    names, kind, population, gleam_id, iana, iso_alpha_3,
    country_code, lat, lon, other_names : list, optional
        Accumulators that are appended to in place. When omitted, a
        fresh empty list is created for this call.
    iso_alpha_3_track : optional
        ISO alpha-3 code inherited from the closest enclosing region of
        kind 'country'; used for regions that carry no code themselves.

    Returns
    -------
    tuple of lists
        (names, kind, population, gleam_id, iana, iso_alpha_3,
        country_code, lat, lon, other_names). Missing optional fields
        are recorded as None. country_code is passed through untouched:
        this function never appends to it.
    '''
    # Create fresh accumulators per top-level call. Mutable default
    # arguments would be shared between calls, so re-running the
    # extraction would append every row a second time.
    names = [] if names is None else names
    kind = [] if kind is None else kind
    population = [] if population is None else population
    gleam_id = [] if gleam_id is None else gleam_id
    iana = [] if iana is None else iana
    iso_alpha_3 = [] if iso_alpha_3 is None else iso_alpha_3
    country_code = [] if country_code is None else country_code
    lat = [] if lat is None else lat
    lon = [] if lon is None else lon
    other_names = [] if other_names is None else other_names

    # extract name: the first entry is the primary name
    all_names = region_dict['names']
    names.append(all_names[0])

    # extract other names: remaining entries joined into one string
    aliases = all_names[1:]
    other_names.append(', '.join(aliases) if aliases else None)

    # extract kind aka level
    kind.append(region_dict['kind'])

    # if kind is country, keep this iso_alpha_3 code until next country
    # (a country without a code resets the inherited value to None)
    if region_dict['kind'] == 'country':
        iso_alpha_3_track = region_dict.get('iso_alpha_3')

    # extract iso_alpha_3 code, falling back to the enclosing country's
    iso_alpha_3.append(region_dict.get('iso_alpha_3', iso_alpha_3_track))

    # extract the optional fields; absent keys are recorded as None
    gleam_id.append(region_dict.get('gleam_id'))
    iana.append(region_dict.get('iana'))
    population.append(region_dict.get('population'))
    lat.append(region_dict.get('lat'))
    lon.append(region_dict.get('lon'))

    # extract subregions; `or ()` also covers an empty `subregions:` key
    for subregion in region_dict.get('subregions') or ():
        # use recursion to go deeper into tree
        extract_data(subregion, names=names, kind=kind,
                     population=population, gleam_id=gleam_id,
                     iana=iana, iso_alpha_3=iso_alpha_3,
                     country_code=country_code,
                     iso_alpha_3_track=iso_alpha_3_track,
                     lat=lat, lon=lon, other_names=other_names)

    return (names, kind, population,
            gleam_id, iana, iso_alpha_3,
            country_code, lat, lon, other_names)

Open the file and extract the data.

In [3]:
# open regions.yaml; decode it explicitly as UTF-8 so that region names
# with non-ASCII characters do not depend on the platform's default encoding
with open('regions.yaml', encoding='utf-8') as file:
    # safe_load only builds plain Python objects (dicts, lists, scalars)
    regions = yaml.safe_load(file)

# extract the data from the dictionary tree into parallel lists,
# one entry per region (country_code comes back empty: extract_data
# never fills it)
(names,
 kind,
 population,
 gleam_id,
 iana,
 iso_alpha_3,
 country_code,
 lat,
 lon,
 other_names) = extract_data(regions)

Build the table, sort it, and write it to a CSV file.

In [4]:
# placeholder column of empty strings, one per extracted region, used for
# the table columns that this notebook does not populate
blank = [''] * len(names)

# map each output column (in final column order) to its data
table_columns = {
    'Code': iana,
    'Level': kind,
    'Name': names,
    'OfficialName': blank,
    'OtherNames': other_names,
    'CountryCode': blank,
    'CountryCodeISOa3': iso_alpha_3,
    'M49Code': blank,
    'Lat': lat,
    'Lon': lon,
    'Population': population,
    'GleamID': gleam_id,
}

# assemble the dataframe, then order rows by level and, within a level, by name
df = pd.DataFrame(data=table_columns)
df = df.sort_values(by=['Level', 'Name'])
print(df)

# write the sorted table to csv without the dataframe index
df.to_csv('regions_new.csv', index=False)

      Code  Level        Name OfficialName OtherNames CountryCode  \
1672   AAL   city     Aalborg                    None               
1673   AAR   city      Aarhus                    None               
600    ABD   city      Abadan                    None               
1465   ABA   city      Abakan                    None               
3092   YXX   city  Abbotsford                    None               
...    ...    ...         ...          ...        ...         ...   
1186  None  state    xinjiang                    None               
3269  None  state       yukon                    None               
1133  None  state      yunnan                    None               
1146  None  state    zhejiang                    None               
0     None  world       world                   earth               

     CountryCodeISOa3 M49Code     Lat      Lon    Population  GleamID  
1672              DNK          57.093    9.849  1.220000e+05   1565.0  
1673              DNK      