# Populations

In [272]:
import pandas as pd
import numpy as np

from gen_regions.regions import Regions

# Directory that holds the raw population datasets.
DATA_FOLDER = "../data/population"
# Regions file consumed (and rewritten) by this notebook.
REGIONS_PATH = "../data/regions.csv"

# City-level datasets.
CITIES_15k_PATH = f"{DATA_FOLDER}/cities15000.csv"
WORLDCITIES_PATH = f"{DATA_FOLDER}/worldcities.csv"
INDIA500_PATH = f"{DATA_FOLDER}/india_top500_cities_r2.csv"

# State- and country-level datasets.
STATES_INCL_CHINA_PATH = f"{DATA_FOLDER}/population_incl_china_states.csv"
US_STATE_PATH = f"{DATA_FOLDER}/us_state_populations.csv"
WORLD_PATH = f"{DATA_FOLDER}/WorldPopulation.csv"

# UNSD city population exports.
UNSD_FM_PATH = f"{DATA_FOLDER}/unsd-citypopulation-year-fm.csv"
UNSD_BOTH_PATH = f"{DATA_FOLDER}/unsd-citypopulation-year-both.csv"


def format_df(data, do_print=False):
    """Reduce a frame to one population value per name.

    Args:
        data: DataFrame with a 'name' and a 'population' column.
        do_print: when true, print the shape before and after the
            duplicate names are collapsed.

    Returns:
        A Series named 'population', indexed by 'name', without missing
        values and with a single (maximum) value per name.
    """
    populations = data.set_index('name')['population'].dropna()
    if do_print:
        print(populations.shape)

    # Several places can share a name; keep the most populous one.
    largest_per_name = populations.groupby('name').max()
    if do_print:
        print(largest_per_name.shape)

    return largest_per_name


def load_regions(filepath):
    """Read the regions CSV at `filepath` into a fresh `Regions` object.

    Returns:
        The populated `Regions` instance.
    """
    loaded = Regions()
    with open(filepath, 'rt') as handle:
        loaded.read_csv(handle)
    return loaded


def load_regions_df(filepath, kind=None):
    """Load the regions file as a DataFrame of populations indexed by name.

    Args:
        filepath: path of the regions CSV.
        kind: when given, only regions whose `kind` attribute equals this
            value are kept; `None` keeps every region.

    Returns:
        DataFrame indexed by 'name' with a single 'population' column.
    """
    all_regions = load_regions(filepath).regions.values()

    rows = [
        (region.name, region.pop)
        for region in all_regions
        if kind is None or region.kind == kind
    ]

    frame = pd.DataFrame(rows, columns=["name", "population"])
    return frame.set_index("name")


def load_cities15000(filepath):
    """Load the cities15000 CSV and return populations keyed by city name.

    The file is read as ISO-8859-1 and the lower-cased 'asciiname' column
    is used as the name.

    NOTE: the 'alternatenames' column is not used for matching.

    Returns:
        The Series produced by `format_df`.
    """
    cities = pd.read_csv(filepath, encoding="ISO-8859-1")
    cities['name'] = cities['asciiname'].apply(str.lower)
    return format_df(cities)

def load_worldcities(filepath):
    """Load the worldcities CSV and return populations keyed by city name.

    The lower-cased 'city_ascii' column is used as the name.

    Returns:
        The Series produced by `format_df`.
    """
    cities = pd.read_csv(filepath)
    cities['name'] = cities['city_ascii'].apply(str.lower)
    return format_df(cities)

def load_india500(filepath):
    """Load the India top-500 cities CSV and return populations by city name.

    The lower-cased 'name_of_city' column is used as the name and
    'population_total' as the population.

    Args:
        filepath: path of the CSV file to read.

    Returns:
        The Series produced by `format_df`.
    """
    # Read the file the caller asked for; the previous version ignored
    # `filepath` and always read the module-level INDIA500_PATH.
    data = pd.read_csv(filepath)
    data['name'] = data['name_of_city'].apply(str.lower)
    data['population'] = data['population_total']

    return format_df(data)
    
def load_states_incl_china(filepath):
    """Load the province/state population CSV keyed by lower-cased name.

    'Province/State' (lower-cased) becomes the name and 'Population' the
    population. Rows whose name is "unknow" are discarded.

    Returns:
        The Series produced by `format_df`.
    """
    states = pd.read_csv(filepath)

    states["name"] = states["Province/State"].apply(str.lower)
    states["population"] = states["Population"]

    # Drop the placeholder rows before collapsing to one value per name.
    is_named = states["name"] != "unknow"
    return format_df(states[is_named])

def load_us_state(filepath):
    """Load the US state population CSV keyed by lower-cased state name.

    'State' (lower-cased) becomes the name and '2018 Population' the
    population.

    Returns:
        The Series produced by `format_df`.
    """
    states = pd.read_csv(filepath)
    states["name"] = states["State"].apply(str.lower)
    states["population"] = states["2018 Population"]
    return format_df(states)

def load_world(filepath):
    """Load the world population CSV keyed by lower-cased country name.

    'Country' (lower-cased) becomes the name and the '2016' column the
    population.

    Returns:
        The Series produced by `format_df`.
    """
    countries = pd.read_csv(filepath)
    countries["name"] = countries["Country"].apply(str.lower)
    countries["population"] = countries["2016"]
    return format_df(countries)

def load_unsd(filepath):
    """Load a UNSD city population CSV and keep the latest value per city.

    The last 165 rows of the file are skipped (presumably trailing
    footnotes -- confirm against the file). City names are transliterated
    with `unidecode` and lower-cased; for each name only the row with the
    highest 'Year' is kept, and its 'Value' is the population.

    Returns:
        The Series produced by `format_df`.
    """
    from unidecode import unidecode

    records = pd.read_csv(filepath).iloc[:-165]

    records["name"] = records["City"].apply(lambda x: unidecode(x).lower())
    records["population"] = records["Value"]

    # After sorting by year, the last row of each name is the newest one.
    newest = (
        records
        .sort_values(by=["name", "Year"])
        .drop_duplicates(keep='last', subset=["name"])
    )

    return format_df(newest)

# load Regions dataset

# load all population datasets
# do some custom preprocessing such that they can be easily merged
# merge one by one and see how the number of missing cities is decreasing
# write to regions.csv

In [275]:
# use this template to quickly create new loader functions for specific files
def template(filepath):
    """Skeleton for a new dataset loader; copy, rename and adapt it.

    A concrete loader must make sure the frame has a 'name' and a
    'population' column before it is handed to `format_df`.

    Args:
        filepath: path of the CSV file to read.

    Returns:
        The Series produced by `format_df`.
    """
    data = pd.read_csv(filepath)

    # Set data["name"] and data["population"] here for the specific file.

    # Call format_df exactly once: it returns a Series, which cannot be
    # passed through format_df a second time. Drop do_print=True once the
    # new loader works.
    data = format_df(data, do_print=True)

    return data

In [121]:
def get_intersection(regions, dataset):
    """Return the names missing a population that `dataset` can supply.

    Args:
        regions: DataFrame with a 'population' column, indexed by name.
        dataset: pandas object whose index holds the names it covers.

    Returns:
        Set of index labels that have no population in `regions` and are
        present in the index of `dataset`. Its size is printed as well.
    """
    without_population = regions.loc[regions['population'].isna()].index
    common = set(without_population) & set(dataset.index)
    print(len(common))
    return common

In [189]:
def merge_dataset(regions, dataset):
    """Fill missing populations in `regions` from `dataset`.

    Existing values in `regions` win; `dataset` only supplies values for
    names that have none yet. Row counts before and after are printed.

    Args:
        regions: DataFrame indexed by name with a 'population' column;
            the column is overwritten in place.
        dataset: Series of populations indexed by name.

    Returns:
        The same `regions` object that was passed in.
    """
    print("rows total:\t\t", len(regions))
    print("rows with pop before:\t", len(regions.dropna()))

    filled = regions['population'].combine_first(dataset)
    # Assigning back aligns on the index of `regions`, so names that only
    # exist in `dataset` are not added.
    regions['population'] = filled

    print("rows with pop after:\t", len(regions.dropna()))
    return regions

In [278]:
def update_regions(filepath, df):
    """Write the populations in `df` into the regions file at `filepath`.

    The regions file is loaded, the `pop` attribute of every region named
    in the index of `df` is set to the matching population (as a float),
    and the file is written back to the same path. Rows of `df` with
    missing values are skipped.

    Args:
        filepath: path of the regions CSV; it is overwritten.
        df: DataFrame indexed by region key with a 'population' column.
    """
    regions = load_regions(filepath)

    known = df.dropna()
    for reg, population in known['population'].items():
        regions.regions[reg].pop = float(population)

    with open(filepath, 'w') as out_file:
        regions.write_csv(out_file)

## Update state populations

In [223]:
# Work on the regions of kind "state" only, as a name-indexed frame
# with a single 'population' column.
regions = load_regions_df(REGIONS_PATH, kind="state")

In [224]:
# Fill state populations from the province/state dataset.
dataset = load_states_incl_china(STATES_INCL_CHINA_PATH)
regions = merge_dataset(regions, dataset)

rows total:		 98
rows with pop before:	 0
rows with pop after:	 40


In [225]:
# Fill the still-missing states from the US state dataset
# (values already present are kept).
dataset = load_us_state(US_STATE_PATH)
regions = merge_dataset(regions, dataset)

rows total:		 98
rows with pop before:	 40
rows with pop after:	 86


In [226]:
# Try the country-level dataset as well; in the recorded run it added
# no further states.
dataset = load_world(WORLD_PATH)
regions = merge_dataset(regions, dataset)

rows total:		 98
rows with pop before:	 86
rows with pop after:	 86


#### Save results

In [232]:
# Write the merged state populations back to the regions file
# (rows still lacking a population are skipped).
update_regions(REGIONS_PATH, regions)

## Update city populations

In [233]:
# Reload from disk and work on the regions of kind "city" only.
regions = load_regions_df(REGIONS_PATH, kind="city")

In [234]:
# Fill city populations from the worldcities dataset.
dataset = load_worldcities(WORLDCITIES_PATH)
regions = merge_dataset(regions, dataset)

rows total:		 3225
rows with pop before:	 1918
rows with pop after:	 1979


In [235]:
# Fill the still-missing cities from the cities15000 dataset.
dataset = load_cities15000(CITIES_15k_PATH)
regions = merge_dataset(regions, dataset)

rows total:		 3225
rows with pop before:	 1979
rows with pop after:	 2240


In [236]:
# Try the India top-500 cities dataset; in the recorded run it added
# no further cities.
dataset = load_india500(INDIA500_PATH)
regions = merge_dataset(regions, dataset)

rows total:		 3225
rows with pop before:	 2240
rows with pop after:	 2240


In [276]:
# Fill the still-missing cities from the UNSD "both" export
# (UNSD_FM_PATH is not used in this cell).
dataset = load_unsd(UNSD_BOTH_PATH)
regions = merge_dataset(regions, dataset)

rows total:		 3225
rows with pop before:	 2240
rows with pop after:	 2279


#### Save results

In [279]:
# Write the merged city populations back to the regions file
# (rows still lacking a population are skipped).
update_regions(REGIONS_PATH, regions)

## Update country populations

In [296]:
# Reload from disk and work on the regions of kind "country" only.
regions = load_regions_df(REGIONS_PATH, kind="country")

In [297]:
# The province/state dataset is merged into the country regions too;
# in the recorded run it supplied one additional population.
dataset = load_states_incl_china(STATES_INCL_CHINA_PATH)
regions = merge_dataset(regions, dataset)

rows total:		 230
rows with pop before:	 16
rows with pop after:	 17


In [298]:
# Fill the still-missing countries from the world population dataset.
dataset = load_world(WORLD_PATH)
regions = merge_dataset(regions, dataset)

rows total:		 230
rows with pop before:	 17
rows with pop after:	 190


#### Save results

In [299]:
# Write the merged country populations back to the regions file
# (rows still lacking a population are skipped).
update_regions(REGIONS_PATH, regions)