In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point, Polygon
import copy
import warnings
warnings.filterwarnings('ignore')

In [2]:
stores_train = pd.read_csv('data/stores_train.csv')

In [3]:
grunnkrets = pd.read_csv('data/grunnkrets_norway_stripped.csv')

In [4]:
grunnkrets_ages = pd.read_csv('data/grunnkrets_age_distribution.csv')

In [5]:
def get_population (stores_data, grunnkrets_data, grunnkrets_ages_data):
    """ Returns population of grunnkrets and district
        Manipulate or remove last line of code according to your taste :)
        May require some more work for imputation of 'grunnkrets_population' using data from 'stores_extra' using 'lat' and 'lon'
        Possibility of using unspervised learning?
        Can be used for training only.Need to see how to make it multipurpose for test data as well.
        Else Separate function is needed.
    """
    #Get data for every grunnkrets_id and and drop duplicates. Prioritize the year '2016'
    grunnkrets = grunnkrets_data.sort_values('year', ascending=False).drop_duplicates('grunnkrets_id').sort_index()
    
    #Create District+Municipality
    grunnkrets_merged_district_municipality_name = grunnkrets
    grunnkrets_merged_district_municipality_name['district_name_pro'] = grunnkrets['district_name'] + ' '+ grunnkrets['municipality_name']
    
    #Drop columns except 'grunnkrets_id' and 'district_municipality_name'
    grunnkrets_with_district_municipality_names = grunnkrets_merged_district_municipality_name
    grunnkrets_with_district_municipality_names = grunnkrets_with_district_municipality_names.drop(grunnkrets_with_district_municipality_names.iloc[:,1:5].columns,axis =1)
    
    grunnkrets_with_district_municipality_names = grunnkrets_with_district_municipality_names.drop('area_km2', axis =1)
    
    #Get data for every grunnkrets_id and and drop duplicates. Prioritize the year '2016'
    grunnkrets_ages_new = grunnkrets_ages_data.sort_values('year', ascending=False).drop_duplicates('grunnkrets_id').sort_index()
    
    #Sum all ages in grunnkrets
    grunnkrets_ages_new['population'] = grunnkrets_ages_new.iloc[:,2:].sum(axis =1)
    
    #Clean: Drop all age columns including year column
    grunnkrets_population = grunnkrets_ages_new
    grunnkrets_population = grunnkrets_population.drop(grunnkrets_population.iloc[:,1:93].columns,axis =1)
    
    grunnkrets_population_dist_muni = grunnkrets_population
    grunnkrets_population_dist_muni = pd.merge(grunnkrets_with_district_municipality_names,grunnkrets_population, how = 'left', on = 'grunnkrets_id')
    
    grunnkrets_dist_muni = grunnkrets_population_dist_muni
    grunnkrets_dist_muni = grunnkrets_dist_muni.drop(['population'], axis = 1)
    
    #Merge only grunnkrets population and find missing population before merging
    merge_grunnkrets_populn_stores_train = pd.merge(stores_data,grunnkrets_dist_muni, how = 'left', on = 'grunnkrets_id')
    
    #Add population of district
    st_train_grunn_pp = pd.merge(merge_grunnkrets_populn_stores_train,grunnkrets_population, how = 'left', on = 'grunnkrets_id')
    
    #Get Population of District
    population_dist_muni = st_train_grunn_pp
    population_dist_muni = population_dist_muni.groupby('district_name_pro')['population'].sum()
    
    #Merge
    st_train_grunn_pp_dist_pp = pd.merge(st_train_grunn_pp,population_dist_muni, how = 'left', on = 'district_name_pro')
    
    st_train_grunn_pp_dist_pp.rename(columns = {'population_x':'grunnkrets_population','population_y': 'district_population'}, inplace = True)
    
    st_train_grunn_pp_dist_pp['geometry'] = gpd.GeoSeries.from_wkt(st_train_grunn_pp_dist_pp['geometry'])

    store_gdf = gpd.GeoDataFrame(st_train_grunn_pp_dist_pp, geometry='geometry')
    store_gdf = store_gdf.drop_duplicates()
    
    for index, row in st_train_grunn_pp_dist_pp.iterrows():
        # print("row", row)
    
        if pd.isnull(row['grunnkrets_population']):
            lat = row['lat']
            lon = row['lon']
            
            store_location = Point(lon, lat)
        
            polygon_indices = store_gdf.distance(store_location).sort_values().index[0:150] #lower values returns missing data for grunnkrets_population
            #cannot guarantee accuracy of imputed missing population
            nearest_grunnkretser = store_gdf.loc[polygon_indices]
            
            st_train_grunn_pp_dist_pp['grunnkrets_population'].loc[index] = np.floor(nearest_grunnkretser['grunnkrets_population'].mean())
        
    for index, row in st_train_grunn_pp_dist_pp.iterrows():        
        if pd.isnull(row['district_population']):
            lat = row['lat']
            lon = row['lon']
            
            store_location = Point(lon, lat)
        
            polygon_indices = store_gdf.distance(store_location).sort_values().index[0:4]
            nearest_grunnkretser = store_gdf.loc[polygon_indices]
            
            st_train_grunn_pp_dist_pp['district_population'].loc[index] = np.floor(nearest_grunnkretser['district_population'].mean())
    
    st_train_grunn_pp_dist_pp = st_train_grunn_pp_dist_pp.drop(st_train_grunn_pp_dist_pp.iloc[:,1:14].columns,axis =1)#Take off this if all colmuns are needed
    return st_train_grunn_pp_dist_pp

In [6]:
#Concactenate population with other generated data
population = get_population (stores_data = stores_train, grunnkrets_data = grunnkrets, grunnkrets_ages_data = grunnkrets_ages)
#population

In [7]:
population

Unnamed: 0,store_id,grunnkrets_population,district_population
0,983540538-974187930-44774,157.0,24980.0
1,987074191-973117734-44755,388.0,48863.0
2,984890265-981157303-64491,372.0,25904.0
3,914057442-992924179-126912,474.0,33261.0
4,913018583-913063538-668469,634.0,2323.0
...,...,...,...
12854,915789943-915806929-781991,1516.0,59309.0
12855,917921733-917982368-868081,503.0,18301.0
12856,911721961-911764474-496764,1117.0,71467.0
12857,914337046-914343372-721294,281.0,14218.0


In [8]:
population.isnull().sum()

store_id                 0
grunnkrets_population    0
district_population      0
dtype: int64