# prepare folders; download csv and tif files; build vrt file

In [4]:
import os
import sys
import wget
import tqdm
import glob
from osgeo import gdal
import subprocess
import pandas as pd
from hdx.utilities.easy_logging import setup_logging
from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset
from hdx.data.organization import Organization
from zipfile import ZipFile
import argparse 

In [2]:
#base info
#top_folder='/home/dohyungkim/population'
#ISO='SGP'
#year='2015'

In [20]:
# Announce which stage of the population pipeline is starting.
rule = '---------------------------------------------'
for banner_line in (rule, 'Population data process 1/7: data preparation', rule):
    print(banner_line)

---------------------------------------------
Population data process 1/7: data preparation
---------------------------------------------


In [25]:
# Parse the three required positional command-line arguments:
#   top_folder - existing working folder under which the per-country tree lives
#   ISO        - 3 character country iso code
#   year       - population year (kept as a string; it is spliced into file names)
my_parser = argparse.ArgumentParser(description='initial input')
my_parser.add_argument('top_folder',metavar='top_folder',type=str,help='working folder')
my_parser.add_argument('ISO',metavar='ISO',type=str,help='3 character country iso code')
my_parser.add_argument('year',metavar='year',type=str,help='population year')
args = my_parser.parse_args()
top_folder = args.top_folder
ISO = args.ISO
year = args.year
if not os.path.isdir(top_folder):
    # Passing the message to sys.exit writes it to stderr and terminates with
    # a non-zero status, so a calling pipeline can tell that this step failed.
    sys.exit('The path specified does not exist')
#print('\n'.join(os.listdir(top_folder)))

In [29]:
#download file list
# Base names (without the ISO prefix and extension) of the single-file inputs.
pop_file = 'population_2000_2020'      #population csv file
area_file = 'px_area_100m'
sub_admin = 'subnational_admin_2000_2020'

# Each covariate is paired with the remote sub-directory that holds its tif.
# Keeping the pairs together guarantees covar_list and tif_dirs stay aligned.
esa_annual_dir = 'ESA_CCI_Annual/' + year
covariate_sources = [
    ('srtm_topo_100m', 'Topo'),
    ('srtm_slope_100m', 'Slope'),
    ('viirs_100m_' + year, 'VIIRS'),
    ('osm_dst_roadintersec_100m_2016', 'OSM/DST'),
    ('osm_dst_road_100m_2016', 'OSM/DST'),
    ('osm_dst_waterway_100m_2016', 'OSM/DST'),
]
# Year-specific ESA CCI distance layers, one per numeric suffix.
covariate_sources += [
    ('esaccilc_dst' + code + '_100m_' + year, esa_annual_dir)
    for code in ('011', '040', '130', '140', '150', '160', '190', '200')
]
covariate_sources += [
    ('esaccilc_dst_water_100m_2000_2012', 'ESA_CCI_Water/DST'),
    ('dst_coastline_100m_2000_2020', 'Coastline/DST'),
]

# Split the pairs back into the two parallel tuples used by the rest of the script.
covar_list, tif_dirs = zip(*covariate_sources)

In [30]:
#setting up paths for files
# Folder layout created under <top_folder>/<ISO>:
#   fb_data/            Facebook population inputs
#     fb_tiles/
#     out_tiles/
#   wp_data/            WorldPop inputs
#     csv/
#     tif/
#       vrt_tiles/
#     shp/
#     prd_files/
iso_path=os.path.join(top_folder,ISO)

fb_path=os.path.join(iso_path,'fb_data')
wp_path=os.path.join(iso_path,'wp_data')

csv_path=os.path.join(wp_path,'csv')
tif_path=os.path.join(wp_path,'tif')
shp_path=os.path.join(wp_path,'shp')
vrt_path=os.path.join(tif_path,'vrt_tiles')
prd_path=os.path.join(wp_path,'prd_files')

fb_tile_path=os.path.join(fb_path,'fb_tiles')
out_tile_path=os.path.join(fb_path,"out_tiles")

# exist_ok=True makes the creation idempotent without a separate
# exists-then-mkdir check, which is racy and silently accepts a plain file
# sitting where a directory is expected.
for folder in (iso_path,
               fb_path,
               wp_path,
               csv_path,
               tif_path,
               shp_path,
               vrt_path,
               prd_path,
               fb_tile_path,
               out_tile_path):
    os.makedirs(folder, exist_ok=True)


In [22]:
# One-row summary table for this country/year run. Only the country code and
# year are known at this point; every other column starts as an empty string.
info_columns = ["country_code",
                "year",
                "rf_accuracy",
                "n_component",
                "regression_accuracy",
                "total_predicted_pop",
                "total_census_pop"]

data = {column: [""] for column in info_columns}
data["country_code"] = [ISO]
data["year"] = [year]

df_info = pd.DataFrame(data, columns=info_columns)

In [23]:
# Persist the run summary as a pickle inside the country folder.
info_pickle_path = os.path.join(top_folder, ISO, "df_info.pkl")
df_info.to_pickle(info_pickle_path)

In [31]:
# Root of the WorldPop FTP tree (no trailing slash).
wp_ftp='ftp://ftp.worldpop.org.uk/GIS'

In [32]:
# Progress message: the download phase is starting.
print('downloading input files')

downloading input files


In [33]:
#download census table
# The URL is joined with '/' explicitly: os.path.join uses the local OS
# separator and is only meant for filesystem paths.
census_name = ISO+'_'+pop_file+'.csv'
link = '/'.join((wp_ftp,'Population/Global_2000_2020/CensusTables',census_name))
# Skip the download when the file is already present from a previous run.
if not os.path.exists(os.path.join(wp_path,census_name)):
    wget.download(link,wp_path)

In [34]:
#download subnational admin grid
# The URL is joined with '/' explicitly: os.path.join uses the local OS
# separator and is only meant for filesystem paths.
admin_tif_name = ISO+'_'+sub_admin+'.tif'
link = '/'.join((wp_ftp,'Mastergrid/Global_2000_2020',ISO,'Subnational',admin_tif_name))
# Skip the download when the file is already present from a previous run.
if not os.path.exists(os.path.join(tif_path,admin_tif_name)):
    wget.download(link,tif_path)

In [35]:
#download subnational shp file
# A shapefile is a set of sidecar files sharing one base name. The local check
# must use the same base name as the remote file ('_subnational_'), otherwise
# it never matches and everything is fetched again on every run.
shp_base = ISO+'_subnational_2000_2020'
# The URL is joined with '/' explicitly: os.path.join uses the local OS
# separator and is only meant for filesystem paths.
link = '/'.join((wp_ftp,'Mastergrid/Global_2000_2020',ISO,'Subnational/Shapefile',shp_base))
# Each sidecar is checked on its own so an interrupted run only re-fetches
# the pieces that are actually missing.
for shp_ext in ('.cpg','.dbf','.prj','.shp','.shp.xml','.shx'):
    if not os.path.exists(os.path.join(shp_path,shp_base+shp_ext)):
        wget.download(link+shp_ext,shp_path)

In [36]:
#download covar csv files
# One zonal-statistics (mean) table per covariate; files already present in
# csv_path are not downloaded again.
for covar_name in covar_list:
    covar_csv = ISO+'_'+covar_name+'_ZS_mean.csv'
    # The URL is joined with '/' explicitly: os.path.join uses the local OS
    # separator and is only meant for filesystem paths.
    link = '/'.join((wp_ftp,'ZonalStatistics/Global_2000_2020',ISO,'mean',covar_csv))
    if not os.path.exists(os.path.join(csv_path,covar_csv)):
        wget.download(link,csv_path)

In [37]:
#download area table
# The URL is joined with '/' explicitly: os.path.join uses the local OS
# separator and is only meant for filesystem paths.
area_csv = ISO+'_'+area_file+'_ZS_sum.csv'
link = '/'.join((wp_ftp,'ZonalStatistics/Global_2000_2020',ISO,'sum',area_csv))
# Skip the download when the file is already present from a previous run.
if not os.path.exists(os.path.join(csv_path,area_csv)):
    wget.download(link,csv_path)

In [38]:
#download tif files
# Root of the WorldPop covariate rasters on the FTP server (no trailing slash).
tif_ftp='ftp://ftp.worldpop.org.uk/GIS/Covariates/Global_2000_2020'    

In [39]:
# Fetch the raster of every covariate. covar_list and tif_dirs are parallel
# sequences, so they are walked in lockstep.
for covar_name, remote_dir in zip(covar_list, tif_dirs):
    covar_tif = ISO+'_'+covar_name+'.tif'
    # The URL is joined with '/' explicitly: os.path.join uses the local OS
    # separator and is only meant for filesystem paths.
    link = '/'.join((tif_ftp,ISO,remote_dir,covar_tif))
    # Skip rasters that are already present from a previous run.
    if not os.path.exists(os.path.join(tif_path,covar_tif)):
        wget.download(link,tif_path)

In [40]:
#build vrt file for random forest prediction later
# File names (relative to tif_path) of the covariate rasters, in covar_list order.
tif_list = [ISO + '_' + covar_name + '.tif' for covar_name in covar_list]

In [41]:
# Stack the covariate rasters into one VRT; separate=True is passed so each
# input file goes into its own band.
vrt_options = gdal.BuildVRTOptions(separate=True)

# tif_list holds bare file names, so the VRT is built from inside tif_path.
# The previous working directory is restored afterwards; otherwise every
# later use of a relative top_folder would resolve against tif_path.
previous_cwd = os.getcwd()
os.chdir(tif_path)
try:
    vrt=gdal.BuildVRT(ISO+'_covar.vrt', tif_list, options=vrt_options)
    if vrt is None:
        # Without gdal.UseExceptions() a failed build is reported as None.
        raise RuntimeError('gdal.BuildVRT failed for '+ISO+'_covar.vrt')
    # Write the VRT description to disk.
    vrt.FlushCache()
finally:
    os.chdir(previous_cwd)

In [42]:
#download facebook population file
#!pip install hdx-python-api

In [26]:
# Set up HDX logging, then create the HDX configuration against the 'prod'
# site in read-only mode with a user agent tagged by country code.
setup_logging()
#if Configuration.values == None:
hdx_user_agent = 'population_data' + ISO
Configuration.create(
    hdx_site='prod',
    user_agent=hdx_user_agent,
    hdx_read_only=True,
)

In [44]:
# Load the HDX organization registered under the identifier 'facebook'.
# NOTE(review): read_from_hdx presumably returns None when the organization is
# not found - confirm against the HDX API before relying on `orgs`.
orgs=Organization.read_from_hdx('facebook')

In [45]:
# Ask the organization for its datasets, passing the ISO code as the argument
# (presumably a search query - confirm against the HDX API), then gather the
# resources of everything that was returned.
dataset=orgs.get_datasets(ISO)
resources=Dataset.get_all_resources(dataset)

In [46]:
# Download every resource whose name looks like
# 'population_<iso>...geotiff.zip' into fb_path. `path` keeps the location of
# the last archive downloaded.
wanted_prefix = 'population_' + ISO.lower()
for resource in resources:
    resource_name = resource['name']
    if not resource_name.startswith(wanted_prefix):
        continue
    if not resource_name.endswith('geotiff.zip'):
        continue
    url, path = resource.download(folder=fb_path)
    print('Resource URL %s downloaded to %s' % (url, path))
        

Resource URL http://data.humdata.org/dataset/a9d030f7-c0be-4ecc-b0f1-9c9b30752095/resource/156e8c21-7132-4038-8464-8e903bd9fc07/download/population_sgp_2018-10-01_geotiff.zip downloaded to /home/dohyungkim/population/SGP/fb_data/population_sgp_2018-10-01_geotiff.zip1.zipped geotiff


In [47]:
# Unpack the downloaded archive (`path`) into fb_path. The context manager
# closes the archive even when extraction raises.
with ZipFile(path, 'r') as zf:
    zf.extractall(fb_path)

In [48]:
# Progress messages: downloads are finished, tiling starts next.
print('downloading input files done!')
print('making tiles from tif images')

downloading input files done!
making tiles from tif images


In [49]:
# Cut the stacked covariate VRT into tiles written to vrt_path.
# The command is passed as an argument list (no shell), so folder names
# containing spaces or shell metacharacters reach gdal_retile.py intact.
cmd = ['gdal_retile.py',
       '-targetDir', vrt_path,
       os.path.join(tif_path,ISO+'_covar.vrt')]
proc = subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
stdout,stderr=proc.communicate()
# The output is captured, so a failure would otherwise pass unnoticed; raise
# it together with the captured streams instead.
if proc.returncode != 0:
    raise subprocess.CalledProcessError(proc.returncode,cmd,output=stdout,stderr=stderr)

In [50]:
# Pick the Facebook population raster extracted into fb_path.
fb_candidates = glob.glob(os.path.join(fb_path,'*.tif'))
if not fb_candidates:
    # Fail with a clear message instead of an IndexError on an empty list.
    raise FileNotFoundError('no .tif file found in '+fb_path)
fb_file=fb_candidates[0]

# Cut it into 1024x1024 float32 tiles written to fb_tile_path.
# The command is passed as an argument list (no shell), so folder names
# containing spaces or shell metacharacters reach gdal_retile.py intact.
cmd = ['gdal_retile.py',
       '-ot', 'float32',
       '-ps', '1024', '1024',
       '-targetDir', fb_tile_path,
       fb_file]
proc = subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
stdout,stderr=proc.communicate()
# The output is captured, so a failure would otherwise pass unnoticed; raise
# it together with the captured streams instead.
if proc.returncode != 0:
    raise subprocess.CalledProcessError(proc.returncode,cmd,output=stdout,stderr=stderr)