<a href="https://colab.research.google.com/github/fernandoGitHub/ML_Projects/blob/main/UNDP_Demographics_Data/UNDP_Demographics_Data_Analysis-Part_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **UNDP_Demographics_Data-Analysis-Part_01**

This notebook will exercise working with several csv files while focusing on cleaning, cleansing and complete missing values using dataframe functionality only

# Setup

In [1]:
!wget https://raw.githubusercontent.com/fernandoGitHub/MLOPS_GSD/main/MLOP_setup.py
import MLOP_setup

MLOP_setup.install_package('WGET')

--2022-05-22 13:29:23--  https://raw.githubusercontent.com/fernandoGitHub/MLOPS_GSD/main/MLOP_setup.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176 (2.1K) [text/plain]
Saving to: ‘MLOP_setup.py’


2022-05-22 13:29:23 (34.3 MB/s) - ‘MLOP_setup.py’ saved [2176/2176]

Installing wget ...
Package wget has been successfully installed
Reloading Packages


In [2]:
import os
import shutil
import wget

import numpy as np
import pandas as pd

import pprint
pp = pprint.PrettyPrinter()

In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Data Preparation

In [4]:
_RAW_DATA_DIR = './raw_data/'
_PROCESSED_DATA_DIR = './processed_data'
_ZIP_FILENAME = 'UNDP_Demographics_Data.zip'
_UTILS_FILENAME = 'UNDP_utils.py'

_DATA_REP_URL = 'https://github.com/fernandoGitHub/ML_Projects/raw/main/UNDP_Demographics_Data/data/'
_UTILS_REP_URL = 'https://github.com/fernandoGitHub/ML_Projects/raw/main/UNDP_Demographics_Data/'

_ZIP_FULL_PATH = os.path.join(_RAW_DATA_DIR, _ZIP_FILENAME)
_ZIP_URL_PATH = os.path.join(_DATA_REP_URL, _ZIP_FILENAME)

_UTILS_URL_PATH = os.path.join(_UTILS_REP_URL, _UTILS_FILENAME)
_UTILS_FULL_PATH = os.path.join('.', _UTILS_FILENAME)

if os.path.isdir('./sample_data'):
  shutil.rmtree('./sample_data')

if not os.path.isdir(_RAW_DATA_DIR):
  os.makedirs(_RAW_DATA_DIR)

if not os.path.isdir(_PROCESSED_DATA_DIR):
  os.makedirs(_PROCESSED_DATA_DIR)

if not os.path.isfile(_UTILS_FULL_PATH):
  file_name = wget.download(_UTILS_URL_PATH)
  print(f"Fetching from GitHub: {file_name} ...")

if not os.path.isfile(_ZIP_FULL_PATH):
  file_name = wget.download(_ZIP_URL_PATH)
  print(f"Fetching from GitHub: {file_name} ...")
  os.replace(os.path.join('.', _ZIP_FILENAME), _ZIP_FULL_PATH)

Fetching from GitHub: UNDP_utils.py ...
Fetching from GitHub: UNDP_Demographics_Data.zip ...


In [5]:
import UNDP_utils

In [6]:
from zipfile import ZipFile

with ZipFile(_ZIP_FULL_PATH, 'r') as zip:
  # printing all the contents of the zip file
  zip.printdir()

  # extracting all the files
  zip.extractall(_RAW_DATA_DIR)

File Name                                             Modified             Size
Median_age.csv                                 2022-05-13 15:11:54         9288
Old_age_dependency_ratio.csv                   2022-05-13 15:11:54        14159
Population _ages_65 _and _older.csv            2022-05-13 15:11:54        12651
Population_ages_15_64.csv                      2022-05-13 15:11:54        14059
Population_under_age_5.csv                     2022-05-13 15:11:54        12955
Sex_ratio_at_birth.csv                         2022-05-13 15:11:54         9570
Total_Population.csv                           2022-05-13 15:11:54        15523
Urban_Population.csv                           2022-05-13 15:11:54        16114
Young_age_dependency_ratio.csv                 2022-05-13 15:11:54        15756


# Dataset Preparation

First, let's review the columnns at each dataset

In [7]:
csv_file_list, csv_full_path_list = UNDP_utils.generate_file_list_from_dir(path=_RAW_DATA_DIR, filter = '.csv', display=False)

for file in csv_full_path_list:
  temp_df = pd.read_csv(file)
  pp.pprint (f"csv file: {file} - columns: {temp_df.columns}")

("csv file: ./raw_data/Urban_Population.csv - columns: Index(['HDI Rank', "
 "'Country', '1990', '1995', '2000', '2005', '2010', '2011',\n"
 "       '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019'],\n"
 "      dtype='object')")
("csv file: ./raw_data/Sex_ratio_at_birth.csv - columns: Index(['HDI Rank', "
 "'Country', '1990', '1995', '2000', '2005', '2010', '2015',\n"
 "       '2019'],\n"
 "      dtype='object')")
("csv file: ./raw_data/Total_Population.csv - columns: Index(['HDI Rank', "
 "'Country', '1990', '1995', '2000', '2005', '2010', '2011',\n"
 "       '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', "
 "'2030'],\n"
 "      dtype='object')")
("csv file: ./raw_data/Young_age_dependency_ratio.csv - columns: Index(['HDI "
 "Rank', 'Country', '1990', '1995', '2000', '2005', '2010', '2011',\n"
 "       '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019'],\n"
 "      dtype='object')")
("csv file: ./raw_data/Median_age.csv - columns: Index(['HDI

**Finding:** We can see some irregularities:
   1.- Median and Sex Ratio have less columns (every 5 years)
   2.- Total population has an estimation for 2030

**Next Tasks**
1.   Remove sex-ratio_at_birth from directory (not clear how it helps)
2.   Remove spaces from all CSV files, replace unknown by -1.0 and save a new copy
3.   Interpolate values for median dataframe and save to file
4.   Drop the 2030 column from total population as store it as a label dataframe

In [8]:
# Let's create some util functions

def generate_file_list_from_dir(path, filter = '*.*', display=False):
  """generate_list_from_dir(path, filter = '*.*', display=False) returns two lists of strings.
  The first includes the names of the csv files, the second the full paths. The filter parameter
  can be used to return only a given type of files.
  By default, the function doesn't filter any file"""

  file_list = os.listdir(path=path)

  if filter != '*.*':
    file_list = [file for file in file_list if filter in file]

  full_path_list = [os.path.join(path, file) for file in file_list]

  if display:
    print(file_list)

  return file_list, full_path_list


# Helper Function to remove spaces and replace '..' with -1
def clean_data(df):
  # We need to remove spaces from the dataframe prior to interpolation
  # And also to replace the unknown values (expressed as two consecutive points)
  # to an arbitrary value of -1
  # Finally we will convert the values to float
  for col in df.columns:
    df[col] = df[col].astype(str).str.strip().replace('..','-1')

  return df


# Helper function to set the columns type
def set_column_type(df, type, inclusive=None, exclusive=None):

  cols = df.columns
  if inclusive == None:
    cols = [col for col in cols if not(col in exclusive)]
  else:
    cols = [col for col in cols if (col in inclusive)]

  for col in cols:
    df[col] = df[col].astype(float)

  return df


# Helper function to replace unknown values by mean values for that year
def replace_unknown_by_mean(df):
  for col in df.columns:
    if df[col].dtype == 'float64':
      mean_value = np.round(df[df[col] != -1][col].mean(), 1)
      df[col] = df[col].replace(-1.0, mean_value)
  
  return df


# Helper function to create interpolated columns
def interpolate_columns(df, new_cols):
  
  def interpolate (y1, y2, x1, x2, x):
    return np.round(y1 + (y2-y1)/(x2-x1) * (x-x1), 1)

  for col in new_cols:
    if not (col in df.columns):
      # Adding a new column and creating the interpolation
      year = int(col)
      prev_year = year - year % 5
      next_year = prev_year + 5

      df[col] = np.vectorize(interpolate)(df[str(prev_year)], df[str(next_year)], prev_year, next_year, year)

  return df


def remove_empty_records(df, key):
  for col in df.columns:
    df = df[df[col] != key]

  return df

In [9]:
# 1. Removing sex-ratio-at-birth
os.remove('./raw_data/Sex_ratio_at_birth.csv')

csv_file_list, csv_full_path_list = UNDP_utils.generate_file_list_from_dir(path=_RAW_DATA_DIR, filter = '.csv', display=True)

['Urban_Population.csv', 'Total_Population.csv', 'Young_age_dependency_ratio.csv', 'Median_age.csv', 'Population_ages_15_64.csv', 'Old_age_dependency_ratio.csv', 'Population_under_age_5.csv', 'Population _ages_65 _and _older.csv']


In [10]:
# 2. Remove spaces from all CSV files, replace unknown by column mean values and save a new copy
for file in csv_file_list:
  df = pd.read_csv(os.path.join(_RAW_DATA_DIR, file))
  df = df.drop('HDI Rank', axis = 1)
  df = df.set_index('Country')
  df = clean_data(df)
  df = set_column_type(df, type='float',exclusive=['Country'])
  df = remove_empty_records(df=df, key= -1)

  df.to_csv(os.path.join(_PROCESSED_DATA_DIR, file), index=True)

Let's review the contents of the files again

In [11]:
csv_file_list, csv_full_path_list = UNDP_utils.generate_file_list_from_dir(path=_PROCESSED_DATA_DIR, filter = '.csv', display=False)

In [12]:
for file in csv_full_path_list:
  temp_df = pd.read_csv(file)
  temp_df = temp_df.set_index('Country')
  print(f'DataFrame: {file}')
  temp_df.head(5)
  print ('\n')
  temp_df.describe().transpose()
  print('\n')
  temp_df.info()
  print(100 * '-')
  print('\n\n')

DataFrame: ./processed_data/Urban_Population.csv


Unnamed: 0_level_0,1990,1995,2000,2005,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Afghanistan,21.2,21.6,22.1,22.7,23.7,23.9,24.2,24.4,24.6,24.8,25.0,25.2,25.5,25.8
Albania,36.4,38.9,41.7,46.7,52.2,53.2,54.3,55.4,56.4,57.4,58.4,59.4,60.3,61.2
Algeria,52.1,56.0,59.9,63.8,67.5,68.2,68.9,69.6,70.2,70.8,71.5,72.1,72.6,73.2
Andorra,94.7,93.7,92.4,90.3,88.8,88.7,88.6,88.5,88.4,88.3,88.2,88.2,88.1,88.0
Angola,37.1,44.2,50.1,56.0,59.8,60.5,61.3,62.0,62.7,63.4,64.1,64.8,65.5,66.2






Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1990,195.0,50.499487,24.034581,5.4,29.7,49.4,69.55,100.0
1995,195.0,51.951795,23.803676,7.2,32.0,51.7,71.75,100.0
2000,195.0,53.278974,23.605607,8.2,33.1,53.3,72.85,100.0
2005,195.0,54.845641,23.456233,9.4,35.25,55.6,73.5,100.0
2010,195.0,56.451795,23.439469,10.6,36.2,56.8,74.9,100.0
2011,195.0,56.767692,23.428246,10.9,36.8,56.9,75.4,100.0
2012,195.0,57.076923,23.416248,11.2,37.45,57.1,76.0,100.0
2013,195.0,57.378974,23.400388,11.5,38.05,57.3,76.75,100.0
2014,195.0,57.681026,23.389301,11.8,38.65,57.5,77.05,100.0
2015,195.0,57.992821,23.361005,12.1,39.35,57.7,77.2,100.0




<class 'pandas.core.frame.DataFrame'>
Index: 195 entries,  Afghanistan to  Zimbabwe
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1990    195 non-null    float64
 1   1995    195 non-null    float64
 2   2000    195 non-null    float64
 3   2005    195 non-null    float64
 4   2010    195 non-null    float64
 5   2011    195 non-null    float64
 6   2012    195 non-null    float64
 7   2013    195 non-null    float64
 8   2014    195 non-null    float64
 9   2015    195 non-null    float64
 10  2016    195 non-null    float64
 11  2017    195 non-null    float64
 12  2018    195 non-null    float64
 13  2019    195 non-null    float64
dtypes: float64(14)
memory usage: 22.9+ KB
----------------------------------------------------------------------------------------------------



DataFrame: ./processed_data/Total_Population.csv


Unnamed: 0_level_0,1990,1995,2000,2005,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2030
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Afghanistan,12.4,18.1,20.8,25.7,29.2,30.1,31.2,32.3,33.4,34.4,35.4,36.3,37.2,38.0,48.1
Albania,3.3,3.1,3.1,3.1,2.9,2.9,2.9,2.9,2.9,2.9,2.9,2.9,2.9,2.9,2.8
Algeria,25.8,28.8,31.0,33.1,36.0,36.7,37.4,38.1,38.9,39.7,40.6,41.4,42.2,43.1,50.4
Andorra,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
Angola,11.8,13.9,16.4,19.4,23.4,24.2,25.1,26.0,26.9,27.9,28.8,29.8,30.8,31.8,44.8






Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1990,195.0,27.17641,108.033428,0.0,1.25,5.3,16.3,1176.9
1995,195.0,29.307179,115.769484,0.0,1.35,5.6,18.05,1240.9
2000,195.0,31.349231,122.920234,0.0,1.35,6.2,19.15,1290.6
2005,195.0,33.388718,129.545941,0.0,1.45,6.8,20.95,1330.8
2010,195.0,35.508718,135.971594,0.0,1.85,7.4,23.45,1368.8
2011,195.0,35.945641,137.213638,0.0,1.9,7.7,24.2,1376.5
2012,195.0,36.378974,138.437462,0.0,1.95,7.9,24.85,1384.2
2013,195.0,36.812821,139.64194,0.0,2.0,8.1,25.35,1391.9
2014,195.0,37.249231,140.833481,0.0,2.0,8.2,26.05,1399.5
2015,195.0,37.682051,142.003158,0.0,2.05,8.3,26.75,1406.8




<class 'pandas.core.frame.DataFrame'>
Index: 195 entries,  Afghanistan to  Zimbabwe
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1990    195 non-null    float64
 1   1995    195 non-null    float64
 2   2000    195 non-null    float64
 3   2005    195 non-null    float64
 4   2010    195 non-null    float64
 5   2011    195 non-null    float64
 6   2012    195 non-null    float64
 7   2013    195 non-null    float64
 8   2014    195 non-null    float64
 9   2015    195 non-null    float64
 10  2016    195 non-null    float64
 11  2017    195 non-null    float64
 12  2018    195 non-null    float64
 13  2019    195 non-null    float64
 14  2030    195 non-null    float64
dtypes: float64(15)
memory usage: 24.4+ KB
----------------------------------------------------------------------------------------------------



DataFrame: ./processed_data/Young_age_dependency_ratio.csv


Unnamed: 0_level_0,1990,1995,2000,2005,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Afghanistan,97.7,96.0,100.2,95.9,97.3,95.5,93.2,90.6,87.9,85.2,83.2,81.3,79.3,77.3
Albania,53.1,53.8,48.5,40.8,33.6,31.8,30.3,29.0,28.0,27.2,26.8,26.3,25.8,25.4
Algeria,81.3,70.2,56.0,44.3,40.6,41.0,41.4,42.0,42.8,43.9,44.9,46.2,47.5,48.6
Angola,95.2,95.4,94.0,92.7,93.0,93.3,93.5,93.5,93.4,93.1,92.8,92.4,91.8,91.1
Antigua and Barbuda,50.3,47.4,44.3,39.9,35.6,35.1,34.4,33.7,33.1,32.6,32.4,32.1,31.9,31.8






Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1990,185.0,64.164865,24.091835,23.1,39.4,66.5,86.2,112.4
1995,185.0,61.158378,23.908777,21.9,37.5,62.4,82.3,108.0
2000,185.0,57.132973,23.754497,21.3,33.6,56.0,77.7,105.6
2005,185.0,52.511351,23.77077,19.5,29.4,48.6,71.9,102.7
2010,185.0,48.935676,23.786844,14.9,28.0,43.8,67.5,105.4
2011,185.0,48.516216,23.687819,15.2,27.9,43.3,67.0,106.1
2012,185.0,48.091892,23.54076,15.1,27.9,42.9,66.6,106.4
2013,185.0,47.683243,23.353658,14.9,27.7,43.0,66.2,106.5
2014,185.0,47.305946,23.131823,14.9,27.5,42.5,65.6,106.4
2015,185.0,46.958378,22.881879,15.2,27.3,42.1,64.9,106.1




<class 'pandas.core.frame.DataFrame'>
Index: 185 entries,  Afghanistan to  Zimbabwe
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1990    185 non-null    float64
 1   1995    185 non-null    float64
 2   2000    185 non-null    float64
 3   2005    185 non-null    float64
 4   2010    185 non-null    float64
 5   2011    185 non-null    float64
 6   2012    185 non-null    float64
 7   2013    185 non-null    float64
 8   2014    185 non-null    float64
 9   2015    185 non-null    float64
 10  2016    185 non-null    float64
 11  2017    185 non-null    float64
 12  2018    185 non-null    float64
 13  2019    185 non-null    float64
dtypes: float64(14)
memory usage: 21.7+ KB
----------------------------------------------------------------------------------------------------



DataFrame: ./processed_data/Median_age.csv


Unnamed: 0_level_0,1990,1995,2000,2005,2010,2015,2020
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Afghanistan,15.8,16.0,15.5,16.0,15.9,17.2,18.4
Albania,24.0,24.6,27.0,29.1,32.2,34.9,36.4
Algeria,18.0,19.4,21.7,24.1,26.0,27.5,28.5
Angola,16.1,16.1,16.3,16.4,16.4,16.4,16.7
Antigua and Barbuda,25.2,26.5,28.1,29.5,31.1,32.6,34.0






Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1990,185.0,23.252432,6.91836,14.4,17.5,20.5,29.2,38.4
1995,185.0,24.179459,7.340671,14.9,17.8,21.6,30.8,39.4
2000,185.0,25.215676,7.802273,15.0,18.3,22.7,31.9,41.2
2005,185.0,26.397838,8.163306,15.2,19.2,24.1,34.0,43.0
2010,185.0,27.629189,8.481836,15.0,20.5,26.0,35.5,44.7
2015,185.0,28.808649,8.82018,14.9,20.9,27.6,37.2,46.4
2020,185.0,30.049189,9.089184,15.2,21.8,29.5,38.2,48.4




<class 'pandas.core.frame.DataFrame'>
Index: 185 entries,  Afghanistan to  Zimbabwe
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1990    185 non-null    float64
 1   1995    185 non-null    float64
 2   2000    185 non-null    float64
 3   2005    185 non-null    float64
 4   2010    185 non-null    float64
 5   2015    185 non-null    float64
 6   2020    185 non-null    float64
dtypes: float64(7)
memory usage: 11.6+ KB
----------------------------------------------------------------------------------------------------



DataFrame: ./processed_data/Population_ages_15_64.csv


Unnamed: 0_level_0,1990,1995,2000,2005,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Afghanistan,6.1,9.0,10.1,12.8,14.4,15.0,15.7,16.5,17.3,18.1,18.8,19.5,20.2,20.9
Albania,2.0,1.9,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
Algeria,13.7,16.3,19.0,21.8,24.2,24.6,25.0,25.3,25.7,26.0,26.3,26.6,26.8,27.1
Angola,5.9,7.0,8.2,9.8,11.8,12.2,12.7,13.1,13.6,14.1,14.6,15.1,15.7,16.3
Antigua and Barbuda,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1






Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1990,185.0,17.474054,70.009469,0.0,1.1,3.5,10.3,774.2
1995,185.0,19.066486,76.039882,0.0,1.2,3.5,10.6,829.5
2000,185.0,20.803784,82.431898,0.0,1.4,4.0,12.6,882.7
2005,185.0,22.722703,90.292676,0.1,1.6,4.6,13.6,960.3
2010,185.0,24.491892,96.203904,0.1,1.8,5.2,15.0,1002.9
2011,185.0,24.807027,97.181571,0.1,1.8,5.3,15.2,1008.0
2012,185.0,25.13027,98.168984,0.1,1.9,5.4,15.7,1012.9
2013,185.0,25.443243,99.133403,0.1,1.9,5.5,16.2,1017.2
2014,185.0,25.747568,100.023927,0.1,1.9,5.5,16.3,1020.2
2015,185.0,26.044324,100.806942,0.1,1.9,5.7,16.5,1021.6




<class 'pandas.core.frame.DataFrame'>
Index: 185 entries,  Afghanistan to  Zimbabwe
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1990    185 non-null    float64
 1   1995    185 non-null    float64
 2   2000    185 non-null    float64
 3   2005    185 non-null    float64
 4   2010    185 non-null    float64
 5   2011    185 non-null    float64
 6   2012    185 non-null    float64
 7   2013    185 non-null    float64
 8   2014    185 non-null    float64
 9   2015    185 non-null    float64
 10  2016    185 non-null    float64
 11  2017    185 non-null    float64
 12  2018    185 non-null    float64
 13  2019    185 non-null    float64
dtypes: float64(14)
memory usage: 21.7+ KB
----------------------------------------------------------------------------------------------------



DataFrame: ./processed_data/Old_age_dependency_ratio.csv


Unnamed: 0_level_0,1990,1995,2000,2005,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Afghanistan,4.5,4.8,4.7,4.5,4.7,4.7,4.7,4.7,4.7,4.7,4.7,4.8,4.8,4.8
Albania,8.9,10.6,11.3,13.1,15.9,16.3,16.7,17.2,17.8,18.4,18.9,19.4,20.0,20.8
Algeria,6.3,6.4,7.0,7.5,8.1,8.2,8.3,8.5,8.7,9.0,9.3,9.6,10.0,10.4
Angola,4.9,5.1,5.1,5.0,4.7,4.6,4.6,4.6,4.6,4.5,4.5,4.4,4.3,4.3
Antigua and Barbuda,13.0,11.6,10.2,10.5,10.8,10.9,11.1,11.3,11.5,11.8,12.1,12.4,12.7,13.1






Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1990,185.0,9.818378,5.531126,1.8,6.1,7.4,13.3,27.7
1995,185.0,10.371892,6.018176,1.5,6.1,7.5,14.2,27.5
2000,185.0,10.763243,6.47211,1.5,6.0,8.0,15.5,27.1
2005,185.0,11.163784,7.020609,1.1,5.9,8.0,16.0,29.7
2010,185.0,11.531892,7.572498,0.8,5.9,8.3,16.4,35.1
2011,185.0,11.683243,7.766182,0.9,5.9,8.2,16.7,36.4
2012,185.0,11.855676,7.990782,0.9,5.8,8.3,17.3,38.0
2013,185.0,12.051892,8.2443,1.0,5.8,8.3,17.6,39.6
2014,185.0,12.268108,8.50259,1.1,5.7,8.5,18.0,41.2
2015,185.0,12.5,8.770969,1.1,5.7,8.7,18.4,42.7




<class 'pandas.core.frame.DataFrame'>
Index: 185 entries,  Afghanistan to  Zimbabwe
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1990    185 non-null    float64
 1   1995    185 non-null    float64
 2   2000    185 non-null    float64
 3   2005    185 non-null    float64
 4   2010    185 non-null    float64
 5   2011    185 non-null    float64
 6   2012    185 non-null    float64
 7   2013    185 non-null    float64
 8   2014    185 non-null    float64
 9   2015    185 non-null    float64
 10  2016    185 non-null    float64
 11  2017    185 non-null    float64
 12  2018    185 non-null    float64
 13  2019    185 non-null    float64
dtypes: float64(14)
memory usage: 21.7+ KB
----------------------------------------------------------------------------------------------------



DataFrame: ./processed_data/Population_under_age_5.csv


Unnamed: 0_level_0,1990,1995,2000,2005,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Afghanistan,2.4,3.5,4.2,4.9,5.3,5.4,5.4,5.4,5.5,5.5,5.5,5.6,5.6,5.6
Albania,0.4,0.3,0.3,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2
Algeria,4.0,3.7,3.1,3.0,3.9,4.1,4.3,4.4,4.5,4.7,4.7,4.8,5.0,5.0
Angola,2.3,2.7,3.1,3.7,4.5,4.6,4.8,4.9,5.1,5.2,5.3,5.4,5.6,5.7
Antigua and Barbuda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0






Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1990,185.0,3.465946,13.533677,0.0,0.2,0.7,2.3,133.0
1995,185.0,3.334054,12.134141,0.0,0.2,0.7,2.6,124.8
2000,185.0,3.317838,11.839495,0.0,0.2,0.7,2.7,127.8
2005,185.0,3.380541,11.779729,0.0,0.2,0.7,2.7,129.5
2010,185.0,3.515135,11.819636,0.0,0.2,0.7,2.9,128.2
2011,185.0,3.547568,11.819067,0.0,0.2,0.7,2.9,127.2
2012,185.0,3.574054,11.755345,0.0,0.2,0.7,3.0,125.3
2013,185.0,3.589189,11.651786,0.0,0.2,0.7,3.0,122.9
2014,185.0,3.603784,11.560001,0.0,0.2,0.8,2.9,120.7
2015,185.0,3.617297,11.483633,0.0,0.2,0.8,2.9,119.0




<class 'pandas.core.frame.DataFrame'>
Index: 185 entries,  Afghanistan to  Zimbabwe
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1990    185 non-null    float64
 1   1995    185 non-null    float64
 2   2000    185 non-null    float64
 3   2005    185 non-null    float64
 4   2010    185 non-null    float64
 5   2011    185 non-null    float64
 6   2012    185 non-null    float64
 7   2013    185 non-null    float64
 8   2014    185 non-null    float64
 9   2015    185 non-null    float64
 10  2016    185 non-null    float64
 11  2017    185 non-null    float64
 12  2018    185 non-null    float64
 13  2019    185 non-null    float64
dtypes: float64(14)
memory usage: 21.7+ KB
----------------------------------------------------------------------------------------------------



DataFrame: ./processed_data/Population _ages_65 _and _older.csv


Unnamed: 0_level_0,1990,1995,2000,2005,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Afghanistan,0.3,0.4,0.5,0.6,0.7,0.7,0.7,0.8,0.8,0.9,0.9,0.9,1.0,1.0
Albania,0.2,0.2,0.2,0.3,0.3,0.3,0.3,0.3,0.4,0.4,0.4,0.4,0.4,0.4
Algeria,0.9,1.0,1.3,1.6,2.0,2.0,2.1,2.1,2.2,2.3,2.4,2.6,2.7,2.8
Angola,0.3,0.4,0.4,0.5,0.6,0.6,0.6,0.6,0.6,0.6,0.7,0.7,0.7,0.7
Antigua and Barbuda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0






Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1990,185.0,1.763243,6.202695,0.0,0.1,0.3,1.0,66.3
1995,185.0,2.013514,7.06793,0.0,0.1,0.4,1.1,76.0
2000,185.0,2.264865,8.069225,0.0,0.1,0.4,1.3,87.9
2005,185.0,2.549189,9.141622,0.0,0.1,0.4,1.4,99.6
2010,185.0,2.821622,10.188301,0.0,0.1,0.5,1.6,110.5
2011,185.0,2.895135,10.484073,0.0,0.1,0.5,1.6,113.7
2012,185.0,2.976216,10.787653,0.0,0.1,0.5,1.7,117.0
2013,185.0,3.061622,11.124107,0.0,0.1,0.5,1.8,120.8
2014,185.0,3.155135,11.525265,0.0,0.1,0.5,1.8,125.5
2015,185.0,3.257297,12.001008,0.0,0.1,0.5,1.9,131.3




<class 'pandas.core.frame.DataFrame'>
Index: 185 entries,  Afghanistan to  Zimbabwe
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1990    185 non-null    float64
 1   1995    185 non-null    float64
 2   2000    185 non-null    float64
 3   2005    185 non-null    float64
 4   2010    185 non-null    float64
 5   2011    185 non-null    float64
 6   2012    185 non-null    float64
 7   2013    185 non-null    float64
 8   2014    185 non-null    float64
 9   2015    185 non-null    float64
 10  2016    185 non-null    float64
 11  2017    185 non-null    float64
 12  2018    185 non-null    float64
 13  2019    185 non-null    float64
dtypes: float64(14)
memory usage: 21.7+ KB
----------------------------------------------------------------------------------------------------





In [13]:
# 3. Interpolate values for median_age dataframe and save to file
df = pd.read_csv(os.path.join(_PROCESSED_DATA_DIR, 'Median_age.csv'))
df = df.set_index('Country')

new_cols = ['1990', '1995', '2000', '2005', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']
df = interpolate_columns(df, new_cols)
df.to_csv(os.path.join(_PROCESSED_DATA_DIR, 'Median_age.csv'), index=True)

In [14]:
#4 - Drop the 2030 column from total population as store it as a label dataframe
df = pd.read_csv('./raw_data/Total_Population.csv')
df = df.set_index('Country')

y_pop_2030 = df['2030']
df = df.drop('2030', axis=1)

df.to_csv('./raw_data/Total_Population.csv', index=True)
y_pop_2030.to_csv('./processed_data/y_pop_2030.csv', index=True)

In [15]:
for file in csv_full_path_list:
  temp_df = pd.read_csv(file)
  print (f"csv file: {file} - columns: {temp_df.columns}")

csv file: ./processed_data/Urban_Population.csv - columns: Index(['Country', '1990', '1995', '2000', '2005', '2010', '2011', '2012',
       '2013', '2014', '2015', '2016', '2017', '2018', '2019'],
      dtype='object')
csv file: ./processed_data/Total_Population.csv - columns: Index(['Country', '1990', '1995', '2000', '2005', '2010', '2011', '2012',
       '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2030'],
      dtype='object')
csv file: ./processed_data/Young_age_dependency_ratio.csv - columns: Index(['Country', '1990', '1995', '2000', '2005', '2010', '2011', '2012',
       '2013', '2014', '2015', '2016', '2017', '2018', '2019'],
      dtype='object')
csv file: ./processed_data/Median_age.csv - columns: Index(['Country', '1990', '1995', '2000', '2005', '2010', '2015', '2020',
       '2011', '2012', '2013', '2014', '2016', '2017', '2018', '2019'],
      dtype='object')
csv file: ./processed_data/Population_ages_15_64.csv - columns: Index(['Country', '1990', '1995', '2000'

**Next Task**
1.   Merge all the dataframes into one and save into the processed_data directory



In [16]:
# 1. Merge all the dataframes into one and save into the processed_data directory
res_df = None
for file in csv_file_list:
  df = pd.read_csv(os.path.join(_PROCESSED_DATA_DIR, file))
  df = df.set_index('Country')
  cols = df.columns
  new_cols = [col+"-"+os.path.split(file)[1].replace('.csv',"") for col in cols]
  df.columns = new_cols

  if type(res_df) == type(None):
    res_df = df
  else:
    res_df = pd.merge(res_df, df, how='outer', on='Country')

# Dropping partial rows
res_df = res_df.dropna(axis=0)

res_df.to_csv('./processed_data/UNDP_Demographics_Data.csv', index=True)

In [17]:
shutil.make_archive('./UNDP-Processed', 'zip', _PROCESSED_DATA_DIR)

'/content/UNDP-Processed.zip'