<a href="https://colab.research.google.com/github/fernandoGitHub/ML_Projects/blob/main/UNDP_Demographics_Data/UNDP_Demographics_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
!wget https://raw.githubusercontent.com/fernandoGitHub/MLOPS_GSD/main/MLOP_setup.py

--2022-05-15 05:37:41--  https://raw.githubusercontent.com/fernandoGitHub/MLOPS_GSD/main/MLOP_setup.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176 (2.1K) [text/plain]
Saving to: ‘MLOP_setup.py’


2022-05-15 05:37:41 (41.4 MB/s) - ‘MLOP_setup.py’ saved [2176/2176]



In [2]:
import MLOP_setup

MLOP_setup.install_package('TF_DATA_VALIDATION')
MLOP_setup.install_package('TF_TRANSFORM')
MLOP_setup.install_package('TFX')

Installing tensorflow-data-validation ...
Package tensorflow-data-validation has been successfully installed
Reloading Packages
Installing tensorflow-transform ...
Package tensorflow-transform has been successfully installed
Reloading Packages
Installing tfx ...
Package tfx has been successfully installed
Reloading Packages


In [3]:
import tensorflow as tf
import tensorflow_data_validation as tfdv

from tfx import v1 as tfx
from tfx.types import standard_artifacts
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from google.protobuf.json_format import MessageToDict
from tensorflow_metadata.proto.v0 import schema_pb2

import os
import pprint
pp = pprint.PrettyPrinter()

import pandas as pd

In [4]:
MLOP_setup.load_and_import_TF_libraries()

import TF_pipeline
import TF_schema

Installing wget ...
Package wget has been successfully installed
Reloading Packages
Fetching from GitHub: TF_pipeline.py ...
Fetching from GitHub: TF_stat.py ...
Fetching from GitHub: TF_transform.py ...
Fetching from GitHub: TF_schema.py ...


# Data Preparation

In [5]:
import os
import shutil

_RAW_DATA_DIR = './raw_data'
_RAW_DATA_ZIP = os.path.join(_RAW_DATA_DIR, 'UNDP_Demographics_Data.zip')

if os.path.isdir('./sample_data'):
  shutil.rmtree('./sample_data')

if not os.path.isdir(_RAW_DATA_DIR):
  os.makedirs(_RAW_DATA_DIR)

if not os.path.isfile(_RAW_DATA_ZIP):
  !wget -O ./raw_data/UNDP_Demographics_Data.zip https://github.com/fernandoGitHub/ML_Projects/raw/main/UNDP_Demographics_Data/data/UNDP_Demographics_Data.zip

--2022-05-15 05:39:24--  https://github.com/fernandoGitHub/ML_Projects/raw/main/UNDP_Demographics_Data/data/UNDP_Demographics_Data.zip
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/fernandoGitHub/ML_Projects/main/UNDP_Demographics_Data/data/UNDP_Demographics_Data.zip [following]
--2022-05-15 05:39:24--  https://raw.githubusercontent.com/fernandoGitHub/ML_Projects/main/UNDP_Demographics_Data/data/UNDP_Demographics_Data.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 44345 (43K) [application/zip]
Saving to: ‘./raw_data/UNDP_Demographics_Data.zip’


2022-05-15 05:39:24 (4.36 MB/s) - ‘./raw_da

In [6]:
from zipfile import ZipFile

zip_file_name = _RAW_DATA_ZIP

with ZipFile(zip_file_name, 'r') as zip:
  # printing all the contents of the zip file
  zip.printdir()

  # extracting all the files
  zip.extractall(_RAW_DATA_DIR)

File Name                                             Modified             Size
Median_age.csv                                 2022-05-13 15:11:54         9288
Old_age_dependency_ratio.csv                   2022-05-13 15:11:54        14159
Population _ages_65 _and _older.csv            2022-05-13 15:11:54        12651
Population_ages_15_64.csv                      2022-05-13 15:11:54        14059
Population_under_age_5.csv                     2022-05-13 15:11:54        12955
Sex_ratio_at_birth.csv                         2022-05-13 15:11:54         9570
Total_Population.csv                           2022-05-13 15:11:54        15523
Urban_Population.csv                           2022-05-13 15:11:54        16114
Young_age_dependency_ratio.csv                 2022-05-13 15:11:54        15756


# Dataset Preparation

In [7]:
csv_list = os.listdir(path=_RAW_DATA_DIR)

csv_list = [file for file in csv_list if '.csv' in file]

In [170]:
def create_dataset_by_year(year):

  first = True
  for csv_file in csv_list:
    csv_path = os.path.join(_RAW_DATA_DIR, csv_file)
    df = pd.read_csv(csv_path)[['Country', year]]
    df = df.rename(columns={year: csv_file.replace('.csv', '')+'_'+year})
    df.set_index('Country')

    if first:
      result_df = df
      result_df.set_index('Country')
      first = False
    else:
      result_df = result_df.reset_index().merge(df, how="outer").set_index('Country')

  result_df = result_df.drop('index', axis=1)

  return result_df

In [171]:
df = create_dataset_by_year('2015')

In [172]:
df


Unnamed: 0_level_0,Population _ages_65 _and _older_2015,Median_age_2015,Total_Population_2015,Sex_ratio_at_birth_2015,Population_under_age_5_2015,Old_age_dependency_ratio_2015,Urban_Population_2015,Population_ages_15_64_2015,Young_age_dependency_ratio_2015
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Afghanistan,0.9,17.2,34.4,1.06,5.5,4.7,24.8,18.1,85.2
Albania,0.4,34.9,2.9,1.1,0.2,18.4,57.4,2,27.2
Algeria,2.3,27.5,39.7,1.05,4.7,9,70.8,26,43.9
Angola,0.6,16.4,27.9,1.03,5.2,4.5,63.4,14.1,93.1
Antigua and Barbuda,0,32.6,0.1,1.03,0,11.8,25.0,0.1,32.6
...,...,...,...,...,...,...,...,...,...
Palau,..,..,0.0,..,..,..,78.2,..,..
Monaco,,,0.0,,,,100.0,,
Nauru,,,0.0,,,,100.0,,
San Marino,,,0.0,,,,96.7,,
