In [21]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

import datetime
from os import listdir
from os.path import isfile, join
import glob
import re

# Load the raw CSV into a pandas DataFrame.
DATA_PATH = '~/Documents/Repository/Capstone-1_WorldBank_GenderData/Data.csv'
df = pd.read_csv(DATA_PATH)

In [22]:
# Preview the first two rows of the freshly loaded frame.
df.head(n=2)

Unnamed: 0,Country.Name,Country.Code,Indicator.Name,Indicator.Code,1960,1961,1962,1963,1964,1965,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,Arab World,ARB,"Access to anti-retroviral drugs, female (%)",SH.HIV.ARTC.FE.ZS,,,,,,,...,3.585854,5.611509,7.350393,8.645366,10.059527,12.358258,15.225028,17.751386,21.102336,
1,Arab World,ARB,"Access to anti-retroviral drugs, male (%)",SH.HIV.ARTC.MA.ZS,,,,,,,...,4.40783,6.393077,7.52021,9.119101,10.627401,11.652603,13.84155,15.956337,18.406402,


In [23]:
# Reshape from wide (one column per year) to long: every former year column
# becomes a row, identified by country, indicator and year.
id_vars = ["Country.Name",
           "Country.Code",
           "Indicator.Name",
           "Indicator.Code"]

df = pd.melt(frame=df, id_vars=id_vars, var_name="year", value_name="value")

# Formatting: pull the digits out of each former column label and store them as
# an integer year. The pattern is a raw string so that `\d` reaches the regex
# engine untouched instead of being parsed as an (invalid) string escape, which
# newer Python versions warn about.
df["year"] = df["year"].str.extract(r'(\d+)', expand=False).astype(int)
df["value"] = df["value"].astype(float)

# Drop every row that has a missing entry in any column
df = df.dropna()

# Fix the column order: identifier columns first, then year and value
df = df[id_vars + ["year", "value"]]

df = df.sort_values(ascending=True, by=["Country.Name", "Indicator.Name", "year"])

df.head(20)

Unnamed: 0,Country.Name,Country.Code,Indicator.Name,Indicator.Code,year,value
7269408,Afghanistan,AFG,"Access to anti-retroviral drugs, female (%)",SH.HIV.ARTC.FE.ZS,2000,0.0
7450352,Afghanistan,AFG,"Access to anti-retroviral drugs, female (%)",SH.HIV.ARTC.FE.ZS,2001,0.0
7631296,Afghanistan,AFG,"Access to anti-retroviral drugs, female (%)",SH.HIV.ARTC.FE.ZS,2002,0.0
7812240,Afghanistan,AFG,"Access to anti-retroviral drugs, female (%)",SH.HIV.ARTC.FE.ZS,2003,0.0
7993184,Afghanistan,AFG,"Access to anti-retroviral drugs, female (%)",SH.HIV.ARTC.FE.ZS,2004,0.0
8174128,Afghanistan,AFG,"Access to anti-retroviral drugs, female (%)",SH.HIV.ARTC.FE.ZS,2005,0.0
8355072,Afghanistan,AFG,"Access to anti-retroviral drugs, female (%)",SH.HIV.ARTC.FE.ZS,2006,0.0
8536016,Afghanistan,AFG,"Access to anti-retroviral drugs, female (%)",SH.HIV.ARTC.FE.ZS,2007,0.0
8716960,Afghanistan,AFG,"Access to anti-retroviral drugs, female (%)",SH.HIV.ARTC.FE.ZS,2008,0.0
8897904,Afghanistan,AFG,"Access to anti-retroviral drugs, female (%)",SH.HIV.ARTC.FE.ZS,2009,1.0


In [24]:
# Count the remaining (non-missing) observations per year and list the 15 years
# with the most data, i.e. the fewest missing values.
observations_per_year = df["year"].value_counts()
observations_per_year.sort_values(ascending=False).head(15)

2011    47985
2014    44370
2010    42365
2012    41820
2000    39529
2009    37999
2013    37871
2005    37647
2007    36995
2006    36756
2008    36748
2004    34423
2002    33683
2003    33428
2001    33347
Name: year, dtype: int64

In [25]:
# Keep only the years 2000 through 2015 (both ends included), i.e. strictly
# after 1999 and strictly before 2016.
year_in_range = df["year"].between(2000, 2015)
df = df[year_in_range]
df.head()

Unnamed: 0,Country.Name,Country.Code,Indicator.Name,Indicator.Code,year,value
7269408,Afghanistan,AFG,"Access to anti-retroviral drugs, female (%)",SH.HIV.ARTC.FE.ZS,2000,0.0
7450352,Afghanistan,AFG,"Access to anti-retroviral drugs, female (%)",SH.HIV.ARTC.FE.ZS,2001,0.0
7631296,Afghanistan,AFG,"Access to anti-retroviral drugs, female (%)",SH.HIV.ARTC.FE.ZS,2002,0.0
7812240,Afghanistan,AFG,"Access to anti-retroviral drugs, female (%)",SH.HIV.ARTC.FE.ZS,2003,0.0
7993184,Afghanistan,AFG,"Access to anti-retroviral drugs, female (%)",SH.HIV.ARTC.FE.ZS,2004,0.0


In [26]:
# Pivot "Indicator.Name" back out so that every indicator becomes its own
# column, giving one row per (country, year). The two index levels are then
# turned back into ordinary columns.
df = (
    df.pivot_table(index=["Country.Name", "year"],
                   columns="Indicator.Name",
                   values="value")
      .reset_index(drop=False)
)
df.head(10)

Indicator.Name,Country.Name,year,"Access to anti-retroviral drugs, female (%)","Access to anti-retroviral drugs, male (%)","Account at a financial institution, female (% age 15+) [ts]","Account at a financial institution, male (% age 15+) [ts]","Adolescent fertility rate (births per 1,000 women ages 15-19)","Age at first marriage, female","Age at first marriage, male",Age dependency ratio (% of working-age population),...,Women who own land both alone and jointly (% of women age 15-49): Q3,Women who own land both alone and jointly (% of women age 15-49): Q4,Women who own land both alone and jointly (% of women age 15-49): Q5 (highest),Women who own land jointly (% of women age 15-49),Women who own land jointly (% of women age 15-49): Q1 (lowest),Women who own land jointly (% of women age 15-49): Q2,Women who own land jointly (% of women age 15-49): Q3,Women who own land jointly (% of women age 15-49): Q4,Women who own land jointly (% of women age 15-49): Q5 (highest),Women who were first married by age 18 (% of women ages 20-24)
0,Afghanistan,2000,0.0,0.0,,,153.8456,,,103.254202,...,,,,,,,,,,
1,Afghanistan,2001,0.0,0.0,,,150.0468,,,102.933042,...,,,,,,,,,,
2,Afghanistan,2002,0.0,0.0,,,146.248,,,102.21702,...,,,,,,,,,,
3,Afghanistan,2003,0.0,0.0,,,140.4764,,,101.290161,...,,,,,,,,,,
4,Afghanistan,2004,0.0,0.0,,,134.7048,,,100.247559,...,,,,,,,,,,
5,Afghanistan,2005,0.0,0.0,,,128.9332,,,99.078444,...,,,,,,,,,,
6,Afghanistan,2006,0.0,0.0,,,123.1616,,,99.574274,...,,,,,,,,,,
7,Afghanistan,2007,0.0,0.0,,,117.39,,,100.000371,...,,,,,,,,,,
8,Afghanistan,2008,0.0,0.0,,,111.4708,15.0,25.3,100.215886,...,,,,,,,,,,39.0
9,Afghanistan,2009,1.0,0.0,,,105.5516,,,100.06048,...,,,,,,,,,,


In [27]:
# Scatter plot of contraceptive prevalence (x axis) against life expectancy at
# birth (y axis), one point per country-year row.
x_column = 'Contraceptive prevalence, any methods (% of women ages 15-49)'
y_column = 'Life expectancy at birth, total (years)'
df.plot(kind='scatter', x=x_column, y=y_column)

<matplotlib.axes._subplots.AxesSubplot at 0xa9a2c54c>

In [30]:
# Save the wide country-year table next to the notebook. Default to_csv
# settings are used, so the row index is written as an unnamed first column.
OUTPUT_CSV = 'wrangled_data.csv'
df.to_csv(OUTPUT_CSV)