In [59]:
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib as mpl
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from matplotlib.lines import Line2D
import plotly.express as px
import requests

# Load the six source tables. read_csv already defaults to a comma separator,
# so it does not need to be spelled out.
GD = pd.read_csv('gender_development.csv')
GII = pd.read_csv('gender_inequality.csv')
HI = pd.read_csv('historical_index.csv')
HD = pd.read_csv('human_development.csv')
IA = pd.read_csv('inequality_adjusted.csv')
MP = pd.read_csv('multidimensional_poverty.csv')



In [22]:
# GD.head()
# GII.head()
# HI.head()
# HD.head()
# IA.head()
# MP.head()

# HI.shape
# GD.shape
# GII.shape

# 1. Files selected are: human development, historical index, and gender development
# 2. Compute the education deficit
# 3. Merge the selected columns to produce 'project.csv'
# 4. Build color and marker representations for selected columns;
#    values are grouped into as many bins as there are colors or markers
# 5. Questions to answer with the prepared data
#      a. Have human development and gender development impacted the growth of
#         Gross National Income (GNI) over the years?
#      b. Has expected years of education impacted human development? (True or False)


# Attach every HD column after the first two to the historical index table.
# The join aligns rows on the index; the '..' placeholders are replaced by 0.
HHDI = HI.join(HD.iloc[:, 2:]).replace('..', 0)

# Education deficit = expected years of education - mean years of education
edu_deficit = HHDI['Expected Years of Education'] - HHDI['Mean Years of Education']
GD['Gender Development Index (GDI)'] = GD['Gender Development Index (GDI)'].replace('..', 0)

# Put the deficit and the GDI side by side (paired by position, not by index)
# and coerce both columns to float.
GDED = pd.DataFrame(
    np.array([edu_deficit, GD['Gender Development Index (GDI)']]).T,
    columns=['Years of Education Deficit', 'Gender Development Index (GDI)'],
    dtype="float",
)

HHDI = HHDI.join(GDED)

# Label palettes handed to groupBy: one colour / one marker symbol per bin.
colors = [
    'red', 'blue', 'green', 'yellow', 'brown',
    'orange', 'purple', 'olive', 'gray', 'cyan',
]
shape = ['o', 's', '+', '*', 'x', 'd', '^', '1', '2', 'v']

def groupBy(data, colorOrShape):
    """Bin numeric values into equal-width intervals and label each value.

    The range ``[min(data), max(data)]`` is cut into ``len(colorOrShape)``
    equal-width bins and every value receives the label of the bin it falls
    in. A value lying exactly on an interior edge goes to the lower bin, so
    the result always holds exactly one label per input value.

    Parameters
    ----------
    data : pandas.Series or array-like of numbers
        Values to bin. NaN entries are ignored when computing the range.
    colorOrShape : sequence
        Labels (e.g. colour names or marker symbols), one per bin.

    Returns
    -------
    list
        ``len(data)`` labels in input order; NaN inputs map to ``None`` so
        positions stay aligned with ``data``. An empty list is returned (after
        printing a message) when the arguments cannot be interpreted.
    """
    try:
        n_bins = len(colorOrShape)
        values = np.asarray(data, dtype=float).ravel()
    except (TypeError, ValueError):
        print('Invalid data or colorShape is not array')
        return []
    if n_bins == 0:
        # No labels to hand out; treated like any other invalid argument.
        print('Invalid data or colorShape is not array')
        return []
    if values.size == 0:
        return []

    valid = ~np.isnan(values)
    if not valid.any():
        return [None] * values.size

    lowest = values[valid].min()
    highest = values[valid].max()
    # linspace pins both end points exactly, unlike repeatedly adding a float
    # step, so the maximum can never spill into a non-existent extra bin.
    edges = np.linspace(lowest, highest, n_bins + 1)
    # side='left' sends a value sitting on an edge to the lower bin; clipping
    # folds the minimum (index -1) and any NaN position into a valid bin index.
    bin_index = np.clip(np.searchsorted(edges, values, side='left') - 1, 0, n_bins - 1)
    return [colorOrShape[b] if ok else None for b, ok in zip(bin_index, valid)]

# Translate each row's GDI into a colour label and its education deficit into
# a marker label.
GDI_colors = groupBy(HHDI['Gender Development Index (GDI)'], colors)
ED_markers = groupBy(HHDI['Years of Education Deficit'], shape)

GDI_ED_colors_markers = pd.DataFrame(
    np.array([GDI_colors, ED_markers]).T,
    columns=['GDI Grouped by Colors', 'YED Grouped by markers'],
)
HHDI = HHDI.join(GDI_ED_colors_markers)

# GNI per capita is handled as text here: strip the commas, then cast to float.
temp = pd.DataFrame(
    np.array([HHDI['Gross National Income (GNI) per Capita'].str.replace(',', '')]).T,
    columns=['Gross National Income (GNI) per Capita'],
    dtype="float",
)
HHDI['Gross National Income (GNI) per Capita'] = temp['Gross National Income (GNI) per Capita']

# Order the rows by ascending GNI per capita.
HHDI = HHDI.sort_values(by=['Gross National Income (GNI) per Capita'])

# Rescale life expectancy to the [0, 1] range.
min_max_scaler = MinMaxScaler()
LE = min_max_scaler.fit_transform(np.array(HHDI['Life Expectancy at Birth']).reshape(-1, 1))
# HHDI has just been re-ordered by sort_values, so its index labels are no
# longer 0..n-1 in row order. Column assignment aligns a Series on index
# labels, so the scaled values must carry HHDI's own index; a fresh
# RangeIndex would put each value on the wrong row.
LE = pd.Series(LE.ravel(), index=HHDI.index)
HHDI['Life Expectancy at Birth'] = LE


# Get the bounding box or the center (latitude, longitude) of a country
def get_boundingbox_country(country, output_as='boundingbox', timeout=10):
    """
    Look up a country's bounding box or center with the Nominatim search API.

    Parameters
    ----------
    country : str
        name of the country in english and lowercase
    output_as : str
        choose from 'boundingbox' or 'center'.
         - 'boundingbox' for [latmin, latmax, lonmin, lonmax]
         - 'center' for [latcenter, loncenter]
    timeout : float
        seconds to wait for the HTTP response before giving up

    Returns
    -------
    output : list
        list with the coordinates as float

    Raises
    ------
    ValueError
        if ``output_as`` is not one of the two supported values
    IndexError
        if the service returns no match for ``country``
    requests.RequestException
        on network failures, timeouts or HTTP error statuses
    """
    # Validate before touching the network so a typo fails fast and clearly
    # instead of surfacing later as an unbound local variable.
    if output_as not in ('boundingbox', 'center'):
        raise ValueError(
            "output_as must be 'boundingbox' or 'center', got {0!r}".format(output_as)
        )

    # Passing the query through `params` lets requests URL-encode names that
    # contain spaces or other special characters.
    response = requests.get(
        'https://nominatim.openstreetmap.org/search',
        params={'country': country, 'format': 'json', 'polygon': 0},
        timeout=timeout,
    )
    response.raise_for_status()

    matches = response.json()
    if not matches:
        raise IndexError('no Nominatim result for country {0!r}'.format(country))
    best_match = matches[0]

    if output_as == 'boundingbox':
        return [float(value) for value in best_match['boundingbox']]
    return [float(best_match[key]) for key in ('lat', 'lon')]



# Geocode every country in HHDI's row order and store the coordinates as the
# 'Lat' / 'Lon' columns.
# NOTE(review): this issues one HTTP request per row on every run; consider
# caching the results — confirm against the geocoding service's usage policy.
PACIFIC_FALLBACK = [4.050851, -156.422969]  # Coordinate in the pacific

lat = []
lon = []
for country in HHDI['Country']:
    if isinstance(country, str):
        # Keep only the part before any "(" or ",", e.g.
        # "Congo (Democratic Republic of the)" -> "Congo"; strip the blank
        # that the split leaves behind.
        country = country.split('(')[0].split(',')[0].strip()
    else:
        print('Split error')

    try:
        latLon = get_boundingbox_country(country=country, output_as="center")
    except Exception:
        # Best effort: countries that cannot be geocoded are parked in the
        # Pacific. Catching Exception (not a bare except) keeps Ctrl-C able
        # to interrupt this long, network-bound loop.
        print('Error: Into the pacific')
        latLon = PACIFIC_FALLBACK

    lat.append(latLon[0])
    lon.append(latLon[1])

print(len(lat))
print(len(lon))
HHDI['Lat'] = lat
HHDI['Lon'] = lon






def _hdi_vs_gni_scatter(hdi_column):
    """Scatter GNI per capita against the given HDI column of HHDI.

    Marker size follows life expectancy, the symbol follows the GDI column and
    the colour follows the country.
    """
    return px.scatter(
        HHDI,
        x=hdi_column,
        y='Gross National Income (GNI) per Capita',
        size='Life Expectancy at Birth',
        symbol='Gender Development Index (GDI)',
        color='Country',
    )


fig2 = _hdi_vs_gni_scatter('Human Development Index (1990)')
fig2.show()

fig1 = _hdi_vs_gni_scatter('Human Development Index (2014)')
fig1.show()


Error: Into the pacific
Error: Into the pacific
Error: Into the pacific
Error: Into the pacific
Error: Into the pacific
Error: Into the pacific
Error: Into the pacific
Error: Into the pacific
Error: Into the pacific
Error: Into the pacific
195
195


In [23]:
# Preview the first five rows of the merged table.
HHDI.head()

Unnamed: 0,HDI Rank,Country,Human Development Index (1990),Human Development Index (2000),Human Development Index (2010),Human Development Index (2011),Human Development Index (2012),Human Development Index (2013),Human Development Index (2014),Human Development Index (HDI),...,Expected Years of Education,Mean Years of Education,Gross National Income (GNI) per Capita,GNI per Capita Rank Minus HDI Rank,Years of Education Deficit,Gender Development Index (GDI),GDI Grouped by Colors,YED Grouped by markers,Lat,Lon
186,187.0,Central African Republic,0.314,0.31,0.362,0.368,0.373,0.348,0.35,0.35,...,7.2,4.2,581.0,1.0,3.0,0.773,olive,+,7.03236,19.998123
175,176.0,Congo (Democratic Republic of the),0.355,0.329,0.408,0.418,0.423,0.43,0.433,0.433,...,9.8,6.0,680.0,11.0,3.8,0.833,gray,*,-0.726433,15.641915
172,173.0,Malawi,0.284,0.34,0.42,0.429,0.433,0.439,0.445,0.445,...,10.8,4.3,747.0,13.0,6.5,0.907,gray,^,-13.26872,33.930196
183,184.0,Burundi,0.295,0.301,0.39,0.392,0.395,0.397,0.4,0.4,...,10.1,2.7,758.0,1.0,7.4,0.911,gray,1,-3.4265,29.932395
176,177.0,Liberia,0.0,0.359,0.405,0.414,0.419,0.424,0.43,0.43,...,9.5,4.1,805.0,7.0,5.4,0.789,olive,d,5.749972,-9.365852


In [25]:
# Histogram over the 2014 HDI with GNI per capita as the y values, one colour
# per country.
# NOTE(review): no `pattern_shape` column is passed, so the pattern sequence
# below is presumably unused — confirm.
fig3 = px.histogram(
    HHDI,
    x="Human Development Index (2014)",
    y="Gross National Income (GNI) per Capita",
    color="Country",
    pattern_shape_sequence=['.', '+', '*', 'x'],
)
fig3.show()
print(HHDI.shape)

(195, 19)


In [45]:
 
# Rescale the GDI to [0, 1] so it can be passed as marker opacity.
# NOTE: this cell overwrites the GDI column in place, so running it twice
# rescales values that were already rescaled.
min_max_scaler = MinMaxScaler()
GDI = min_max_scaler.fit_transform(np.array(HHDI['Gender Development Index (GDI)']).reshape(-1, 1))
# HHDI was re-ordered by sort_values earlier, so its index labels are not
# 0..n-1 in row order. The scaled values must reuse HHDI's index; a fresh
# RangeIndex Series would be aligned on labels and land on the wrong rows.
GDI = pd.Series(GDI.ravel(), index=HHDI.index)
HHDI['Gender Development Index (GDI)'] = GDI
# Lift exact zeros to 0.3 so those markers do not become fully transparent.
HHDI['Gender Development Index (GDI)'] = HHDI['Gender Development Index (GDI)'].replace(0, 0.3)
GDI = HHDI['Gender Development Index (GDI)']

# NOTE(review): plotly express documents `opacity` as a single float; passing
# a Series may not produce per-marker opacity — confirm.
fig = px.scatter_geo(
    HHDI,
    color="Country",                       # column that sets the marker colour
    symbol="YED Grouped by markers",
    size="Human Development Index (HDI)",  # size of markers
    lat="Lat",
    lon="Lon",
    projection="natural earth",
    opacity=GDI,
    size_max=10,
)
fig.show()


# HHDI.to_csv('project.csv', index=False)

In [159]:
# Reshape the per-year HDI columns from wide to long format (one row per
# country and year) and draw one line per country.
HD_Transpose = HHDI.iloc[:, [2, 2]]  # NOTE(review): not used in this cell — confirm it is still needed
year = ['1990', '2000', '2010', '2011', '2012', '2013', '2014']
# Select the yearly columns by name rather than by position, so that adding or
# reordering columns in HHDI cannot silently pick the wrong data.
hdi_columns = ['Human Development Index ({0})'.format(y) for y in year]

# The country names must come from HHDI itself: HHDI has been re-sorted by GNI,
# so pairing its values positionally with HD['Country'] (still in the file's
# original order) would attach the wrong name to every series.
countries = pd.concat([HHDI['Country']] * len(year), ignore_index=True)
years_series = [y for y in year for _ in range(len(HHDI))]
hdi = pd.concat([HHDI[column] for column in hdi_columns], ignore_index=True)

hdi = pd.DataFrame({
    "Country": countries,
    'Human development Index': hdi,
    'Year': years_series,
})

fig5 = px.line(hdi, x="Year", y="Human development Index", color='Country')
fig5.show()
