In [2]:
import os # to read API credentials from environment variables

import fredapi as fa # to import data from Federal Reserve API
import pandas as pd
import plotly.express as px # to chart the data
import wbgapi as wb # to import data from World Bank API

#### Federal Reserve API

In [268]:
# Downloads data from Federal Reserve API

# The FRED API key is a credential and must never be stored in the notebook itself.
# Export it before starting Jupyter (e.g. `export FRED_API_KEY=<your key>`).
# NOTE(review): the key that used to be hardcoded in this cell has been exposed and should be revoked.
fred_api_key = os.environ.get("FRED_API_KEY")
if not fred_api_key:
    raise RuntimeError("Set the FRED_API_KEY environment variable to your FRED API key before running this cell.")
fred = fa.Fred(api_key=fred_api_key)

# comparable datasets
real_gdp_capita_US_USD_NSA_A = fred.get_series('NYGDPPCAPKDUSA') # Constant GDP per capita for the United States (2010 U.S. Dollars, Not Seasonally Adjusted)
real_gdp_capita_UK_USD_NSA_A = fred.get_series('NYGDPPCAPKDGBR') # Constant GDP per capita for the United Kingdom (2010 U.S. Dollars, Not Seasonally Adjusted)
nominal_gdp_US_USD_A = fred.get_series('MKTGDPUSA646NWDB') # Gross Domestic Product for United States
nominal_gdp_UK_USD_A = fred.get_series('MKTGDPGBA646NWDB') # Gross Domestic Product for United Kingdom
nominal_gdp_US_NSA_Q = fred.get_series('NA000334Q') # Gross Domestic Product for United States
nominal_gdp_UK_NSA_Q = fred.get_series('CPMNACNSAB1GQUK') # Gross Domestic Product for United Kingdom
CPI_US_YoY_NSA_M = fred.get_series('CPALTT01USM659N') # Consumer Price Index: Total All Items for the United States
CPI_UK_YoY_NSA_M = fred.get_series('CPALTT01GBM659N') # Consumer Price Index: Total All Items for the United Kingdom
unemp_US_SA_M = fred.get_series('UNRATE') # unemployment rate in the United States
unemp_UK_SA_M = fred.get_series('AURUKM') # Unemployment Rate in the United Kingdom
# NOTE(review): POPTHM looks like a monthly series while the variable name is suffixed '_A' (annual) - confirm the frequency before comparing with the UK series.
population_US_NSA_A = fred.get_series('POPTHM') # Population in the United States
population_UK_NSA_A = fred.get_series('POPUKA') # Population in the United Kingdom
gov_debt_to_gdp_US_NSA_A = fred.get_series('GGGDTAUSA188N') # General government gross debt for United States
gov_debt_to_gdp_UK_NSA_A = fred.get_series('GGGDTAGBA188N') # General government gross debt for United Kingdom
house_debt_to_gdp_US_NSA_Q = fred.get_series('HDTGPDUSQ163N') # Household Debt to GDP for United States
house_debt_to_gdp_UK_NSA_Q = fred.get_series('HDTGPDGBQ163N') # Household Debt to GDP for United Kingdom
# non-comparable datasets
real_gdp_US_SA_Q = fred.get_series('GDPC1') # Real Gross Domestic Product for United States (Billions of Chained 2012 Dollars, Seasonally Adjusted Annual Rate)
real_gdp_UK_SA_Q = fred.get_series('NGDPRSAXDCGBQ') # Real Gross Domestic Product for Great Britain (Domestic Currency, Seasonally Adjusted)
ind_prod_US_SA_M = fred.get_series('INDPRO') # Industrial Production: Total Index in the United States (Index 2017=100, Seasonally Adjusted)
ind_prod_UK_SA_M = fred.get_series('GBRPROINDMISMEI') # Production of Total Industry in the United Kingdom (Index 2015=100, Seasonally Adjusted)

# current account balance
# FDI

# Show the latest observations as a quick check that the download worked.
real_gdp_US_SA_Q.tail()

2021-07-01    19672.594
2021-10-01    20006.181
2022-01-01    19924.088
2022-04-01    19895.271
2022-07-01    20039.406
dtype: float64

In [269]:
# Label both series so the combined frame gets meaningful column names.
ind_prod_US_SA_M.name = 'ind_prod_US_SA_M'
ind_prod_UK_SA_M.name = 'ind_prod_UK_SA_M'

# Inner join on the date index: only dates present in both series are kept.
df = ind_prod_US_SA_M.to_frame().join(ind_prod_UK_SA_M, how='inner')
df.tail()

Unnamed: 0,ind_prod_US_SA_M,ind_prod_UK_SA_M
2022-04-01,104.2709,113.229572
2022-05-01,104.1646,113.65405
2022-06-01,104.0772,113.335692
2022-07-01,104.7577,112.062257
2022-08-01,104.6544,110.045985


#### World Bank API

In [80]:
# Downloads data from World Bank API. 
# Documentation is here: https://github.com/tgherzog/wbgapi


# find the right data to import:

# wb.source.info() # shows names of World Bank databases.
# wb.series.info() # shows all World Bank indicators (e.g. 'EG.ELC.ACCS.ZS'). By default, it shows all indicators in the World Development Indicators (WDI) database.
# wb.series.info(q="GDP") # to search for a specific indicator in World Bank API using a keyword
# wb.economy.info() # shows names and codes of countries and economies and codes for their region & income levels in World Bank databases
# wb.economy.info(q="Azerbaijan") # to search for a specific country/economy in WB API using a keyword
# wb.region.info() # shows names of geographic regions in World Bank databases
# wb.income.info() # shows income groups in World Bank databases
# wb.search('NY.GDP.MKTP.CD') # deeper search on all meta data 
# wb.series.metadata.get('EG.ELC.ACCS.ZS') # search for metadata for a specific indicator in World Bank API
# wb.economy.metadata.get('FRA') # search for metadata for a specific country in World Bank API


# summary of key indicators saved as dictionary:

# mydict = {
# # economic indicators
# 'NY.GDP.MKTP.CD' : 'GDP (current US$)',
# 'NY.GDP.MKTP.KD.ZG' : 'GDP growth (annual %)',
# 'NY.GDP.PCAP.CD' : 'GDP per capita (current US$)',
# 'NY.GDP.PCAP.KD.ZG' : 'GDP per capita growth (annual %)',
# 'FP.CPI.TOTL.ZG' : 'Inflation, consumer prices (annual %)',
# 'SL.UEM.TOTL.ZS' : 'Unemployment, total (% of total labor force)',
# 'GC.DOD.TOTL.GD.ZS' : 'Central government debt, total (% of GDP)',
# 'BX.KLT.DINV.WD.GD.ZS' : 'Foreign direct investment, net inflows (% of GDP)',
# 'BN.GSR.GNFS.CD' : 'Net trade in goods and services (BoP, current US$)',
# 'NE.EXP.GNFS.ZS': 'Exports of goods and services (% of GDP)',
# 'SL.TLF.CACT.ZS' : 'Labor force participation rate, total (% of total population ages 15+)',
# 'SI.DST.50MD' : 'Proportion of people living below 50 percent of median income (%)',
# 'MS.MIL.XPND.GD.ZS' : 'Military expenditure (% of GDP)',
# 'SE.XPD.TOTL.GB.ZS' : 'Government expenditure on education, total (% of government expenditure)',
# # social indicators
# 'SP.POP.TOTL' : 'Population, total',
# 'SP.URB.TOTL.IN.ZS' : 'Urban population (% of total population)',
# 'SP.POP.TOTL.FE.ZS' : 'Population, female (% of total population)',
# 'SL.EMP.SMGT.FE.ZS' : 'Female share of employment in senior and middle management (%)',
# 'SP.DYN.LE00.IN' : 'Life expectancy at birth, total (years)',
# 'VC.IHR.PSRC.P5' : 'Intentional homicides (per 100,000 people)',
# 'HD.HCI.OVRL' : 'Human Capital Index (HCI) (scale 0-1)',
# 'EG.ELC.ACCS.ZS' : 'Access to electricity (% of population)',
# 'IT.NET.USER.ZS' : 'Individuals using the Internet (% of population)',
# 'IT.NET.BBND.P2' : 'Fixed broadband subscriptions (per 100 people)',
# 'SH.STA.SMSS.ZS' : 'People using safely managed sanitation services (% of population)',
# 'SN.ITK.DEFC.ZS' : 'Prevalence of undernourishment (% of population)',
# # environmental  indicators
# 'EN.ATM.CO2E.PC' : 'CO2 emissions (metric tons per capita)',
# 'EG.ELC.RNWX.ZS' : 'Electricity production from renewable sources, excluding hydroelectric (% of total)',
# 'EG.USE.COMM.CL.ZS' : 'Alternative and nuclear energy (% of total energy use)',
# 'EG.USE.ELEC.KH.PC' : 'Electric power consumption (kWh per capita)',
# 'ER.H2O.FWTL.ZS' : 'Annual freshwater withdrawals, total (% of internal resources)',
# 'AG.LND.FRST.ZS' : 'Forest area (% of land area)'
# }


# sample queries to download data via API

#df = wb.data.DataFrame(['NY.GDP.PCAP.CD', 'SP.POP.TOTL'], ('USA', 'AZE', 'WLD'), mrv=30, numericTimeKeys=True)
#df = wb.data.DataFrame(['NY.GDP.PCAP.CD', 'SP.POP.TOTL'], ('USA', 'AZE', 'WLD'), time=range(2010,2020), index='time', numericTimeKeys=True, labels=True)
#df = wb.data.DataFrame(['NY.GDP.PCAP.CD', 'SP.POP.TOTL'], ('USA', 'AZE', 'WLD'), time=2020, labels=True)
#df = wb.data.DataFrame(['NY.GDP.PCAP.CD', 'SP.POP.TOTL'], wb.region.members('EUU', 'AFR'), time=(1999,2020), labels=True)

In [53]:
# Downloads data from World Bank API based on user inputs via prompt window. Then, shows the data either as bar chart (ranked or unranked) or as a line chart (ranked or unranked as well as actual or cumulative percent change).
# Schema for this code logic is here: https://miro.com/app/board/uXjVP6Rlhjg=/

# function for users to select data type by choosing either Snapshot (if user types '1') or Time series (if user types '2'). The function will return 1 or 2.
def data_type_func():
    """Prompt until the user picks a data type and return that choice.

    Returns:
        int: 1 for 'Snapshot' or 2 for 'Time series'.
    """
    data_type_dict = {1: 'Snapshot', 2: 'Time series'}
    while True:
        # input() stays outside the try block so that interrupting the kernel still stops the loop.
        raw_choice = input('What data type do you want to capture? Enter 1 for Snapshot or 2 for Time series')
        try:
            data_type_user_choice = int(raw_choice)
        except ValueError: # a non-numeric answer is the only expected user mistake
            data_type_user_choice = None
        if data_type_user_choice in data_type_dict:
            print('You selected '+ data_type_dict[data_type_user_choice])
            return data_type_user_choice
        print('You typed the wrong value. Try again.')

# function for users to enter the countries/regions that they want data for. The prompt differs depending on whether they selected Country (1) or Region (2). The function returns the entered country or region codes as a list.
def geo_list_func(country_or_region):
    """Collect country or region codes from the user until they enter 'q'.

    Args:
        country_or_region: 1 to prompt for ISO-3 country codes, 2 to prompt for
            geographical (region) codes.

    Returns:
        list[str]: the entered codes, upper-cased, in the order they were typed.

    Raises:
        ValueError: if country_or_region is neither 1 nor 2.
    """
    prompts = {
        1: 'Enter ISO-3 country code (e.g. USA). Press "q" to finish.',
        2: 'Enter geographical code (e.g. EUU). Press "q" to finish.',
    }
    if country_or_region not in prompts:
        raise ValueError('country_or_region must be 1 (Country) or 2 (Region), got {!r}'.format(country_or_region))
    geo_msg = prompts[country_or_region]
    geo_list = []
    while True:
        code = input(geo_msg).strip().upper()
        if code == 'Q': # accept both 'q' and 'Q' so the quit key is never stored as a code
            break
        if code: # ignore blank entries (e.g. Enter pressed by accident)
            geo_list.append(code)
    print('You selected ', geo_list)
    return geo_list

# function for users to select between Country (if user types '1') or Region (if user types '2'). The function will return 1 or 2.
def geo_func():
    """Prompt for the geography kind, then collect the matching codes.

    Returns:
        tuple: (geo_user_choice, codes) where geo_user_choice is 1 (Country) or
        2 (Region) and codes is the list returned by geo_list_func.
    """
    geo_dict = {1: 'Country', 2: 'Region'}
    while True:
        raw_choice = input('What geography do you want to capture? Enter 1 for Country or 2 for Region')
        # Only the number parsing is guarded: errors raised while collecting the codes
        # must surface instead of being reported as a wrong menu choice.
        try:
            geo_user_choice = int(raw_choice)
        except ValueError:
            geo_user_choice = None
        if geo_user_choice in geo_dict:
            print('You selected '+ geo_dict[geo_user_choice])
            geo_codes = geo_list_func(geo_user_choice) # asks the user for the country or region codes
            return geo_user_choice, geo_codes
        print('You typed the wrong value. Try again.')

# function for users to type year(s) depending on their earlier selection between Snapshot and Time series data type. The function will return year(s).
def _parse_four_char_year(text):
    """Return text as an int if it is a 4-character positive year, otherwise None."""
    if len(text) != 4:
        return None
    try:
        year = int(text)
    except ValueError:
        return None
    return year if year > 0 else None


def year_func(data_type_user_choice):
    """Prompt for the year (Snapshot) or the start and end years (Time series).

    Args:
        data_type_user_choice: 1 for Snapshot, 2 for Time series.

    Returns:
        int: the selected year when data_type_user_choice is 1.
        tuple[int, int]: (start_year, end_year) with start_year < end_year when
        data_type_user_choice is 2.

    Raises:
        ValueError: if data_type_user_choice is neither 1 nor 2 (re-prompting could
            never fix that, so looping would never end).
    """
    if data_type_user_choice not in (1, 2):
        raise ValueError('data_type_user_choice must be 1 (Snapshot) or 2 (Time series), got {!r}'.format(data_type_user_choice))
    while True:
        if data_type_user_choice == 1:
            year_user_choice = input('What year do you want to see the data for? For example, 2019.')
            if _parse_four_char_year(year_user_choice) is not None:
                print('You selected ' + year_user_choice)
                return int(year_user_choice)
        else:
            start_year_user_choice = input('What start year do you want to see the data for? For example, 2019.')
            end_year_user_choice = input('What end year do you want to see the data for? For example, 2019.')
            start_year = _parse_four_char_year(start_year_user_choice)
            end_year = _parse_four_char_year(end_year_user_choice)
            # both years must be valid and the range must not be empty or reversed
            if start_year is not None and end_year is not None and start_year < end_year:
                print('You selected ' + start_year_user_choice)
                print('You selected ' + end_year_user_choice)
                return (start_year, end_year)
        # reached whenever validation failed, so the user always learns why the prompt is repeated
        print('You typed the wrong value. Try again.')

# function for users to select if they'd like to see unadjusted (i.e. unranked) data (option 1) or ranked data (option 2). For ranked data, the higher the value, the better.
def ranked_or_unranked_func():
    """Prompt until the user picks unranked or ranked data and return that choice.

    Returns:
        int: 1 for 'Unranked' or 2 for 'Ranked'.
    """
    ranked_or_unranked_dict = {1: 'Unranked', 2: 'Ranked'}
    while True:
        # input() stays outside the try block so that interrupting the kernel still stops the loop.
        raw_choice = input('What data type do you want to capture? Enter 1 for Unranked or 2 for Ranked. For ranked data, the higher the rank value, the better.')
        try:
            ranked_or_unranked_user_choice = int(raw_choice)
        except ValueError: # a non-numeric answer is the only expected user mistake
            ranked_or_unranked_user_choice = None
        if ranked_or_unranked_user_choice in ranked_or_unranked_dict:
            print('You selected '+ ranked_or_unranked_dict[ranked_or_unranked_user_choice])
            return ranked_or_unranked_user_choice
        print('You typed the wrong value. Try again.')

# function for users to type the data field(s) that they want to download, together with a description of each field. Whether one field or several fields can be entered depends on the users' earlier selection between Snapshot (several fields) and Time series (one field) data type. The function returns the data field(s) as a list and the description(s) of those field(s) as a second list.
def data_fields_func(field_or_fields):
    """Collect indicator codes and their descriptions from the user.

    Args:
        field_or_fields: 1 to enter any number of fields (finished with 'q'),
            2 to enter exactly one field.

    Returns:
        tuple[list[str], list[str]]: (field_list, field_list_desc) - the upper-cased
        indicator codes and the matching descriptions, in entry order.

    Raises:
        ValueError: if field_or_fields is neither 1 nor 2.
    """
    field_list = []
    field_list_desc = []
    if field_or_fields == 1:
        fields_msg = 'Enter data field to download data for (e.g. EG.ELC.ACCS.ZS). Press "q" to finish.'
        while True:
            data_field = input(fields_msg).strip()
            if data_field.lower() == 'q':
                break
            if not data_field: # ignore blank entries instead of storing an empty indicator code
                continue
            field_list.append(data_field.upper())
            data_field_desc = input('Enter description for the data field that you have just entered (e.g. Access to electricity ( % of population)).')
            field_list_desc.append(data_field_desc)
    elif field_or_fields == 2:
        field_msg = 'Enter data field to download data for (e.g. EG.ELC.ACCS.ZS).'
        data_field = input(field_msg)
        field_list.append(data_field.upper())
        data_field_desc = input('Enter description for the data field that you have just entered (e.g. Access to electricity (% of population)).')
        field_list_desc.append(data_field_desc)
    else:
        raise ValueError('field_or_fields must be 1 (several fields) or 2 (one field), got {!r}'.format(field_or_fields))
    # Print the full description list: the last-entered description does not exist when the
    # user quits straight away, and it hides all but one description otherwise.
    print('You selected ', field_list, field_list_desc)
    return field_list, field_list_desc

# function to select the appropriate chart depending on whether the user previously selected Snapshot (in which case this function will automatically return 'Bar chart') or Time series data type (in which case the user will be given an option to either select 'Line chart' or 'Line chart: percent change'). The function will return 0, 1 or 2.
def chart_type_func(data_type_user_choice):
    """Pick the chart type that matches the selected data type.

    Args:
        data_type_user_choice: 1 for Snapshot, 2 for Time series.

    Returns:
        int: 0 ('Bar chart') for Snapshot data, without prompting; for Time series
        data the user's choice of 1 ('Line chart') or 2 ('Line chart: percent change').
        None is returned for any other data_type_user_choice.
    """
    chart_type_dict = {0: 'Bar chart', 1: 'Line chart', 2: 'Line chart: percent change'}
    if data_type_user_choice == 1:
        print('You selected '+ chart_type_dict[0])
        return 0 # returns '0' value which represents 'Bar chart'
    elif data_type_user_choice == 2:
        while True:
            # input() stays outside the try block so that interrupting the kernel still stops the loop.
            raw_choice = input("What chart do you want to create? Enter 1 for 'Line chart' or 2 for 'Line chart: percent change'")
            try:
                chart_type_user_choice = int(raw_choice)
            except ValueError: # a non-numeric answer is the only expected user mistake
                chart_type_user_choice = None
            if chart_type_user_choice in (1, 2): # 0 (bar chart) is not offered for time series
                print('You selected '+ chart_type_dict[chart_type_user_choice])
                return chart_type_user_choice
            print('You typed the wrong value. Try again.')

def change_sign_func(df):
    """Negate the columns for which a higher value is worse, so that ranking treats higher as better.

    The user is prompted for column names (data field descriptions) until they enter
    'q'. Names that are not columns of df are rejected with a message instead of
    raising KeyError after all input has been collected.

    Args:
        df: DataFrame whose columns are named by data field description.

    Returns:
        The same DataFrame object, with the selected columns negated in place.
    """
    ranked_msg = 'Enter description for the data field where higher value is bad (e.g. Inflation, consumer prices (annual %)). Press "q" to finish.'
    change_sign_field_list = []
    while True:
        change_sign_field = input(ranked_msg)
        if change_sign_field == 'q':
            break
        if change_sign_field not in df.columns:
            print('"' + change_sign_field + '" is not one of the available data fields: ' + ', '.join(map(str, df.columns)) + '. Try again.')
            continue
        if change_sign_field not in change_sign_field_list: # entering a field twice must not select it twice
            change_sign_field_list.append(change_sign_field)
    if change_sign_field_list: # nothing to negate when the user quits straight away
        df[change_sign_field_list] = -df[change_sign_field_list]
    return df

def benchmark_df(df):
    """Return a copy of df with an extra 'Benchmark' row holding the mean of every column.

    Args:
        df: DataFrame indexed by country (or region) name.

    Returns:
        A new DataFrame: the rows of df followed by one row labelled 'Benchmark'.
        The index name of df is kept; df itself is not modified.
    """
    bench_df = df.mean().to_frame().T # single-row frame with the column means
    bench_df.index = pd.Index(['Benchmark'], name=df.index.name) # label the row and keep the index name (e.g. 'Country')
    # DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions alike.
    return pd.concat([df, bench_df])

# function to download data and build the DataFrame that is charted later. For Snapshot data ('chart_type_choice' 0) the result has one row per country and one column per data field; for Time series data ('chart_type_choice' 1 or 2) the result is a long-format table with 'Country', 'Year' and 'Values' columns. The function needs 'chart_type_choice', 'data_fields_user_choice', 'geo_user_choice', 'year_user_choice' and 'ranked_or_unranked_user_choice' as input variables. The function returns the formatted 'df' DataFrame.
def _economies_for_geo_choice(geo_user_choice):
    """Return the economy codes to download for the user's geography selection."""
    geo_kind, geo_codes = geo_user_choice[0], geo_user_choice[1]
    if geo_kind == 1: # the user typed country codes directly
        return geo_codes
    if geo_kind == 2: # the user typed region codes: download the members of every region
        economies = set()
        for region in geo_codes:
            economies.update(wb.region.members(region))
        return sorted(economies)
    raise ValueError('geo_user_choice[0] must be 1 (Country) or 2 (Region), got {!r}'.format(geo_kind))


def df_func(chart_type_choice, data_fields_user_choice, geo_user_choice, year_user_choice, ranked_or_unranked_user_choice):
    """Download World Bank data and shape it for the selected chart type.

    Args:
        chart_type_choice: 0 for a bar chart (Snapshot data), 1 for a line chart,
            2 for a line chart of cumulative percent change (Time series data).
        data_fields_user_choice: pair (indicator codes, descriptions) of equally long lists.
        geo_user_choice: pair (kind, codes) where kind is 1 (Country) or 2 (Region).
            For regions, the members of all listed regions are downloaded together.
        year_user_choice: a single year for chart type 0, otherwise (start_year, end_year);
            both ends of the range are included.
        ranked_or_unranked_user_choice: 1 to keep raw values, 2 to replace them with ranks
            (higher rank is better).

    Returns:
        DataFrame. Chart type 0: indexed by 'Country' with one column per description,
        sorted descending by the first column. Chart types 1 and 2: long format with
        'Country', 'Year' and 'Values' columns. A 'Benchmark' row (column means) is
        added whenever ranked_or_unranked_user_choice is 1 or 2.

    Raises:
        ValueError: if chart_type_choice or geo_user_choice[0] is not a supported value.
    """
    data_fields_dict = dict(zip(data_fields_user_choice[0], data_fields_user_choice[1])) # indicator code -> description
    data_fields_dict = dict(sorted(data_fields_dict.items())) # sorts keys of the dictionary alphabetically. Without this, the descriptions assigned to 'df.columns' below would not line up with the downloaded columns, which come back in alphabetical order of indicator code
    indicators = list(data_fields_dict.keys())
    economies = _economies_for_geo_choice(geo_user_choice)
    if chart_type_choice == 0:
        df = wb.data.DataFrame(indicators, economies, time=year_user_choice, labels=True) # rows with missing values are kept on purpose (no '.dropna()')
        df.columns = ['Country'] + list(data_fields_dict.values()) # change columns names from using indicator codes to indicator names
        df = df.set_index(['Country']) # replaces index that has country codes with index using values from 'Country' column
        if ranked_or_unranked_user_choice == 2: # ranked data: the higher the rank value, the better
            df = change_sign_func(df) # flips the sign of fields where a higher value is bad, so that they rank correctly
            df = benchmark_df(df)
            df = df.rank(method='first') # ranks the countries (and the benchmark) for each data field
        elif ranked_or_unranked_user_choice == 1: # unranked data: only add the 'Benchmark' row with the mean of each field
            df = benchmark_df(df)
        df = df.sort_values(by=list(data_fields_dict.values())[0], ascending=False) # sorts by the first data field so that the bars appear in order on the chart
    elif chart_type_choice == 1 or chart_type_choice == 2:
        # Only one data field is downloaded for line charts, so change_sign_func is not used here.
        years = range(year_user_choice[0], year_user_choice[1] + 1) # '+ 1' because range() excludes its end and the user asked for the end year as well
        df = wb.data.DataFrame(indicators, economies, time=years, numericTimeKeys=True, labels=True).dropna()
        df = df.set_index(['Country']) # replaces index that has country codes with index using values from 'Country' column
        if ranked_or_unranked_user_choice == 2: # ranked data: the higher the rank value, the better
            df = benchmark_df(df)
            df = df.rank(method='first') # ranks the countries (and the benchmark) within each year
        elif ranked_or_unranked_user_choice == 1: # unranked data: only add the 'Benchmark' row with the mean of each year
            df = benchmark_df(df)
        if chart_type_choice == 2: # calculates cumulative percent change time series data
            df = df.T.pct_change() # transposes so that years are rows, then computes the year-on-year change for each country
            df = df[1:] # deletes first row with NaN values
            df = (1 + df).cumprod() - 1 # compounds the yearly changes into a cumulative change
            df = df.T # transposes back so that countries are rows again
        df = df.reset_index() # move 'Country' column from index to a separate column
        df = df.melt(id_vars=['Country'], value_vars=list(df.columns)[1:], var_name='Year', value_name='Values') # move year column names into rows under 'Year' column and move their values under 'Values' column
    else:
        raise ValueError('chart_type_choice must be 0, 1 or 2, got {!r}'.format(chart_type_choice))
    return df

# chart the data
def chart_func(df, chart_type_choice):
    """Draw the chart that matches chart_type_choice and display it.

    Args:
        df: for chart type 0, a DataFrame indexed by country name that contains a
            'Benchmark' row; for chart types 1 and 2, a DataFrame with 'Year',
            'Values' and 'Country' columns.
        chart_type_choice: 0 for a grouped bar chart, 1 or 2 for a line chart.
            Any other value draws nothing.

    Returns:
        None. The figure is shown via fig.show().

    Raises:
        ValueError: for chart type 0 when the index of df has no 'Benchmark' entry.
    """
    # layout settings shared by both chart kinds
    common_layout = dict(height=500, width=900, template='plotly_dark')

    if chart_type_choice == 0:
        # Grouped bars, ranked or unranked depending on how 'df' was prepared.
        # Dropping barmode='group' gives stacked bars instead.
        fig = px.bar(df, x=df.index, y=df.columns, barmode='group', **common_layout)
        # OPTIONAL: dashed vertical line at the position of the 'Benchmark' bar group
        benchmark_position = df.index.tolist().index('Benchmark')
        fig.add_vline(benchmark_position, line_width=3, line_dash='dash')
        # fig.update_layout(plot_bgcolor = 'white') # OPTIONAL: you can change the chart background colour to white
        fig.show()
    elif chart_type_choice in (1, 2):
        # The same line chart is used for actual values and for cumulative percent change.
        fig = px.line(df, x=df['Year'], y=df['Values'], color=df['Country'], **common_layout)
        fig.show()

# call the functions

# The calls below run in this fixed order because each prompt depends on the answers given before it.

# Snapshot (1) or Time series (2).
data_type_user_choice = data_type_func()
# A single year for Snapshot data, or a (start year, end year) tuple for Time series data.
year_user_choice = year_func(data_type_user_choice)
# Tuple (kind, codes): kind is 1 for Country or 2 for Region, codes is the list of country/region codes that the user entered.
geo_user_choice = geo_func()
# Unranked (1) or Ranked (2). For ranked data, the higher the value, the better.
ranked_or_unranked_user_choice = ranked_or_unranked_func()
# Tuple (field codes, field descriptions): several fields for Snapshot data, exactly one field for Time series data.
data_fields_user_choice = data_fields_func(data_type_user_choice)
# 0 ('Bar chart') is selected automatically for Snapshot data; for Time series data the user picks 1 ('Line chart') or 2 ('Line chart: percent change').
chart_type_choice = chart_type_func(data_type_user_choice)
# Downloads the data for the choices above, formats it (and ranks it if requested) and returns it as the 'df' DataFrame.
df = df_func(chart_type_choice, data_fields_user_choice, geo_user_choice, year_user_choice, ranked_or_unranked_user_choice)
# Draws and shows the chart for 'df'. chart_func has no return statement, so 'chart' is None.
chart = chart_func(df, chart_type_choice)

You selected Time series
You selected 1960
You selected 2021
You selected Region
You selected  ['EUU']
You selected Unranked
You selected  ['NY.GDP.MKTP.CD'] gdp
You selected Line chart: percent change


In [None]:
# TODO create geographic maps using this tutorial: https://www.youtube.com/watch?v=Oht6cf-Acl0

#### The code below is not needed. It's just to play around with data.

In [60]:
# Hard-coded inputs that stand in for the interactive prompts, so df_func can be tried without typing answers.
chart_type_choice = 2 # 0 for bar chart, 1 for line chart, 2 for line chart: percent change
# Alternative with several data fields (field codes, field descriptions):
#data_fields_user_choice = (['NY.GDP.MKTP.CD', 'NY.GDP.PCAP.CD', 'SL.UEM.TOTL.ZS'],['gdp', 'gdp per cap', 'unemp'])
data_fields_user_choice = (['NY.GDP.MKTP.CD'],['GDP'])
geo_user_choice = [2,['EUU', 'AFR']] # first entry: 1 for Country or 2 for Region; second entry: list of codes
# Alternative using country codes instead of regions:
#geo_user_choice = [1,['AZE','GBR','WLD']]
year_user_choice = (1980, 2020) # (start year, end year)
ranked_or_unranked_user_choice = 1 # 1 for Unranked, 2 for Ranked

def change_sign_func(df):
    """Negate the columns for which a higher value is worse, so that ranking treats higher as better.

    The user is prompted for column names (data field descriptions) until they enter
    'q'. Names that are not columns of df are rejected with a message instead of
    raising KeyError after all input has been collected.

    Args:
        df: DataFrame whose columns are named by data field description.

    Returns:
        The same DataFrame object, with the selected columns negated in place.
    """
    ranked_msg = 'Enter description for the data field where higher value is bad (e.g. Inflation, consumer prices (annual %)). Press "q" to finish.'
    change_sign_field_list = []
    while True:
        change_sign_field = input(ranked_msg)
        if change_sign_field == 'q':
            break
        if change_sign_field not in df.columns:
            print('"' + change_sign_field + '" is not one of the available data fields: ' + ', '.join(map(str, df.columns)) + '. Try again.')
            continue
        if change_sign_field not in change_sign_field_list: # entering a field twice must not select it twice
            change_sign_field_list.append(change_sign_field)
    if change_sign_field_list: # nothing to negate when the user quits straight away
        df[change_sign_field_list] = -df[change_sign_field_list]
    return df

def benchmark_df(df):
    """Return a copy of df with an extra 'Benchmark' row holding the mean of every column.

    Args:
        df: DataFrame indexed by country (or region) name.

    Returns:
        A new DataFrame: the rows of df followed by one row labelled 'Benchmark'.
        The index name of df is kept; df itself is not modified.
    """
    bench_df = df.mean().to_frame().T # single-row frame with the column means
    bench_df.index = pd.Index(['Benchmark'], name=df.index.name) # label the row and keep the index name (e.g. 'Country')
    # DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions alike.
    return pd.concat([df, bench_df])

def _economies_for_geo_choice(geo_user_choice):
    """Turn 'geo_user_choice' ([kind, codes]) into the economy codes to download."""
    geo_kind, geo_codes = geo_user_choice[0], geo_user_choice[1]
    if geo_kind == 1: # the codes are used for the download as they are
        return geo_codes
    if geo_kind == 2: # the codes are regions: pool the members of every region into one collection
        members = set()
        for region in geo_codes:
            members.update(wb.region.members(region))
        if not members:
            raise ValueError('No economies found for regions: {!r}'.format(geo_codes))
        return sorted(members) # sorted so that the request does not depend on set iteration order
    raise ValueError('geo_user_choice[0] must be 1 (countries) or 2 (regions), got {!r}'.format(geo_kind))


def df_func(chart_type_choice, data_fields_user_choice, geo_user_choice, year_user_choice, ranked_or_unranked_user_choice):
    """Download World Bank data with wbgapi and shape it for chart_func().

    Parameters:
        chart_type_choice: 0 = data for a bar chart, 1 = data for a line chart,
            2 = data for a line chart of cumulative percent change.
        data_fields_user_choice: (list of indicator codes, list of matching descriptions).
        geo_user_choice: [kind, codes]. With kind 1 the codes are passed to the download
            as they are; with kind 2 the codes are regions and the members of all listed
            regions are downloaded together.
        year_user_choice: for chart type 0 it is passed to wbgapi unchanged as 'time'.
            For chart types 1 and 2 it is (start year, end year) and is turned into
            range(start, end), so the end year itself is not requested.
        ranked_or_unranked_user_choice: 1 = add a 'Benchmark' row (column means, see
            benchmark_df), 2 = add the 'Benchmark' row and then replace the values with
            ranks. Any other value leaves the data without a 'Benchmark' row.

    Returns:
        Chart type 0: DataFrame indexed by 'Country' with one column per data field
        description, sorted in descending order by the first column.
        Chart types 1 and 2: long-format DataFrame with the columns 'Country', 'Year'
        and 'Values'.

    Raises:
        ValueError: if 'chart_type_choice' is not 0, 1 or 2, if 'geo_user_choice[0]' is
            not 1 or 2, or if the chosen regions have no members.
    """
    if chart_type_choice not in (0, 1, 2):
        raise ValueError('chart_type_choice must be 0, 1 or 2, got {!r}'.format(chart_type_choice))

    data_field_codes, data_field_names = data_fields_user_choice[0], data_fields_user_choice[1]
    data_fields_dict = {data_field_codes[i]: data_field_names[i] for i in range(len(data_field_codes))} # indicator code -> description
    data_fields_dict = dict(sorted(data_fields_dict.items())) # sorted by indicator code, because the columns of the downloaded DataFrame are renamed by position below and are expected to come in that order
    indicator_codes = list(data_fields_dict.keys())
    economies = _economies_for_geo_choice(geo_user_choice) # a single download covers all chosen countries/regions (previously only the last region in the list was kept)

    if chart_type_choice == 0:
        df = wb.data.DataFrame(indicator_codes, economies, time=year_user_choice, labels=True)
        df.columns = ['Country'] + list(data_fields_dict.values()) # change column names from indicator codes to indicator descriptions
        df = df.set_index(['Country']) # replaces the index of country codes with the values of the 'Country' column
        if ranked_or_unranked_user_choice == 2:
            df = change_sign_func(df) # flips the sign of user-named fields where a higher value is bad, so that a higher rank is always better
            df = benchmark_df(df) # the 'Benchmark' row is added before ranking, so it is ranked together with the countries
            df = df.rank(method='first')
        elif ranked_or_unranked_user_choice == 1:
            df = benchmark_df(df)
        return df.sort_values(by=list(data_fields_dict.values())[0], ascending=False) # sorted by the first data field, in descending order

    # chart_type_choice is 1 or 2: time series for a line chart. change_sign_func is not applied in this branch.
    # NOTE(review): range() excludes 'year_user_choice[1]' - confirm that the end year is meant to be left out.
    years = range(year_user_choice[0], year_user_choice[1])
    df = wb.data.DataFrame(indicator_codes, economies, time=years, numericTimeKeys=True, labels=True).dropna()
    df = df.set_index(['Country']) # replaces the index of country codes with the values of the 'Country' column
    if ranked_or_unranked_user_choice == 2:
        df = benchmark_df(df)
        df = df.rank(method='first')
    elif ranked_or_unranked_user_choice == 1:
        df = benchmark_df(df)
    if chart_type_choice == 2: # cumulative percent change over time
        df = df.T.pct_change() # transposed so that rows are years, then the change from one year to the next
        df = df[1:] # drops the first row, which has no previous year to compare with
        df = (1 + df).cumprod() - 1 # compounds the yearly changes into a cumulative change
        df = df.T # back to one row per country
    df = df.reset_index() # moves 'Country' from the index into a regular column
    df = df.melt(id_vars=['Country'], value_vars=list(df.columns)[1:], var_name='Year', value_name='Values') # wide to long: year columns go under 'Year' and their values under 'Values'
    return df

# chart the data
def chart_func(df, chart_type_choice):
    """Display 'df' as a plotly chart; 'chart_type_choice' selects the kind of chart.

    chart_type_choice 0: grouped bar chart with the index of 'df' on the x-axis, one bar
        per column, and a dashed vertical line at the position of the 'Benchmark' row
        (so 'df' must have 'Benchmark' in its index).
    chart_type_choice 1 or 2: line chart built from the 'Year', 'Values' and 'Country'
        columns of 'df'. The chart is built the same way for both values.
    Any other value: nothing is drawn.

    The figure is shown with fig.show(); the function returns None.
    """
    if chart_type_choice in (1, 2):
        line_fig = px.line(
            df,
            x=df['Year'],
            y=df['Values'],
            color=df['Country'],
            height=500,
            width=900,
            template='plotly_dark',
        )
        line_fig.show()
        return

    if chart_type_choice != 0:
        return

    # Without barmode='group' plotly stacks the columns instead of placing them side by side.
    bar_fig = px.bar(df, x=df.index, y=df.columns, barmode='group')
    benchmark_position = df.index.tolist().index('Benchmark') # position of the 'Benchmark' row on the x-axis
    bar_fig.add_vline(benchmark_position, line_width=3, line_dash='dash')
    # bar_fig.update_layout(plot_bgcolor = 'white') # OPTIONAL: white chart background
    bar_fig.show()


# Build the 'df' DataFrame for the choices set at the top of this cell: df_func() downloads the data, formats it and ranks it if requested.
df = df_func(chart_type_choice, data_fields_user_choice, geo_user_choice, year_user_choice, ranked_or_unranked_user_choice)
# Draw the chart that matches 'chart_type_choice'. chart_func() displays the figure itself and returns None, so 'chart' is None.
chart = chart_func(df, chart_type_choice)


In [323]:
# Grouped bar chart: one group of bars per country, one bar per data field.

# Reshape 'df' from wide to long: 'Country' moves from the index into a column, every
# column name of 'df' goes under 'data_field' and its values go under 'value'.
df_new = (
    df.reset_index()
      .melt(id_vars=['Country'], value_vars=list(df.columns), var_name='data_field', value_name='value')
)

fig = px.bar(df_new, x='Country', y='value', color='data_field', barmode='group')
fig.show()

In [228]:
# Rank the EU member countries and a median 'Benchmark' on four World Bank indicators for 2020.
data_fields_dict = {'NY.GDP.MKTP.CD' : 'GDP (current US$)',
                    'NY.GDP.PCAP.CD' : 'GDP per capita (current US$)',
                    'SL.UEM.TOTL.ZS' : 'Unemployment, total (% of total labor force)',
                    'FP.CPI.TOTL.ZG' : 'Inflation, consumer prices (annual %)'
                    }
data_fields_dict = dict(sorted(data_fields_dict.items())) # sorted by indicator code, because the columns of the downloaded DataFrame are renamed by position below

# download the data
df = wb.data.DataFrame(list(data_fields_dict.keys()), wb.region.members('EUU'), time=2020, labels=True).dropna()
df.columns = ['Country'] + list(data_fields_dict.values()) # change column names from indicator codes to indicator descriptions
df.set_index(['Country'], inplace=True) # replace the index of country codes with the values of the 'Country' column

# reformat the data to rank it (the higher the rank number the better)
# higher inflation and higher unemployment are bad, so their signs are flipped before ranking
df['Inflation, consumer prices (annual %)'] = -df['Inflation, consumer prices (annual %)']
df['Unemployment, total (% of total labor force)'] = -df['Unemployment, total (% of total labor force)']

median_df = df.median().to_frame().T # one-row DataFrame with the median value of each column of 'df'
median_df.index = pd.Index(['Benchmark'], name=df.index.name) # labels that row 'Benchmark' under the same index name as 'df'

# NOTE(review): this rebinds the name 'benchmark_df', which an earlier cell of this notebook defines as a function.
# After this cell has run, that cell must be run again before df_func() can call the function.
benchmark_df = pd.concat([df, median_df]) # 'df' plus the 'Benchmark' row ('DataFrame.append' was removed in pandas 2.0)
ranked_df = benchmark_df.rank(method='first') # rank of every country (and of the 'Benchmark') for each data field
# To see one country next to the benchmark, e.g. France:
# comparison_df = pd.concat([ranked_df[ranked_df.index == 'France'], ranked_df[ranked_df.index == 'Benchmark']])

In [277]:
# World Bank snapshot: several data fields/indicators for the members of one region in a single year (2020).
# The result is charted as a bar chart in the next cell.

countries = ['USA', 'GBR', 'CHN', 'BRA', 'IND', 'ZWE', 'AZE', 'WLD'] # country codes; not used by the download below, which goes by 'region'
region = 'EUU' # region whose members are downloaded

# indicator code -> description, sorted by indicator code because the downloaded columns are renamed by position below
data_fields_dict = dict(sorted({
    'EG.ELC.ACCS.ZS': 'Access to electricity (% of population)',
    'NY.GDP.PCAP.CD': 'GDP per capita (current US$)',
    'NY.GDP.MKTP.CD': 'GDP (current US$)',
    'FP.CPI.TOTL.ZG': 'Inflation, consumer prices (annual %)',
    'SP.POP.TOTL.FE.ZS': 'Population, female (% of total population)',
}.items()))

# rows with blank values are dropped
df = wb.data.DataFrame(list(data_fields_dict), wb.region.members(region), time=2020, labels=True).dropna()
df.columns = ['Country', *data_fields_dict.values()] # indicator codes -> indicator descriptions
df = df.set_index('Country') # country names replace the country codes as index
# Earlier experiments for a time series instead of a snapshot used mrv=30 or time=range(2010,2020) together with numericTimeKeys=True.
df

Unnamed: 0_level_0,Access to electricity (% of population),"Inflation, consumer prices (annual %)",GDP (current US$),GDP per capita (current US$),"Population, female (% of total population)"
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Italy,100.0,-0.137708,1892574000000.0,31834.972618,51.311884
Slovak Republic,100.0,1.936941,105172600000.0,19266.513574,51.306725
Portugal,100.0,-0.012438,228539200000.0,22194.566115,52.690275
Denmark,100.0,0.420712,356084900000.0,61063.31643,50.289018
Hungary,100.0,3.326744,156743100000.0,16075.973266,52.4004
Belgium,100.0,0.740792,521676900000.0,45189.3669,50.435502
Austria,100.0,1.381911,433258500000.0,48588.659385,50.705154
Croatia,100.0,0.154811,57203780000.0,14132.486561,51.786534
Greece,100.0,-1.247984,188835200000.0,17647.232688,50.918176
Czechia,100.0,3.161295,245339300000.0,22933.499591,50.765423


In [274]:
# Chart the data: bar chart of 'df' with its index on the x-axis and every column of 'df' as a y series.
# No 'barmode' is passed here, unlike the grouped-bar cells of this notebook.

fig = px.bar(df, x=df.index, y=df.columns)
fig.show()

In [239]:
# Reshape 'df' from wide to long: 'Country' moves from the index into a column, every
# column name of 'df' goes under 'data_field' and its values go under 'value'.
df_new = (
    df.reset_index()
      .melt(id_vars=['Country'], value_vars=list(df.columns), var_name='data_field', value_name='value')
)
df_new.head()

Unnamed: 0,Country,data_field,value
0,Italy,GDP (current US$),26.0
1,Slovak Republic,GDP (current US$),10.0
2,Portugal,GDP (current US$),13.0
3,Denmark,GDP (current US$),18.0
4,Hungary,GDP (current US$),11.0
