# Country Data:  
A simple project I worked on to practice cleaning data and merging several datasets into one table.

In [2]:
import pandas as pd
import numpy as np

In [1]:
def energyData():
    '''
    Return a dataframe with rank, energy and GDP data for the 15 top-ranked countries.

    Reads three files relative to the current working directory:

    assets/Energy Indicators.xls: data of countries' energy output and related data
    assets/world_bank.csv: data of nations' GDP and financial related data
    assets/scimagojr-3.xlsx: data of nations' rankings (needs 'Country' and 'Rank' columns)

    Returns:
        A DataFrame indexed by country name and sorted by ascending 'Rank'.
        It holds the first 15 rows (by rank) of the scimagojr data,
        left-joined with 'Energy Supply', 'Energy Supply per Capita',
        '% Renewable' and the GDP columns '2006' through '2015'. Countries
        missing from the energy or GDP data keep NaN in those columns.
    '''
    # --- Energy: load the sheet and turn it into a clean dataset ---
    value_cols = ['Energy Supply', 'Energy Supply per Capita', '% Renewable']

    energy = pd.read_excel('assets/Energy Indicators.xls')
    energy = energy.drop(['Unnamed: 0', 'Unnamed: 1'], axis=1)
    energy.columns = ['Country'] + value_cols
    # dropna() removes the incomplete rows; the first two surviving rows are
    # skipped as well (presumably leftover header rows - confirm against the
    # sheet). copy() makes the frame independent so the assignments below
    # act on it directly.
    energy = energy.dropna()[2:].copy()

    # Missing measurements are written as runs of dots ('...'). Convert the
    # value columns to numbers BEFORE scaling: multiplying the raw column
    # first would repeat every '...' string a million times. Non-numeric
    # cells become NaN.
    energy[value_cols] = energy[value_cols].apply(pd.to_numeric, errors='coerce')
    # Scale by one million (presumably petajoules -> gigajoules; confirm
    # against the sheet's units).
    energy['Energy Supply'] = energy['Energy Supply'] * 1_000_000

    # Normalise country names: drop "(...)" qualifiers and footnote digits,
    # then trim the whitespace left behind. The results are assigned back
    # instead of using chained inplace=True, which does not update the frame
    # under pandas Copy-on-Write.
    country = energy['Country'].str.replace(r'\([\w ]*\)', '', regex=True)
    country = country.str.replace(r'[0-9]+', '', regex=True)
    energy['Country'] = country.str.strip()

    energy = energy.set_index('Country')
    energy = energy.rename(index={
        "Republic of Korea": "South Korea",
        "United States of America": "United States",
        "United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
        "China, Hong Kong Special Administrative Region": "Hong Kong",
    })

    # --- GDP: load the CSV and turn it into a clean dataframe ---
    gdp = pd.read_csv('assets/world_bank.csv')[4:]
    gdp = gdp.replace({
        "Korea, Rep.": "South Korea",
        "Iran, Islamic Rep.": "Iran",
        "Hong Kong SAR, China": "Hong Kong",
    })
    # Relabel all 60 columns '1956'..'2015' so the yearly columns can be
    # addressed by year, then give the first four (descriptive) columns
    # their own names. Assumes the file has exactly 60 columns - a different
    # width makes this assignment raise.
    gdp.columns = [str(year) for year in range(1956, 2016)]
    gdp = gdp.rename(columns={
        '1956': 'Country',
        '1957': 'World Development Indicators',
        '1958': 'GDP at market prices (constant 2010 US$)',
        '1959': 'NY.GDP.MKTP.KD',
    })
    gdp = gdp.set_index('Country')

    # --- ScimEn: load the ranking sheet ---
    scim_en = pd.read_excel('assets/scimagojr-3.xlsx')
    scim_en = scim_en.set_index('Country')

    # --- Merge: rankings + energy, then the top 15 by rank + recent GDP ---
    ranked = pd.merge(scim_en, energy, how='left',
                      left_index=True, right_index=True)
    ranked = ranked.sort_values('Rank', ascending=True)

    gdp_years = [str(year) for year in range(2006, 2016)]
    result = pd.merge(ranked.head(15), gdp[gdp_years], how='left',
                      left_index=True, right_index=True)
    return result.sort_values('Rank')