In [217]:
%matplotlib inline

In [218]:
import pandas as pd
import numpy as np

# Importing the data from sources

## Inequality measures Deininger and Squire

Not sure whether the good quality criteria in Forbes deals with the variable Quality or Q. In order to be confident of our data, we impose the two conditions. 
This leads us to 679 points which is very close to the 682 points of the paper. 

We also add 6.6 we the data is based on expenditure and not on income (on the same cell to be sure that it is not done twice. 

In [219]:
DS = pd.read_excel("Deininger_and Squire.XLS", usecols=["Code", "Quality", "Year", "Gini", "Q", 'Inc'])
DS = DS.query("Q == 'good' and Quality == 'accept'")
DS.drop_duplicates(["Code", "Year"], inplace=True)
DS['Gini'].astype(float, inplace=True)

DS.loc[DS['Inc']=='E', 'Gini'] += 6.6

DS = DS[["Code", "Year", "Gini"]]
DS.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 679 entries, 0 to 2631
Data columns (total 3 columns):
Code    679 non-null object
Year    679 non-null int64
Gini    679 non-null object
dtypes: int64(1), object(2)
memory usage: 21.2+ KB


## Educational data : Barro-Lee

We take the data directly from the BarroLee website. Cf list of name variable name.

In [220]:
male_educ = pd.read_csv("male_attainment_25_BarroLee.csv", usecols=["year", "WBcode", "yr_sch_sec"])
female_educ = pd.read_csv("female_attainment_25_BarroLee.csv", usecols=["year", "WBcode", "yr_sch_sec"])

In [221]:
print male_educ.info()
print female_educ.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1898 entries, 0 to 1897
Data columns (total 3 columns):
year          1898 non-null int64
yr_sch_sec    1898 non-null float64
WBcode        1898 non-null object
dtypes: float64(1), int64(1), object(1)
memory usage: 59.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1898 entries, 0 to 1897
Data columns (total 3 columns):
year          1898 non-null int64
yr_sch_sec    1898 non-null float64
WBcode        1898 non-null object
dtypes: float64(1), int64(1), object(1)
memory usage: 59.3+ KB
None


## Income

We downloaded the data from data.worldbank. The name of the variable has changed from GNP to GNI we should investigate to decide whether or not the differences are big. 
We secretly hope that the data before 1995 hasn't been changed and consequently that the new name of the variable affects nothing for our study. 

In [222]:
inc = pd.read_csv("GNI_per_capita_WB.csv", skiprows=4)
del inc["Country Name"], inc['Indicator Code'], inc['Indicator Name']
inc.set_index("Country Code", inplace=True)
inc = inc.stack().reset_index()
inc.columns = ["code", "year", "GNI_PC"]
print inc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9131 entries, 0 to 9130
Data columns (total 3 columns):
code      9131 non-null object
year      9131 non-null object
GNI_PC    9131 non-null float64
dtypes: float64(1), object(2)
memory usage: 285.3+ KB
None


## Price Level of Investment

Extracted from the Penn World Table 5.6 (as in Forbes). The name of the variable is PI (cf. documentation column number [15]). 

We now convert the Country name to the corresponding code using our good old dictionary. 

In [223]:
PPPI = pd.read_excel("pwt56_forweb.xls", usecols=["Country", "Year", "PI"]).dropna(subset=["PI"])
country_dict = pd.read_csv("../data_source/country_code_list.csv")

country_dict.loc[:, 'country'] = country_dict['country'].apply(lambda x: x.upper())
country_dict.set_index('country', inplace=True)

PPPI.replace("CAPE VERDE IS.", "CABO VERDE", inplace=True)
PPPI.replace("CENTRAL AFR.R.", "CENTRAL AFRICAN REPUBLIC", inplace=True)
PPPI.replace("GUINEA-BISS", "GUINEA-BISSAU", inplace=True)

PPPI = PPPI.query("Country != 'REUNION'")

PPPI = PPPI.query("Country != 'ZAIRE'")

PPPI.replace("DOMINICAN REP.", "DOMINICAN REPUBLIC", inplace=True)
PPPI.replace("ST.KITTS&NEVIS", "SAINT KITTS AND NEVIS", inplace=True)
PPPI.replace("ST.LUCIA", "SAINT LUCIA", inplace=True)
PPPI.replace("TRINIDAD&TOBAGO", "TRINIDAD AND TOBAGO", inplace=True)
PPPI.replace("U.S.A.", "UNITED STATES OF AMERICA", inplace=True)
PPPI.replace("KOREA, REP.", "SOUTH KOREA", inplace=True)
PPPI.replace("SYRIA", "SYRIAN ARAB REPUBLIC", inplace=True)
PPPI.replace("UNITED ARAB E.", "UNITED ARAB EMIRATES", inplace=True)
PPPI.replace("YEMEN", "REPUBLIC OF YEMEN", inplace=True)

PPPI = PPPI.query("Country != 'GERMANY, EAST'")

PPPI.replace("GERMANY, WEST", "GERMANY", inplace=True)
PPPI.replace("U.K.", "UNITED KINGDOM", inplace=True)
PPPI.replace("U.S.S.R.", "RUSSIAN FEDERATION", inplace=True)
PPPI.replace("PAPUA N.GUINEA", "PAPUA NEW GUINEA", inplace=True)
PPPI.replace("SOLOMON IS.", "SOLOMON ISLANDS", inplace=True)
PPPI.replace("ST.VINCENT&GRE", "SAINT VINCENT AND THE GRENADINES", inplace=True)
PPPI.replace("WESTERN SAMOA", "SAMOA", inplace=True)


PPPI['code'] = PPPI['Country'].apply(lambda x: country_dict.loc[x])
PPPI = PPPI[['code', 'Year', 'PI']]

In [224]:
PPPI.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4950 entries, 10 to 6533
Data columns (total 3 columns):
code    4950 non-null object
Year    4950 non-null int64
PI      4950 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 154.7+ KB


# Gathering the data in one frame

The first step is to remove rename the columns in preparation of the merging of the databases. 
We also remove all the duplicates in order to avoid the problems during the merging. 

### Drop duplicates and normalize the data from different sources

In [225]:
# rename the columns (in order to merge)
DS.columns = ["code", "year", "gini"]
PPPI.columns = ["code", "year", "PPPI"]
male_educ.columns = ["year", "sch_male", "code"]
female_educ.columns = ["year", "sch_female", "code"]

# drop duplicates in order to avoid problems during the merging
print "Income duplicates :", inc.duplicated(subset=['code', 'year']).sum()
print "Inequality duplicates :", DS.duplicated(subset=['code', 'year']).sum()
print "PPPI duplicates :", PPPI.duplicated(subset=['code', 'year']).sum()
print "Male educ duplicates :", male_educ.duplicated(subset=['code', 'year']).sum()
print "Female Educ duplicates :", female_educ.duplicated(subset=['code', 'year']).sum()

# normalize the data types
DS.loc[:, 'year'] = DS['year'].astype(int)
DS.loc[:, 'code'] = DS['code'].astype(str)
DS.loc[:, 'gini'] = DS['gini'].astype(float)

PPPI.loc[:, 'year'] = PPPI['year'].astype(int)
PPPI.loc[:, 'code'] = PPPI['code'].astype(str)
PPPI.loc[:, 'PPPI'] = PPPI['PPPI'].astype(float)

male_educ.loc[:, 'year'] = male_educ['year'].astype(int)
male_educ.loc[:, 'code'] = male_educ['code'].astype(str)
male_educ.loc[:, 'sch_male'] = male_educ['sch_male'].astype(float)

female_educ.loc[:, 'year'] = female_educ['year'].astype(int)
female_educ.loc[:, 'code'] = female_educ['code'].astype(str)
female_educ.loc[:, 'sch_female'] = female_educ['sch_female'].astype(float)

inc.loc[:, 'year'] = inc['year'].astype(int)
inc.loc[:, 'code'] = inc['code'].astype(str)
inc.loc[:, 'GNI_PC'] = inc['GNI_PC'].astype(float)

Income duplicates : 0
Inequality duplicates : 0
PPPI duplicates : 0
Male educ duplicates : 0
Female Educ duplicates : 0


We now have to make sure of the concordance country codes between bases...

We found some problematic codes in the income database : codes that do not appear in our good old country code dictionary. We see that the corresponding countries are not countries or negligeable countries so we can just drop them from the database. 

In [226]:
problematic_codes = list(set(inc.query("code not in " + str(country_dict['code'].values.tolist()))['code']))
country_code = pd.read_csv("GNI_per_capita_WB.csv", skiprows=4, usecols=['Country Name', 'Country Code'])
print country_code[country_code['Country Code'].apply(lambda x: x in problematic_codes)]
inc = inc.query("code in " + str(country_dict['code'].values.tolist()))

                                       Country Name Country Code
5                                        Arab World          ARB
34                   Central Europe and the Baltics          CEB
36                                  Channel Islands          CHI
46                           Caribbean small states          CSS
58            East Asia & Pacific (developing only)          EAP
59          East Asia & Pacific (all income levels)          EAS
60          Europe & Central Asia (developing only)          ECA
61        Europe & Central Asia (all income levels)          ECS
64                                        Euro area          EMU
69                                   European Union          EUU
70         Fragile and conflict affected situations          FCS
90                                      High income          HIC
93           Heavily indebted poor countries (HIPC)          HPC
119     Latin America & Caribbean (developing only)          LAC
125   Latin America & Car

In Dieninger and Squire some codes are not used anymore in the new nomenclature so we choose to change them to the new nomenclature. 

In [227]:
problematic_codes = list(set(DS.query("code not in " + str(country_dict['code'].values.tolist()))['code']))
country_code = pd.read_excel("Deininger_and Squire.XLS", usecols=["Code", "Country"])
print country_code[country_code['Code'].apply(lambda x: x in problematic_codes)].drop_duplicates()

DS.replace("BRS", "BLR", inplace=True)
DS.replace("CSR", "CZE", inplace=True)
DS.replace("KYR", "KGZ", inplace=True)
DS.replace("LAT", "LVA", inplace=True)
DS.replace("LIT", "LTU", inplace=True)
DS.replace("MLD", "MDA", inplace=True)
DS.replace("ROM", "ROU", inplace=True)
DS.replace("SLO", "SVK", inplace=True)
DS.replace("SVA", "SVN", inplace=True)
DS.replace("SUN", "RUS", inplace=True)
DS.replace("OAN", "TWN", inplace=True)

           Country Code
162        Belarus  BRS
580      Czech Rep  CSR
1407   Kyrgyz Rep.  KYR
1409        Latvia  LAT
1415     Lithuania  LIT
1541       Moldova  MLD
1914       Romania  ROM
1956        Slovak  SLO
1957   Slovak Rep.  SLO
1966      Slovenia  SVA
1981  Soviet Union  SUN
2151        Taiwan  OAN


In the educationnal data from Barro and Lee some codes are not used anymore in the new nomenclature so we choose to change them to the new nomenclature. 

In [228]:
problematic_codes = list(set(male_educ.query("code not in " + str(country_dict['code'].values.tolist()))['code']))
country_code = pd.read_csv("male_attainment_25_BarroLee.csv", usecols=["WBcode", "country"])
print country_code[country_code['WBcode'].apply(lambda x: x in problematic_codes)].drop_duplicates()

male_educ.replace("ROM", "MDA", inplace=True)
male_educ.replace("SER", "SRB", inplace=True)

                  country WBcode
1833  Republic of Moldova    ROM
1846               Serbia    SER


In [229]:
problematic_codes = list(set(female_educ.query("code not in " + str(country_dict['code'].values.tolist()))['code']))
country_code = pd.read_csv("female_attainment_25_BarroLee.csv", usecols=["WBcode", "country"])
print country_code[country_code['WBcode'].apply(lambda x: x in problematic_codes)].drop_duplicates()

male_educ.replace("ROM", "MDA", inplace=True)
male_educ.replace("SER", "SRB", inplace=True)

                  country WBcode
1833  Republic of Moldova    ROM
1846               Serbia    SER


### Merging the data

In [249]:
data_frame = pd.merge(DS, PPPI, on=['code', 'year'],how='outer')
data_frame = pd.merge(data_frame, male_educ, on=['code', 'year'], how='outer')
data_frame = pd.merge(data_frame, female_educ, on=['code', 'year'], how='outer')
data_frame = pd.merge(data_frame, inc, on=['code', 'year'], how='outer')

### Taking the log of the GDP

In [250]:
data_frame["log(GNI_PC)"] = data_frame['GNI_PC'].apply(np.log)
del data_frame['GNI_PC']

### Resampling the data

Complicated task because it is done in a complex way in the paper. This particular point could be critized btw. 
One has to read the paper carefully to obtain information on how the resampling is done (note 8). The author choose not to use the mean of all values in the period but only the first values which is not the good way to do in my opinion but here we just redo the calculations. Because the data on inequality is sparse, she allows the values to be not only the first one but the closest from the first one in the considered period. 

In [251]:
data_frame.reset_index(inplace=True)
period = float(5)
tuples = zip(data_frame['code'],
             (((data_frame['year'].values - 1) // period) * period))
df_copy = data_frame.copy()
df_copy.index = pd.MultiIndex.from_tuples(tuples)
df_copy = df_copy.groupby(level=[0, 1]).last().dropna(how='all')
df_copy.index.names = ['code', 'year']

In [252]:
data_frame = df_copy
del df_copy
data_frame = data_frame[['gini', 'PPPI', 'sch_male', 'sch_female', 'log(GNI_PC)']]

In [253]:
data_frame.loc['DEU']

Unnamed: 0_level_0,gini,PPPI,sch_male,sch_female,log(GNI_PC)
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1945,,55.3,0.69,0.47,
1950,,57.31,0.79,0.5,
1955,,57.94,0.83,0.52,
1960,28.13,73.64,0.88,0.57,
1965,33.57,75.79,0.92,0.55,
1970,30.6163,98.88,1.04,0.65,8.8143
1975,32.0648,120.43,1.51,0.88,9.475
1980,32.195,67.51,2.03,1.12,9.1952
1985,,110.23,3.28,2.26,9.9683
1990,,117.63,4.47,3.19,10.223


### Computing the growth column

In [254]:
data_frame.reset_index(inplace=True)
data_frame.sort_values(['code', 'year'], inplace=True)
data_frame.set_index(['code'], inplace=True)

In [255]:
new_frame = pd.DataFrame()
for country in set(data_frame.index.values):
    sel = data_frame.loc[country]
    if sel.shape != (6,):
        sel['growth'] = (sel.shift(-1)['log(GNI_PC)'] - sel['log(GNI_PC)']) / (sel.shift(-1)['year'] - sel['year'])
        new_frame = pd.concat([new_frame, sel])

In [258]:
new_frame.loc['DEU']

Unnamed: 0_level_0,year,gini,PPPI,sch_male,sch_female,log(GNI_PC),growth
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
DEU,1945,,55.3,0.69,0.47,,
DEU,1950,,57.31,0.79,0.5,,
DEU,1955,,57.94,0.83,0.52,,
DEU,1960,28.13,73.64,0.88,0.57,,
DEU,1965,33.57,75.79,0.92,0.55,,
DEU,1970,30.6163,98.88,1.04,0.65,8.8143,0.1321
DEU,1975,32.0648,120.43,1.51,0.88,9.475,-0.056
DEU,1980,32.195,67.51,2.03,1.12,9.1952,0.1546
DEU,1985,,110.23,3.28,2.26,9.9683,0.0509
DEU,1990,,117.63,4.47,3.19,10.223,-0.0048


In [259]:
data_frame = new_frame.dropna(how='any')
del new_frame
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 261 entries, DZA to UGA
Data columns (total 7 columns):
year           261 non-null float64
gini           261 non-null float64
PPPI           261 non-null float64
sch_male       261 non-null float64
sch_female     261 non-null float64
log(GNI_PC)    261 non-null float64
growth         261 non-null float64
dtypes: float64(7)
memory usage: 16.3+ KB


### Removing countries that doesn't have at least 2 consecutives observations

In [260]:
new_frame = pd.DataFrame()
data_frame.reset_index(inplace=True)
for country in set(data_frame['code']):
    if len(data_frame.query("code == '" + str(country) + "'")) > 1:
        sel = data_frame.query("code == '" + str(country) + "'")
        sel = sel.loc[((sel.shift(-1).year - sel.year) == 5) | ((sel.year - sel.shift(1).year) == 5)]
        new_frame = pd.concat([new_frame, sel])
data_frame = new_frame
del new_frame

In [269]:
data_frame.sort_values(['code', 'year']).to_csv("forbes_dataset.csv", index=False)

# Comparing with the data presented in the paper

In [271]:
pd.set_option('precision', 4)
nb_per = len(set(data_frame.year))
resume = pd.DataFrame()
resume = pd.concat([resume, data_frame.groupby('year').mean()])
resume = pd.concat([resume, data_frame.groupby('year').std()])
resume = pd.concat([resume, data_frame.groupby('year').min()])
resume = pd.concat([resume, data_frame.groupby('year').max()])
del resume['code']
resume['categorize'] = (['mean'] * nb_per + ['std'] * nb_per + ['min'] * nb_per + ['max'] * nb_per) 
result = resume.reset_index().set_index(['categorize', 'year']).stack(level=0).unstack(level=0).swaplevel(0,1).sort_index()
print result[['mean', 'std', 'min', 'max']]

categorize           mean      std      min       max
            year                                     
PPPI        1960  80.2700  20.3394  53.2000  119.1600
            1965  67.5840  19.8137  41.1500  107.0600
            1970  85.5300  24.4405  36.4500  139.5800
            1975  96.6863  30.2491  35.3300  187.2600
            1980  66.8415  23.8130  31.8600  162.8500
            1985  81.4251  37.9283  27.9100  218.1700
            1990  74.6059  37.5481  15.4200  177.4000
gini        1960  40.1430   7.5214  31.6100   55.5000
            1965  41.5180   9.7666  25.1000   57.7000
            1970  40.9775   9.4214  23.3000   61.9400
            1975  39.1565   8.8489  24.9000   63.1800
            1980  38.5449   8.0704  23.4200   61.7600
            1985  40.8250   8.7590  24.5300   59.6000
            1990  40.5100   8.9360  26.1118   56.9100
growth      1960   0.0610   0.0268   0.0174    0.0939
            1965   0.1488   0.0846   0.0129    0.3184
            1970   0.1153   

In [272]:
print data_frame[['code', 'year', 'gini']].set_index(['code', 'year']).unstack(level=1)

       gini                                                      
year   1960     1965     1970     1975     1980     1985     1990
code                                                             
AUS     NaN      NaN      NaN  39.3300  37.5800  41.7200      NaN
BEL     NaN      NaN      NaN  28.2500  26.2221      NaN      NaN
BGD     NaN      NaN  36.0000  35.1700  36.0000  35.4500  34.8700
BGR     NaN      NaN      NaN      NaN  23.4200  24.5300  34.4200
BRA     NaN  57.6100  61.9400  57.7800  61.7600  59.6000      NaN
CAN   31.61  32.3000  31.6200  31.0000  32.8100  27.5600  27.6500
CHL     NaN  45.6400  46.0000  53.2100      NaN  57.8800  56.4900
CHN     NaN      NaN      NaN  32.0000  31.4000  34.6000  37.8000
CIV     NaN      NaN      NaN      NaN  47.8100  43.4900      NaN
COL     NaN  52.0200  46.0000  54.5000      NaN  51.2000  51.3200
CRI     NaN      NaN  44.4000  45.0000  47.0000  46.0700      NaN
DEU     NaN      NaN  30.6163  32.0648  32.1950      NaN      NaN
DNK     Na