In [50]:
%matplotlib inline

In [51]:
import pandas as pd
import numpy as np

# Importing the data from sources

## Inequality measures Deininger and Squire

Not sure whether the good quality criteria in Forbes deals with the variable Quality or Q. In order to be confident of our data, we impose the two conditions. 
This leads us to 679 points which is very close to the 682 points of the paper. 

We also add 6.6 we the data is based on expenditure and not on income (on the same cell to be sure that it is not done twice. 

In [52]:
DS = pd.read_excel("Deininger_and Squire.XLS", usecols=["Code", "Quality", "Year", "Gini", "Q", 'Inc'])
DS = DS.query("Q == 'good' and Quality == 'accept'")
DS.drop_duplicates(["Code", "Year"], inplace=True)
DS['Gini'].astype(float, inplace=True)

DS.loc[DS['Inc']=='E', 'Gini'] += 6.6

DS = DS[["Code", "Year", "Gini"]]
DS.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 679 entries, 0 to 2631
Data columns (total 3 columns):
Code    679 non-null object
Year    679 non-null int64
Gini    679 non-null object
dtypes: int64(1), object(2)
memory usage: 21.2+ KB


## Educational data : Barro-Lee

We take the data directly from the BarroLee website. Cf list of name variable name.

In [53]:
male_educ = pd.read_csv("male_attainment_25_BarroLee.csv", usecols=["year", "WBcode", "yr_sch_sec"])
female_educ = pd.read_csv("female_attainment_25_BarroLee.csv", usecols=["year", "WBcode", "yr_sch_sec"])

In [54]:
print male_educ.info()
print female_educ.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1898 entries, 0 to 1897
Data columns (total 3 columns):
year          1898 non-null int64
yr_sch_sec    1898 non-null float64
WBcode        1898 non-null object
dtypes: float64(1), int64(1), object(1)
memory usage: 59.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1898 entries, 0 to 1897
Data columns (total 3 columns):
year          1898 non-null int64
yr_sch_sec    1898 non-null float64
WBcode        1898 non-null object
dtypes: float64(1), int64(1), object(1)
memory usage: 59.3+ KB
None


## Income

We downloaded the data from data.worldbank. The name of the variable has changed from GNP to GNI we should investigate to decide whether or not the differences are big. 
We secretly hope that the data before 1995 hasn't been changed and consequently that the new name of the variable affects nothing for our study. 

In [55]:
inc = pd.read_csv("GNI_per_capita_WB.csv", skiprows=4)
del inc["Country Name"], inc['Indicator Code'], inc['Indicator Name']
inc.set_index("Country Code", inplace=True)
inc = inc.stack().reset_index()
inc.columns = ["code", "year", "GNI_PC"]
print inc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9131 entries, 0 to 9130
Data columns (total 3 columns):
code      9131 non-null object
year      9131 non-null object
GNI_PC    9131 non-null float64
dtypes: float64(1), object(2)
memory usage: 285.3+ KB
None


## Price Level of Investment

Extracted from the Penn World Table 5.6 (as in Forbes). The name of the variable is PI (cf. documentation column number [15]). 

We now convert the Country name to the corresponding code using our good old dictionary. 

In [56]:
PPPI = pd.read_excel("pwt56_forweb.xls", usecols=["Country", "Year", "PI"]).dropna(subset=["PI"])
country_dict = pd.read_csv("../data_source/country_code_list.csv")

country_dict.loc[:, 'country'] = country_dict['country'].apply(lambda x: x.upper())
country_dict.set_index('country', inplace=True)

PPPI.replace("CAPE VERDE IS.", "CABO VERDE", inplace=True)
PPPI.replace("CENTRAL AFR.R.", "CENTRAL AFRICAN REPUBLIC", inplace=True)
PPPI.replace("GUINEA-BISS", "GUINEA-BISSAU", inplace=True)

PPPI = PPPI.query("Country != 'REUNION'")

PPPI = PPPI.query("Country != 'ZAIRE'")

PPPI.replace("DOMINICAN REP.", "DOMINICAN REPUBLIC", inplace=True)
PPPI.replace("ST.KITTS&NEVIS", "SAINT KITTS AND NEVIS", inplace=True)
PPPI.replace("ST.LUCIA", "SAINT LUCIA", inplace=True)
PPPI.replace("TRINIDAD&TOBAGO", "TRINIDAD AND TOBAGO", inplace=True)
PPPI.replace("U.S.A.", "UNITED STATES OF AMERICA", inplace=True)
PPPI.replace("KOREA, REP.", "SOUTH KOREA", inplace=True)
PPPI.replace("SYRIA", "SYRIAN ARAB REPUBLIC", inplace=True)
PPPI.replace("UNITED ARAB E.", "UNITED ARAB EMIRATES", inplace=True)
PPPI.replace("YEMEN", "REPUBLIC OF YEMEN", inplace=True)

PPPI = PPPI.query("Country != 'GERMANY, EAST'")

PPPI.replace("GERMANY, WEST", "GERMANY", inplace=True)
PPPI.replace("U.K.", "UNITED KINGDOM", inplace=True)
PPPI.replace("U.S.S.R.", "RUSSIAN FEDERATION", inplace=True)
PPPI.replace("PAPUA N.GUINEA", "PAPUA NEW GUINEA", inplace=True)
PPPI.replace("SOLOMON IS.", "SOLOMON ISLANDS", inplace=True)
PPPI.replace("ST.VINCENT&GRE", "SAINT VINCENT AND THE GRENADINES", inplace=True)
PPPI.replace("WESTERN SAMOA", "SAMOA", inplace=True)


PPPI['code'] = PPPI['Country'].apply(lambda x: country_dict.loc[x])
PPPI = PPPI[['code', 'Year', 'PI']]

In [57]:
PPPI.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4950 entries, 10 to 6533
Data columns (total 3 columns):
code    4950 non-null object
Year    4950 non-null int64
PI      4950 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 154.7+ KB


# Gathering the data in one frame

The first step is to remove rename the columns in preparation of the merging of the databases. 
We also remove all the duplicates in order to avoid the problems during the merging. 

### Drop duplicates and normalize the data from different sources

In [58]:
# rename the columns (in order to merge)
DS.columns = ["code", "year", "gini"]
PPPI.columns = ["code", "year", "PPPI"]
male_educ.columns = ["year", "sch_male", "code"]
female_educ.columns = ["year", "sch_female", "code"]

# drop duplicates in order to avoid problems during the merging
print "Income duplicates :", inc.duplicated(subset=['code', 'year']).sum()
print "Inequality duplicates :", DS.duplicated(subset=['code', 'year']).sum()
print "PPPI duplicates :", PPPI.duplicated(subset=['code', 'year']).sum()
print "Male educ duplicates :", male_educ.duplicated(subset=['code', 'year']).sum()
print "Female Educ duplicates :", female_educ.duplicated(subset=['code', 'year']).sum()

# normalize the data types
DS.loc[:, 'year'] = DS['year'].astype(int)
DS.loc[:, 'code'] = DS['code'].astype(str)
DS.loc[:, 'gini'] = DS['gini'].astype(float)

PPPI.loc[:, 'year'] = PPPI['year'].astype(int)
PPPI.loc[:, 'code'] = PPPI['code'].astype(str)
PPPI.loc[:, 'PPPI'] = PPPI['PPPI'].astype(float)

male_educ.loc[:, 'year'] = male_educ['year'].astype(int)
male_educ.loc[:, 'code'] = male_educ['code'].astype(str)
male_educ.loc[:, 'sch_male'] = male_educ['sch_male'].astype(float)

female_educ.loc[:, 'year'] = female_educ['year'].astype(int)
female_educ.loc[:, 'code'] = female_educ['code'].astype(str)
female_educ.loc[:, 'sch_female'] = female_educ['sch_female'].astype(float)

inc.loc[:, 'year'] = inc['year'].astype(int)
inc.loc[:, 'code'] = inc['code'].astype(str)
inc.loc[:, 'GNI_PC'] = inc['GNI_PC'].astype(float)

Income duplicates : 0
Inequality duplicates : 0
PPPI duplicates : 0
Male educ duplicates : 0
Female Educ duplicates : 0


We now have to make sure of the concordance country codes between bases...

We found some problematic codes in the income database : codes that do not appear in our good old country code dictionary. We see that the corresponding countries are not countries or negligeable countries so we can just drop them from the database. 

In [59]:
problematic_codes = list(set(inc.query("code not in " + str(country_dict['code'].values.tolist()))['code']))
country_code = pd.read_csv("GNI_per_capita_WB.csv", skiprows=4, usecols=['Country Name', 'Country Code'])
print country_code[country_code['Country Code'].apply(lambda x: x in problematic_codes)]
inc = inc.query("code in " + str(country_dict['code'].values.tolist()))

                                       Country Name Country Code
5                                        Arab World          ARB
34                   Central Europe and the Baltics          CEB
36                                  Channel Islands          CHI
46                           Caribbean small states          CSS
58            East Asia & Pacific (developing only)          EAP
59          East Asia & Pacific (all income levels)          EAS
60          Europe & Central Asia (developing only)          ECA
61        Europe & Central Asia (all income levels)          ECS
64                                        Euro area          EMU
69                                   European Union          EUU
70         Fragile and conflict affected situations          FCS
90                                      High income          HIC
93           Heavily indebted poor countries (HIPC)          HPC
119     Latin America & Caribbean (developing only)          LAC
125   Latin America & Car

In Dieninger and Squire some codes are not used anymore in the new nomenclature so we choose to change them to the new nomenclature. 

In [60]:
problematic_codes = list(set(DS.query("code not in " + str(country_dict['code'].values.tolist()))['code']))
country_code = pd.read_excel("Deininger_and Squire.XLS", usecols=["Code", "Country"])
print country_code[country_code['Code'].apply(lambda x: x in problematic_codes)].drop_duplicates()

DS.replace("BRS", "BLR", inplace=True)
DS.replace("CSR", "CZE", inplace=True)
DS.replace("KYR", "KGZ", inplace=True)
DS.replace("LAT", "LVA", inplace=True)
DS.replace("LIT", "LTU", inplace=True)
DS.replace("MLD", "MDA", inplace=True)
DS.replace("ROM", "ROU", inplace=True)
DS.replace("SLO", "SVK", inplace=True)
DS.replace("SVA", "SVN", inplace=True)
DS.replace("SUN", "RUS", inplace=True)
DS.replace("OAN", "TWN", inplace=True)

           Country Code
162        Belarus  BRS
580      Czech Rep  CSR
1407   Kyrgyz Rep.  KYR
1409        Latvia  LAT
1415     Lithuania  LIT
1541       Moldova  MLD
1914       Romania  ROM
1956        Slovak  SLO
1957   Slovak Rep.  SLO
1966      Slovenia  SVA
1981  Soviet Union  SUN
2151        Taiwan  OAN


In the educationnal data from Barro and Lee some codes are not used anymore in the new nomenclature so we choose to change them to the new nomenclature. 

In [61]:
problematic_codes = list(set(male_educ.query("code not in " + str(country_dict['code'].values.tolist()))['code']))
country_code = pd.read_csv("male_attainment_25_BarroLee.csv", usecols=["WBcode", "country"])
print country_code[country_code['WBcode'].apply(lambda x: x in problematic_codes)].drop_duplicates()

male_educ.replace("ROM", "MDA", inplace=True)
male_educ.replace("SER", "SRB", inplace=True)

                  country WBcode
1833  Republic of Moldova    ROM
1846               Serbia    SER


In [62]:
problematic_codes = list(set(female_educ.query("code not in " + str(country_dict['code'].values.tolist()))['code']))
country_code = pd.read_csv("female_attainment_25_BarroLee.csv", usecols=["WBcode", "country"])
print country_code[country_code['WBcode'].apply(lambda x: x in problematic_codes)].drop_duplicates()

male_educ.replace("ROM", "MDA", inplace=True)
male_educ.replace("SER", "SRB", inplace=True)

                  country WBcode
1833  Republic of Moldova    ROM
1846               Serbia    SER


### Merging the data

In [63]:
data_frame = pd.merge(DS, PPPI, on=['code', 'year'],how='outer')
data_frame = pd.merge(data_frame, male_educ, on=['code', 'year'], how='outer')
data_frame = pd.merge(data_frame, female_educ, on=['code', 'year'], how='outer')
data_frame = pd.merge(data_frame, inc, on=['code', 'year'], how='outer')

### Computing the growth column

In [64]:
data_frame.sort_values(['code', 'year'], inplace=True)
data_frame.set_index(['code'], inplace=True)

In [65]:
new_frame = pd.DataFrame()
for country in set(data_frame.index.values):
    sel = data_frame.loc[country]
    sel['growth'] = (sel.shift(-1)['GNI_PC'] - sel['GNI_PC']) / (sel.shift(-1)['year'] - sel['year'])
    new_frame = pd.concat([new_frame, sel])

In [66]:
data_frame = new_frame
del new_frame
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10011 entries, AGO to CUB
Data columns (total 7 columns):
year          10011 non-null float64
gini          679 non-null float64
PPPI          4950 non-null float64
sch_male      1898 non-null float64
sch_female    1898 non-null float64
GNI_PC        7690 non-null float64
growth        7481 non-null float64
dtypes: float64(7)
memory usage: 625.7+ KB


### Taking the log of the GDP

In [67]:
data_frame["log(GNI_PC)"] = data_frame['GNI_PC'].apply(np.log)
del data_frame['GNI_PC']

### Resampling the data

Complicated task because it is done in a complex way in the paper. This particular point could be critized btw. 
One has to read the paper carefully to obtain information on how the resampling is done (note 8). The author choose not to use the mean of all values in the period but only the first values which is not the good way to do in my opinion but here we just redo the calculations. Because the data on inequality is sparse, she allows the values to be not only the first one but the closest from the first one in the considered period. 

In [68]:
data_frame.reset_index(inplace=True)
period = float(5)
tuples = zip(data_frame['code'],
             (((data_frame['year'].values) // period) * period))
df_copy = data_frame.copy()
df_copy.index = pd.MultiIndex.from_tuples(tuples)
df_copy = df_copy.groupby(level=[0, 1]).first().dropna(how='all')
df_copy.index.names = ['code', 'year']

In [69]:
data_frame = df_copy.dropna(how='any')
del df_copy
data_frame = data_frame[['gini', 'PPPI', 'sch_male', 'sch_female', 'growth', 'log(GNI_PC)']]

### Removing countries that doesn't have at least 2 consecutives observations

In [71]:
new_frame = pd.DataFrame()
data_frame.reset_index(inplace=True)
for country in set(data_frame['code']):
    if len(data_frame.query("code == '" + str(country) + "'")) > 1:
        sel = data_frame.query("code == '" + str(country) + "'")
        if 0 in (sel.shift(-1).index.values - sel.index.values):
            new_frame = pd.concat([new_frame, sel])
data_frame = new_frame
del new_frame

In [72]:
data_frame.sort_values(['code', 'year']).to_csv("forbes_dataset.csv", index=False)

# Comparing with the data presented in the paper

In [73]:
pd.set_option('precision', 2)
resume = pd.DataFrame()
resume = pd.concat([resume, data_frame.groupby('year').mean()])
resume = pd.concat([resume, data_frame.groupby('year').std()])
resume = pd.concat([resume, data_frame.groupby('year').min()])
resume = pd.concat([resume, data_frame.groupby('year').max()])
del resume['code']
resume['categorize'] = (['mean'] * 7 + ['std'] * 7 + ['min'] * 7 + ['max'] * 7) 
result = resume.reset_index().set_index(['categorize', 'year']).stack(level=0).unstack(level=0).swaplevel(0,1).sort_index()
print result[['mean', 'std', 'min', 'max']]

categorize          mean     std      min      max
            year                                  
PPPI        1960   78.75   26.06    44.15   123.74
            1965   80.60   48.19    31.96   274.03
            1970   67.90   18.28    39.96   112.43
            1975   94.62   34.32    50.29   253.78
            1980   96.09   27.85    44.43   140.68
            1985   69.90   39.16    31.79   288.12
            1990   80.54   39.30    27.91   218.17
gini        1960   42.85    8.26    30.80    55.50
            1965   41.86    9.28    31.61    61.88
            1970   41.54    8.19    25.10    57.61
            1975   40.68    9.18    23.30    60.29
            1980   39.41    7.86    24.90    57.78
            1985   41.01    9.44    23.42    61.76
            1990   40.74    8.82    24.53    56.91
growth      1960   39.23   54.69   -30.00   140.00
            1965   89.09   93.09     0.00   310.00
            1970  160.61  214.46   -20.00  1070.00
            1975  282.37  302.1

In [78]:
print data_frame[['code', 'year', 'gini']].set_index(['code', 'year']).unstack(level=1)

       gini                                          
year   1960   1965   1970   1975   1980   1985   1990
code                                                 
AUS     NaN  32.02    NaN  34.33  39.96  37.58  41.72
BEL     NaN    NaN    NaN  28.25    NaN  26.22  26.92
BGD     NaN    NaN  36.00  33.34  39.00  37.00  34.87
BGR     NaN    NaN    NaN    NaN  25.01  23.42  24.53
BRA   53.00    NaN  57.61  60.29  57.78  61.76    NaN
CAN   30.80  31.61  32.24  31.62  31.80  32.81  27.56
CHL     NaN  45.64  46.00    NaN  53.21  57.88  56.49
CHN     NaN    NaN    NaN    NaN  32.00  31.40  34.60
COL     NaN    NaN  52.02  54.50    NaN  51.20  51.32
CRI   50.00    NaN  44.40  50.00  47.49  42.00    NaN
DEU     NaN    NaN  30.62  32.06  30.59    NaN    NaN
DNK     NaN    NaN    NaN  31.00  30.99  33.15  33.20
DOM     NaN    NaN    NaN  45.00  43.29    NaN  49.00
EGY     NaN  46.60    NaN  44.60    NaN    NaN  38.60
ESP     NaN  31.99  37.11    NaN  33.39  31.79    NaN
FIN     NaN  31.80  27.00  3