In [31]:
%matplotlib inline

In [32]:
import pandas as pd
import numpy as np
# Execute the project's shared helper script in this notebook's namespace.
# The helpers called further down but not defined in this notebook (growth,
# quartilize_dataset, resample, quartilize_periodwise) presumably come from
# it -- TODO confirm against ../toolbox.py.
# NOTE: execfile is a Python 2 builtin, so this notebook needs a Python 2 kernel.
execfile('../toolbox.py')
# Show floats with 2 digits of precision in pandas output.
pd.set_option("display.precision", 2)

### Importing the data and selecting variables

In [42]:
# Source columns to load, next to the 'code' (country) and 'year' keys.
variables = ["gini_net_SWIID", "years_schooling", "GDP_PC_WB", "natural_ressources_WB"]

# Short analysis names, keyed by source column name. Renaming by name (instead
# of assigning a positional list to data.columns) stays correct even if the
# column order in the CSV changes: read_csv(usecols=...) returns columns in
# file order, not in the order of the usecols list.
short_names = {
    "GDP_PC_WB": "gdp",
    "natural_ressources_WB": "nat_re",
    "years_schooling": "schl",
    "gini_net_SWIID": "gini",
    "growth": "g",
}
column_order = ["gdp", "nat_re", "schl", "gini", "g"]

raw = pd.read_csv("../data_source/all_data.csv", usecols=variables + ['code', 'year'])
raw = raw.sort_values(['code', 'year'])

# Interpolate years of schooling to yearly data, one country at a time so that
# values never leak from one country into the next.
country_frames = []
for country, sel in raw.groupby('code', sort=False):
    # Countries with a single observation are left out, as before.
    if len(sel) < 2:
        continue
    # Work on a copy: writing into a slice of `raw` is a SettingWithCopy hazard.
    sel = sel.copy()
    schooling = sel['years_schooling']
    # Nothing to interpolate when the country has no schooling data at all
    # (some pandas versions raise TypeError on an all-NaN series).
    if schooling.notnull().any():
        try:
            sel['years_schooling'] = schooling.interpolate()
        except TypeError as err:
            # Keep the country un-interpolated, but do not hide the problem.
            print("years_schooling not interpolated for %s: %s" % (country, err))
    country_frames.append(sel)

# Concatenate once instead of growing a frame inside the loop.
data = (pd.concat(country_frames)
        .set_index(['code', 'year'])
        .sort_index(level=[0, 1])
        .dropna(how='all'))

# Economic shock dummy (currently disabled)
# data['shock'] = 0
# data.loc[(data.index.levels[0].tolist(), 1974), 'shock'] = 1
# data.loc[(data.index.levels[0].tolist(), 1981), 'shock'] = 1
# data.loc[(data.index.levels[0].tolist(), 1982), 'shock'] = 1
# data.loc[(data.index.levels[0].tolist(), 2008), 'shock'] = 1
# data.loc[(data.index.levels[0].tolist(), 2009), 'shock'] = 1
print(data.keys())

# computing growth (must happen before the rename: it reads 'GDP_PC_WB')
data['growth'] = growth(data, 'GDP_PC_WB', how='past', as_rate=True)

# renaming the columns by name and fixing their order for the cells below
data = data.rename(columns=short_names)[column_order]

# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than printed.
data.info()
# Count the country codes actually present in the index; index.levels can keep
# codes whose rows were all removed by dropna.
print("Number of countries  %d" % len(data.index.get_level_values(0).unique()))

Index([u'GDP_PC_WB', u'natural_ressources_WB', u'years_schooling',
       u'gini_net_SWIID'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 11564 entries, (ABW, 1994.0) to (ZWE, 2014.0)
Data columns (total 5 columns):
gdp       8711 non-null float64
nat_re    6757 non-null float64
schl      9437 non-null float64
gini      4611 non-null float64
g         8499 non-null float64
dtypes: float64(5)
memory usage: 542.1+ KB
None
Number of countries  215


## One point per country

In [43]:
# Keep only the countries that have at least MIN_OBS non-missing yearly values
# in *every* variable, then average each retained country over all its years.
MIN_OBS = 10
countries = (data.groupby(level=0).count() >= MIN_OBS).all(axis=1)
kept_codes = countries[countries].index
# Select the rows with a boolean mask on the 'code' index level instead of
# pasting a Python list repr into a query() expression string.
in_sample = data.index.get_level_values('code').isin(kept_codes)
mean_data = data[in_sample].groupby(level=0).mean()
mean_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 122 entries, ALB to ZWE
Data columns (total 5 columns):
gdp       122 non-null float64
nat_re    122 non-null float64
schl      122 non-null float64
gini      122 non-null float64
g         122 non-null float64
dtypes: float64(5)
memory usage: 5.7+ KB


In [44]:
# Build the QCA table from the per-country means and write it to CSV.
# quartilize_dataset is defined outside this notebook; the loop below relies on
# it returning a 'qu_<name>' column for every variable.
QCA_data = quartilize_dataset(mean_data, 2)
for name in data.keys():
    quantile_col = 'qu_' + name
    # Overwrite the variable with its 'qu_' counterpart, then drop the helper column.
    QCA_data.loc[:, name] = QCA_data[quantile_col]
    del QCA_data[quantile_col]
QCA_data.to_csv("QCA_all_years.csv")

# 10-year periods

In [45]:
# Resample the yearly panel with the external resample helper and keep only the
# rows that are complete in every variable.
# NOTE(review): 10 and 5 are presumably the period length and the step in
# years -- confirm against resample's definition.
resampled = resample(data, 10, 5)
data_ten_y = resampled.dropna()
data_ten_y.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 325 entries, (ALB, 2005.0) to (ZWE, 2005.0)
Data columns (total 5 columns):
gdp       325 non-null float64
nat_re    325 non-null float64
schl      325 non-null float64
gini      325 non-null float64
g         325 non-null float64
dtypes: float64(5)
memory usage: 15.2+ KB


## Median

In [46]:
# Report the median of every variable over all ten-year observations.
for var in data_ten_y.keys():
    median_value = data_ten_y[var].median()
    print(" ".join([var, " median ", str(median_value)]))

# Split each variable with the external quartilize_periodwise helper, then
# flatten the (code, year) index into a single text label per row.
QCA_data = quartilize_periodwise(data_ten_y, 2)
QCA_data.index = QCA_data.index.to_native_types()
# Label = country code followed by the year string minus its first two characters.
QCA_data.index = [entry[0] + entry[1][2:] for entry in QCA_data.index.tolist()]
QCA_data.index.name = "index"
QCA_data.to_csv("QCA_10y.csv")

gdp  median  2453.55470151
nat_re  median  2.91926963237
schl  median  7.201
gini  median  0.374208079338
g  median  0.0513134008407


## 3 categories

In [49]:
# Same export as the median split, but with a per-variable list passed to
# quartilize_periodwise and a space between country code and year in the label.
# NOTE(review): the heading says 3 categories while the list holds six 2s for
# five columns -- confirm against quartilize_periodwise's definition.
split_spec = [2, 2, 2, 2, 2, 2]
QCA_data = quartilize_periodwise(data_ten_y, split_spec)
QCA_data.index = QCA_data.index.to_native_types()
# Label = country code, a space, then the year string minus its first two characters.
QCA_data.index = [entry[0] + " " + entry[1][2:] for entry in QCA_data.index.tolist()]
QCA_data.index.name = "index"
# Disabled dummy columns for the highest / lowest growth category:
#QCA_data.loc[:, 'g1h'] = (QCA_data['g'] == 2).apply(int)
#QCA_data.loc[:, 'g3l'] = (QCA_data['g'] == 0).apply(int)
QCA_data.to_csv("QCA_10y_3c.csv")