In [1]:
import numpy as np
import pandas as pd

# Integrated Data

In [4]:
data = pd.read_csv("input/Integrated_data.csv", low_memory = False)

# Load up the list of variables we want to aggregate

In [5]:
variables = pd.read_csv("input/IVS_Variable_List.csv")

In [6]:
variables_to_keep = variables[variables.Included == 1].Name

In [8]:
variables_to_keep

0           A001
1           A002
4           A005
8           A009
29          A029
          ...   
1353        X049
1356        X051
1357        X052
1358        X053
1375    ZMEN_010
Name: Name, Length: 81, dtype: object

In [9]:
data_subset = data.loc[: , data.columns.isin(variables_to_keep)].copy()

In [10]:
data_subset

Unnamed: 0,S003,S017,S020,A001,A002,A005,A009,A029,A032,A042,...,X026,X028,X036,X045,X047,X048,X049,X051,X052,X053
0,32,0.926626,1984,-4,-4,-4,4,0,1,1,...,-2,4,33,-2,-4,-4,-4,-4,-4,-4
1,32,0.926626,1984,-4,-4,-4,4,0,0,0,...,-2,4,-3,2,-4,-4,-4,-4,-4,-4
2,32,0.926626,1984,-4,-4,-4,3,0,0,0,...,-2,1,33,2,-4,-4,-4,-4,-4,-4
3,32,1.352874,1984,-4,-4,-4,2,1,1,0,...,1,7,-3,2,-4,-4,-4,-4,-4,-4
4,32,0.926626,1984,-4,-4,-4,2,1,0,0,...,-2,5,-3,3,-4,-4,-4,-4,-4,-4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513524,915,0.782164,2008,1,1,1,2,1,1,1,...,-4,1,-4,-4,-4,-4,-5,-4,-4,-4
513525,915,0.887335,2008,1,2,2,4,0,1,0,...,-4,1,-4,-4,-4,-4,-5,-4,-4,-4
513526,915,0.782164,2008,1,2,2,2,1,0,1,...,-4,1,-4,-4,-4,-4,-5,-4,-4,-4
513527,915,1.201044,2008,1,1,1,3,0,1,1,...,-4,1,-4,-4,-4,-4,-5,-4,-4,-4


In [11]:
data_subset.S020.isnull().sum()

0

## One-hot encoding

In [12]:
not_to_encode = ["S003", "S020", "S017", "X048"] # are the variables we want to groupby and the weights (S017)

In [13]:
one_hot = pd.get_dummies(data=data_subset.loc[: , [c for c in data_subset.columns if c not in not_to_encode] ], 
                           columns=[c for c in data_subset.columns if c not in not_to_encode])

In [14]:
one_hot

Unnamed: 0,A001_-5,A001_-4,A001_-2,A001_-1,A001_1,A001_2,A001_3,A001_4,A002_-5,A002_-4,...,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10
0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513524,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
513525,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
513526,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
513527,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Multiply by the weights

In [15]:
weighted_one_hot = one_hot.mul(data_subset.S017, axis=0).copy()

In [16]:
weighted_one_hot

Unnamed: 0,A001_-5,A001_-4,A001_-2,A001_-1,A001_1,A001_2,A001_3,A001_4,A002_-5,A002_-4,...,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10
0,0.0,0.926626,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.926626,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.926626,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.926626,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.926626,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.926626,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.352874,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.352874,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.926626,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.926626,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513524,0.0,0.000000,0.0,0.0,0.782164,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
513525,0.0,0.000000,0.0,0.0,0.887335,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
513526,0.0,0.000000,0.0,0.0,0.782164,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
513527,0.0,0.000000,0.0,0.0,1.201044,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Groupby Country

### For country statistics we can correct the numbers using S007

In [17]:
weighted_one_hot["S003"] = data_subset["S003"]
weighted_one_hot["Year"] = data_subset["S020"]

In [19]:
grouped_by_country = weighted_one_hot.groupby(["S003", "Year"]).mean().copy()

In [20]:
grouped_by_country

Unnamed: 0_level_0,Unnamed: 1_level_0,A001_-5,A001_-4,A001_-2,A001_-1,A001_1,A001_2,A001_3,A001_4,A002_-5,A002_-4,...,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10
S003,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
8,1998,0.0,0.0,0.000000,0.005005,0.957958,0.032032,0.003003,0.002002,0.0,0.0,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
8,2002,0.0,0.0,0.000000,0.002000,0.959000,0.031000,0.008000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
8,2008,0.0,0.0,0.003656,0.000000,0.895823,0.097489,0.003032,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
12,2002,0.0,0.0,0.000000,0.003900,0.943838,0.046802,0.004680,0.000780,0.0,0.0,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
12,2014,0.0,0.0,0.000833,0.003333,0.930833,0.041667,0.011667,0.011667,0.0,0.0,...,0.104167,0.020833,0.031667,0.0325,0.0925,0.0275,0.0375,0.025,0.023333,0.086667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
909,1990,0.0,0.0,0.000000,0.000000,0.944079,0.042763,0.009868,0.003289,0.0,0.0,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
909,1999,0.0,0.0,0.003159,0.000000,0.901173,0.080325,0.011282,0.004061,0.0,0.0,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
909,2008,0.0,0.0,0.000000,0.000000,0.920928,0.051470,0.004528,0.013074,0.0,0.0,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
914,1998,0.0,0.0,0.000000,0.000000,0.967500,0.030000,0.001250,0.001250,0.0,0.0,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000


### Lets add the country names

In [21]:
country_ISO = pd.read_csv("input/ISO_3611.csv", index_col = "numeric")

In [22]:
country_ISO

Unnamed: 0_level_0,Country,alpha-2,alpha-3
numeric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,Afghanistan,AF,AFG
8,Albania,AL,ALB
12,Algeria,DZ,DZA
16,American Samoa,AS,ASM
20,Andorra,AD,AND
...,...,...,...
101,Republika Srpska,,
197,Northern Cyprus,,
909,North Ireland,,Nan
914,Bosnia,,


In [23]:
names = []
ISO3 = []

for idx in grouped_by_country.index:
    names.append(country_ISO.loc[idx[0], "Country"])
    ISO3.append(country_ISO.loc[idx[0], "alpha-3"])

In [24]:
grouped_by_country["Country Name"] = names
grouped_by_country["Country ISO3"] = ISO3

In [25]:
grouped_by_country

Unnamed: 0_level_0,Unnamed: 1_level_0,A001_-5,A001_-4,A001_-2,A001_-1,A001_1,A001_2,A001_3,A001_4,A002_-5,A002_-4,...,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10,Country Name,Country ISO3
S003,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
8,1998,0.0,0.0,0.000000,0.005005,0.957958,0.032032,0.003003,0.002002,0.0,0.0,...,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000,Albania,ALB
8,2002,0.0,0.0,0.000000,0.002000,0.959000,0.031000,0.008000,0.000000,0.0,0.0,...,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000,Albania,ALB
8,2008,0.0,0.0,0.003656,0.000000,0.895823,0.097489,0.003032,0.000000,0.0,0.0,...,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000,Albania,ALB
12,2002,0.0,0.0,0.000000,0.003900,0.943838,0.046802,0.004680,0.000780,0.0,0.0,...,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000,Algeria,DZA
12,2014,0.0,0.0,0.000833,0.003333,0.930833,0.041667,0.011667,0.011667,0.0,0.0,...,0.031667,0.0325,0.0925,0.0275,0.0375,0.025,0.023333,0.086667,Algeria,DZA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
909,1990,0.0,0.0,0.000000,0.000000,0.944079,0.042763,0.009868,0.003289,0.0,0.0,...,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000,North Ireland,Nan
909,1999,0.0,0.0,0.003159,0.000000,0.901173,0.080325,0.011282,0.004061,0.0,0.0,...,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000,North Ireland,Nan
909,2008,0.0,0.0,0.000000,0.000000,0.920928,0.051470,0.004528,0.013074,0.0,0.0,...,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000,North Ireland,Nan
914,1998,0.0,0.0,0.000000,0.000000,0.967500,0.030000,0.001250,0.001250,0.0,0.0,...,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000,Bosnia,


### Move the last columns to the front

In [26]:
columns = list(grouped_by_country.columns)

In [27]:
grouped_by_country = grouped_by_country[columns[-2:] + columns[:-2]]

In [28]:
grouped_by_country

Unnamed: 0_level_0,Unnamed: 1_level_0,Country Name,Country ISO3,A001_-5,A001_-4,A001_-2,A001_-1,A001_1,A001_2,A001_3,A001_4,...,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10
S003,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
8,1998,Albania,ALB,0.0,0.0,0.000000,0.005005,0.957958,0.032032,0.003003,0.002002,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
8,2002,Albania,ALB,0.0,0.0,0.000000,0.002000,0.959000,0.031000,0.008000,0.000000,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
8,2008,Albania,ALB,0.0,0.0,0.003656,0.000000,0.895823,0.097489,0.003032,0.000000,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
12,2002,Algeria,DZA,0.0,0.0,0.000000,0.003900,0.943838,0.046802,0.004680,0.000780,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
12,2014,Algeria,DZA,0.0,0.0,0.000833,0.003333,0.930833,0.041667,0.011667,0.011667,...,0.104167,0.020833,0.031667,0.0325,0.0925,0.0275,0.0375,0.025,0.023333,0.086667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
909,1990,North Ireland,Nan,0.0,0.0,0.000000,0.000000,0.944079,0.042763,0.009868,0.003289,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
909,1999,North Ireland,Nan,0.0,0.0,0.003159,0.000000,0.901173,0.080325,0.011282,0.004061,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
909,2008,North Ireland,Nan,0.0,0.0,0.000000,0.000000,0.920928,0.051470,0.004528,0.013074,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
914,1998,Bosnia,,0.0,0.0,0.000000,0.000000,0.967500,0.030000,0.001250,0.001250,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000


### Flat the index

In [29]:
grouped_by_country = grouped_by_country.reset_index()
grouped_by_country.drop("S003", axis = 1, inplace = True)

In [30]:
grouped_by_country

Unnamed: 0,Year,Country Name,Country ISO3,A001_-5,A001_-4,A001_-2,A001_-1,A001_1,A001_2,A001_3,...,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10
0,1998,Albania,ALB,0.0,0.0,0.000000,0.005005,0.957958,0.032032,0.003003,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
1,2002,Albania,ALB,0.0,0.0,0.000000,0.002000,0.959000,0.031000,0.008000,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
2,2008,Albania,ALB,0.0,0.0,0.003656,0.000000,0.895823,0.097489,0.003032,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
3,2002,Algeria,DZA,0.0,0.0,0.000000,0.003900,0.943838,0.046802,0.004680,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
4,2014,Algeria,DZA,0.0,0.0,0.000833,0.003333,0.930833,0.041667,0.011667,...,0.104167,0.020833,0.031667,0.0325,0.0925,0.0275,0.0375,0.025,0.023333,0.086667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,1990,North Ireland,Nan,0.0,0.0,0.000000,0.000000,0.944079,0.042763,0.009868,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
360,1999,North Ireland,Nan,0.0,0.0,0.003159,0.000000,0.901173,0.080325,0.011282,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
361,2008,North Ireland,Nan,0.0,0.0,0.000000,0.000000,0.920928,0.051470,0.004528,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
362,1998,Bosnia,,0.0,0.0,0.000000,0.000000,0.967500,0.030000,0.001250,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000


### Save to csv

In [31]:
grouped_by_country.to_csv("out/IVS_grouped_by_country.csv", index = False)

# Groupby Country and Region

### For regional aggregation we can not use the weighted answers

In [34]:
one_hot["S003"] = data_subset["S003"]
one_hot["Year"] = data_subset["S020"]
one_hot["X048"] = data_subset["X048"]

In [35]:
grouped_by_country_region = one_hot.groupby(["S003", "Year", "X048"]).mean().copy()

In [36]:
grouped_by_country_region

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,A001_-5,A001_-4,A001_-2,A001_-1,A001_1,A001_2,A001_3,A001_4,A002_-5,A002_-4,...,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10
S003,Year,X048,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
8,1998,-4,0.0,0.0,0.000000,0.005005,0.957958,0.032032,0.003003,0.002002,0.0,0.0,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
8,2002,-4,0.0,0.0,0.000000,0.002000,0.959000,0.031000,0.008000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
8,2008,-4,0.0,0.0,0.003911,0.000000,0.906128,0.086701,0.003259,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
12,2002,-4,0.0,0.0,0.000000,0.003900,0.943838,0.046802,0.004680,0.000780,0.0,0.0,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
12,2014,-4,0.0,0.0,0.000833,0.003333,0.930833,0.041667,0.011667,0.011667,0.0,0.0,...,0.104167,0.020833,0.031667,0.0325,0.0925,0.0275,0.0375,0.025,0.023333,0.086667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
909,1990,-5,0.0,0.0,0.000000,0.000000,0.944079,0.042763,0.009868,0.003289,0.0,0.0,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
909,1999,-4,0.0,0.0,0.003000,0.000000,0.879000,0.094000,0.017000,0.007000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
909,2008,-4,0.0,0.0,0.000000,0.000000,0.934000,0.050000,0.006000,0.010000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000
914,1998,-4,0.0,0.0,0.000000,0.000000,0.967500,0.030000,0.001250,0.001250,0.0,0.0,...,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000,0.000000


### Add the country and region names

In [37]:
region_codes = pd.read_csv("input/Region_codes.csv", index_col="Code")

In [38]:
region_codes

Unnamed: 0_level_0,Region
Code,Unnamed: 1_level_1
-5,Unknown
-4,Not asked
-3,Not applicable
-2,No answer
-1,Don´t know
...,...
1701040,CO: Oriental
7360010,SD: white nile
7360011,SD: north darfur
7360012,SD: west kurdufan


In [39]:
region_name = []
r_names = []
r_ISO3 = []


for idx in grouped_by_country_region.index:
    region_name.append(region_codes.loc[idx[2], "Region"])
    r_names.append(country_ISO.loc[idx[0], "Country"])
    r_ISO3.append(country_ISO.loc[idx[0], "alpha-3"])
    

In [40]:
grouped_by_country_region["Country Name"] = r_names
grouped_by_country_region["Country ISO3"] = r_ISO3
grouped_by_country_region["Region"] = region_name

### Move the last column to the front

In [41]:
columns_r = list(grouped_by_country_region.columns)

In [42]:
grouped_by_country_region = grouped_by_country_region[columns_r[-3:] + columns[:-3]]

In [43]:
grouped_by_country_region

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Country Name,Country ISO3,Region,A001_-5,A001_-4,A001_-2,A001_-1,A001_1,A001_2,A001_3,...,X053_-1,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9
S003,Year,X048,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
8,1998,-4,Albania,ALB,Not asked,0.0,0.0,0.000000,0.005005,0.957958,0.032032,0.003003,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
8,2002,-4,Albania,ALB,Not asked,0.0,0.0,0.000000,0.002000,0.959000,0.031000,0.008000,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
8,2008,-4,Albania,ALB,Not asked,0.0,0.0,0.003911,0.000000,0.906128,0.086701,0.003259,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
12,2002,-4,Algeria,DZA,Not asked,0.0,0.0,0.000000,0.003900,0.943838,0.046802,0.004680,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
12,2014,-4,Algeria,DZA,Not asked,0.0,0.0,0.000833,0.003333,0.930833,0.041667,0.011667,...,0.0,0.104167,0.020833,0.031667,0.0325,0.0925,0.0275,0.0375,0.025,0.023333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
909,1990,-5,North Ireland,Nan,Unknown,0.0,0.0,0.000000,0.000000,0.944079,0.042763,0.009868,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
909,1999,-4,North Ireland,Nan,Not asked,0.0,0.0,0.003000,0.000000,0.879000,0.094000,0.017000,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
909,2008,-4,North Ireland,Nan,Not asked,0.0,0.0,0.000000,0.000000,0.934000,0.050000,0.006000,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
914,1998,-4,Bosnia,,Not asked,0.0,0.0,0.000000,0.000000,0.967500,0.030000,0.001250,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000


### Flat the index

In [44]:
grouped_by_country_region = grouped_by_country_region.reset_index()
grouped_by_country_region

Unnamed: 0,S003,Year,X048,Country Name,Country ISO3,Region,A001_-5,A001_-4,A001_-2,A001_-1,...,X053_-1,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9
0,8,1998,-4,Albania,ALB,Not asked,0.0,0.0,0.000000,0.005005,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
1,8,2002,-4,Albania,ALB,Not asked,0.0,0.0,0.000000,0.002000,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
2,8,2008,-4,Albania,ALB,Not asked,0.0,0.0,0.003911,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
3,12,2002,-4,Algeria,DZA,Not asked,0.0,0.0,0.000000,0.003900,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
4,12,2014,-4,Algeria,DZA,Not asked,0.0,0.0,0.000833,0.003333,...,0.0,0.104167,0.020833,0.031667,0.0325,0.0925,0.0275,0.0375,0.025,0.023333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1153,909,1990,-5,North Ireland,Nan,Unknown,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
1154,909,1999,-4,North Ireland,Nan,Not asked,0.0,0.0,0.003000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
1155,909,2008,-4,North Ireland,Nan,Not asked,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
1156,914,1998,-4,Bosnia,,Not asked,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000


In [45]:
grouped_by_country_region.drop(["S003", "X048"], inplace = True, axis = 1)

In [46]:
grouped_by_country_region

Unnamed: 0,Year,Country Name,Country ISO3,Region,A001_-5,A001_-4,A001_-2,A001_-1,A001_1,A001_2,...,X053_-1,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9
0,1998,Albania,ALB,Not asked,0.0,0.0,0.000000,0.005005,0.957958,0.032032,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
1,2002,Albania,ALB,Not asked,0.0,0.0,0.000000,0.002000,0.959000,0.031000,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
2,2008,Albania,ALB,Not asked,0.0,0.0,0.003911,0.000000,0.906128,0.086701,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
3,2002,Algeria,DZA,Not asked,0.0,0.0,0.000000,0.003900,0.943838,0.046802,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
4,2014,Algeria,DZA,Not asked,0.0,0.0,0.000833,0.003333,0.930833,0.041667,...,0.0,0.104167,0.020833,0.031667,0.0325,0.0925,0.0275,0.0375,0.025,0.023333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1153,1990,North Ireland,Nan,Unknown,0.0,0.0,0.000000,0.000000,0.944079,0.042763,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
1154,1999,North Ireland,Nan,Not asked,0.0,0.0,0.003000,0.000000,0.879000,0.094000,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
1155,2008,North Ireland,Nan,Not asked,0.0,0.0,0.000000,0.000000,0.934000,0.050000,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000
1156,1998,Bosnia,,Not asked,0.0,0.0,0.000000,0.000000,0.967500,0.030000,...,0.0,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.000,0.000000


### Save to csv

In [47]:
grouped_by_country_region.to_csv("out/IVS_grouped_by_country_region.csv", index = False)