In [1]:
import numpy as np
import pandas as pd

# Integrated Data

In [2]:
%%time
data = pd.read_csv("input/Integrated_data.csv", low_memory = False)

CPU times: user 4min 5s, sys: 1min 31s, total: 5min 37s
Wall time: 7min 7s


# Load up the list of variables we want to aggregate

In [3]:
variables = pd.read_csv("input/IVS_Variable_List.csv")

In [4]:
variables

Unnamed: 0,Included,Name,Label,Last EVS,Last WVS,Categories
0,1,A001,Important in life: Family,EVS 2008,WVS6 (2010-2012),1:Very important\n2:Rather important\n3:Not ve...
1,1,A002,Important in life: Friends,EVS 2008,WVS6 (2010-2012),1:Very important\n2:Rather important\n3:Not ve...
2,0,A003,Important in life: Leisure time,EVS 2008,WVS6 (2010-2012),-5:Missing; Unknown\n-4:Not asked in survey\n-...
3,0,A004,Important in life: Politics,EVS 2008,WVS6 (2010-2012),1:Very important\n2:Rather important\n3:Not ve...
4,1,A005,Important in life: Work,EVS 2008,WVS6 (2010-2012),1:Very important\n2:Rather important\n3:Not ve...
...,...,...,...,...,...,...
1422,0,w007,Spouse/partner had/has how many employees,EVS 2008,,1:None\n2:1-9\n3:10-24\n4:25 or more\n-5:Missi...
1423,0,w008,Does spouse/partner supervise someone,EVS 2008,,0:No\n1:Yes\n-5:Missing; Unknown\n-4:Not asked...
1424,0,w009,How many people does she/he supervise,EVS 2008,,1:1-9\n2:10-24\n3:25 or more\n-5:Missing; Unkn...
1425,0,w010,Spouse/partner experienced unemployment longer...,EVS 2008,,0:No\n1:Yes\n-5:Missing; Unknown\n-4:Not asked...


In [6]:
# lets keep only the ones we are interested in 
variables = variables[variables.Included == 1]

In [7]:
variables

Unnamed: 0,Included,Name,Label,Last EVS,Last WVS,Categories
0,1,A001,Important in life: Family,EVS 2008,WVS6 (2010-2012),1:Very important\n2:Rather important\n3:Not ve...
1,1,A002,Important in life: Friends,EVS 2008,WVS6 (2010-2012),1:Very important\n2:Rather important\n3:Not ve...
4,1,A005,Important in life: Work,EVS 2008,WVS6 (2010-2012),1:Very important\n2:Rather important\n3:Not ve...
8,1,A009,State of health (subjective),EVS 2008,WVS6 (2010-2012),1:Very good\n2:Good\n3:Fair\n4:Poor\n5:Very po...
29,1,A029,Important child qualities: independence,EVS 2008,WVS6 (2010-2012),0:Not mentioned\n1:Important\n-5:Missing; Unkn...
...,...,...,...,...,...,...
1354,1,X049,Size of town,EVS 2008,WVS6 (2010-2012),"1:2,000 and less\n2:2,000-5,000\n3:5,000-10,00..."
1357,1,X051,Ethnic group,EVS 1981,WVS6 (2010-2012),(*) See annexe
1358,1,X052,Institution of occupation,,WVS6 (2010-2012),1:Public institution\n2:Private business\n3:Pr...
1359,1,X053,Nature of tasks: manual vs. Cognitive,,WVS6 (2010-2012),1:Mostly manual tasks\n2:2\n3:3\n4:4\n5:5\n6:6...


# Slice the dataframe and preprocess some columns

In [14]:
data_subset = data.loc[: , data.columns.isin(variables.Name)].copy()

### Merge S002 with S002EVS

In [57]:
WVS_wave = pd.read_csv("input/WVS_wave.csv", index_col = "code").wave.to_dict()
EVS_wave = pd.read_csv("input/EVS_wave.csv", index_col = "code").wave.to_dict()

WVS_wave[-4] = ""
EVS_wave[-4] = ""

In [58]:
for key in WVS_wave:
    data_subset.loc[data_subset['S002'] == key, ['S002']] = WVS_wave[key]
    
for key in EVS_wave:
    data_subset.loc[data_subset['S002EVS'] == key, ['S002EVS']] = EVS_wave[key]
    
data_subset.S002 += data_subset.S002EVS

data_subset.drop(['S002EVS'], axis = 1, inplace = True)

### Change labels in S001

In [59]:
data_subset.loc[data_subset['S001'] == 1, ['S001']] = "EVS"
data_subset.loc[data_subset['S001'] == 2, ['S001']] = "WVS"

data_subset.rename(columns={'S001':'Survey'}, inplace=True)

### Fix an error on X048WVS 

In [60]:
# there are some strange values who do not appear in the list of region codes
data_subset[(data_subset.X048WVS > 0) & (data_subset.X048WVS < 100)].X048WVS.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])

In [61]:
# they all belong to CHILE
data_subset[(data_subset.X048WVS > 0) & (data_subset.X048WVS < 100)].S003.unique()

array([152])

In [62]:
# CHILE has region code 152xxx, lets fix it
data_subset.loc[(data_subset.X048WVS > 0) & (data_subset.X048WVS < 100), "X048WVS"] += 152000


In [63]:
# there is a strange region in sweden 
data_subset[ data_subset.X048WVS == 752208 ].head()

Unnamed: 0,Survey,S002,S003,S017,A001,A002,A005,A009,A029,A032,...,X047,X048,X048WVS,X049,X051,X052,X053,x048a_n1,x048b_n2,x048c_n3
103360,WVS,1994-1998,752,1.0,2,1,1,1,0,0,...,2,-4,752208,-4,-4,-4,-4,,,
103368,WVS,1994-1998,752,1.0,1,1,1,2,1,1,...,7,-4,752208,-4,-4,-4,-4,,,
103370,WVS,1994-1998,752,1.0,1,1,1,2,1,1,...,-1,-4,752208,-4,-4,-4,-4,,,
103371,WVS,1994-1998,752,1.0,1,2,2,1,0,1,...,2,-4,752208,-4,-4,-4,-4,,,
103372,WVS,1994-1998,752,1.0,2,3,2,3,1,1,...,-1,-4,752208,-4,-4,-4,-4,,,


In [64]:
# convert 752208 to 752028
data_subset.loc[ data_subset.X048WVS == 752208, "X048WVS" ] = 752028

### Merge X048 with X048WVS

In [65]:
# check whether we have a row with valid values in both X048 and X048WVS
data_subset[(data_subset['X048'] > 0) & (data_subset['X048WVS'] > 0)]

Unnamed: 0,Survey,S002,S003,S017,A001,A002,A005,A009,A029,A032,...,X047,X048,X048WVS,X049,X051,X052,X053,x048a_n1,x048b_n2,x048c_n3


In [66]:
# nope, we can merge the two columns
# convert missing data to 0 in order to be able to sum
data_subset.loc[data_subset['X048'] < 0, ['X048']] = 0
data_subset.loc[data_subset['X048WVS'] < 0, ['X048WVS']] = 0

data_subset.X048 += data_subset.X048WVS
data_subset.drop(['X048WVS'], axis = 1, inplace = True)

# Reconvert to -5 
data_subset.loc[data_subset['X048'] == 0, ['X048']] = -5

### Rename some columns

In [67]:
data_subset.rename(columns={'S002':'Wave'}, inplace=True)
data_subset.rename(columns={'S003':'Country Numeric'}, inplace=True)
data_subset.rename(columns={'X048':'Region Numeric'}, inplace=True)
data_subset.rename(columns={'S017':'Weight'}, inplace=True)


In [68]:
data_subset

Unnamed: 0,Survey,Wave,Country Numeric,Weight,A001,A002,A005,A009,A029,A032,...,X045,X047,Region Numeric,X049,X051,X052,X053,x048a_n1,x048b_n2,x048c_n3
0,WVS,1981-1984,32,0.926626,-4,-4,-4,4,0,1,...,-2,-4,32001,-4,-4,-4,-4,,,
1,WVS,1981-1984,32,0.926626,-4,-4,-4,4,0,0,...,2,-4,32001,-4,-4,-4,-4,,,
2,WVS,1981-1984,32,0.926626,-4,-4,-4,3,0,0,...,2,-4,32001,-4,-4,-4,-4,,,
3,WVS,1981-1984,32,1.352874,-4,-4,-4,2,1,1,...,2,-4,32001,-4,-4,-4,-4,,,
4,WVS,1981-1984,32,0.926626,-4,-4,-4,2,1,0,...,3,-4,32001,-4,-4,-4,-4,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513524,EVS,2008-2010,915,0.782164,1,1,1,2,1,1,...,-4,-4,-5,-5,-4,-4,-4,RS-KM0,RS-KM00,-5
513525,EVS,2008-2010,915,0.887335,1,2,2,4,0,1,...,-4,-4,-5,-5,-4,-4,-4,RS-KM0,RS-KM00,-5
513526,EVS,2008-2010,915,0.782164,1,2,2,2,1,0,...,-4,-4,-5,-5,-4,-4,-4,RS-KM0,RS-KM00,-5
513527,EVS,2008-2010,915,1.201044,1,1,1,3,0,1,...,-4,-4,-5,-5,-4,-4,-4,RS-KM0,RS-KM00,-5


# One-hot encoding

In [106]:
not_to_encode = ["Survey", "Wave", "Country Numeric", "Region Numeric", 
                 "Weight", "x048a_n1", "x048b_n2", "x048c_n3"] # are the variables we want to groupby and the weights (S017)

In [107]:
one_hot = pd.get_dummies(data=data_subset.loc[: , [c for c in data_subset.columns if c not in not_to_encode] ], 
                           columns=[c for c in data_subset.columns if c not in not_to_encode])

In [108]:
one_hot

Unnamed: 0,A001_-5,A001_-4,A001_-2,A001_-1,A001_1,A001_2,A001_3,A001_4,A002_-5,A002_-4,...,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10
0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513524,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
513525,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
513526,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
513527,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Multiply by the weights

In [109]:
weighted_one_hot = one_hot.mul(data_subset["Weight"], axis=0).copy()

In [110]:
weighted_one_hot

Unnamed: 0,A001_-5,A001_-4,A001_-2,A001_-1,A001_1,A001_2,A001_3,A001_4,A002_-5,A002_-4,...,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10
0,0.0,0.926626,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.926626,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.926626,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.926626,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.926626,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.926626,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.352874,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.352874,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.926626,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.926626,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513524,0.0,0.000000,0.0,0.0,0.782164,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
513525,0.0,0.000000,0.0,0.0,0.887335,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
513526,0.0,0.000000,0.0,0.0,0.782164,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
513527,0.0,0.000000,0.0,0.0,1.201044,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Groupby Country

### For country statistics we can correct the numbers using S007

In [111]:
weighted_one_hot["Survey"] = data_subset["Survey"]
weighted_one_hot["Wave"] = data_subset["Wave"]
weighted_one_hot["Country Numeric"] = data_subset["Country Numeric"]

In [112]:
grouped_by_country = weighted_one_hot.groupby(["Survey", "Wave", "Country Numeric"]).mean().copy()

In [113]:
grouped_by_country["Sample Size"] = weighted_one_hot.groupby(["Survey", "Wave", "Country Numeric"]).size()

In [114]:
grouped_by_country

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,A001_-5,A001_-4,A001_-2,A001_-1,A001_1,A001_2,A001_3,A001_4,A002_-5,A002_-4,...,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10,Sample Size
Survey,Wave,Country Numeric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
EVS,1981-1984,56,0.0,1.0,0.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.0,1.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1145
EVS,1981-1984,124,0.0,1.0,0.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.0,1.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1254
EVS,1981-1984,208,0.0,1.0,0.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.0,1.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1182
EVS,1981-1984,250,0.0,1.0,0.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.0,1.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1200
EVS,1981-1984,276,0.0,1.0,0.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.0,1.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WVS,2010-2014,818,0.0,0.0,0.000000,0.000,0.973568,0.021512,0.004920,0.000000,0.0,0.0,...,0.049620,0.073058,0.045724,0.052922,0.037831,0.046540,0.063399,0.017063,0.032774,1523
WVS,2010-2014,840,0.0,0.0,0.003548,0.000,0.909207,0.073387,0.007944,0.005945,0.0,0.0,...,0.043508,0.054001,0.043248,0.152357,0.101210,0.100090,0.122997,0.110341,0.136770,2232
WVS,2010-2014,858,0.0,0.0,0.000000,0.001,0.887000,0.099000,0.006000,0.007000,0.0,0.0,...,0.104000,0.062000,0.057000,0.130000,0.037000,0.040000,0.065000,0.045000,0.064000,1000
WVS,2010-2014,860,0.0,0.0,0.000000,0.000,0.975333,0.021333,0.003333,0.000000,0.0,0.0,...,0.094667,0.078667,0.075333,0.078667,0.061333,0.031333,0.058000,0.042000,0.094667,1500


### Lets add the country names

In [115]:
country_ISO = pd.read_csv("input/ISO_3611.csv", index_col = "numeric")

In [116]:
country_ISO

Unnamed: 0_level_0,Country,alpha-2,alpha-3
numeric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,Afghanistan,AF,AFG
8,Albania,AL,ALB
12,Algeria,DZ,DZA
16,American Samoa,AS,ASM
20,Andorra,AD,AND
...,...,...,...
101,Republika Srpska,,
197,Northern Cyprus,,
909,North Ireland,,Nan
914,Bosnia,,


In [117]:
names = []
ISO3 = []

for idx in grouped_by_country.index:
    names.append(country_ISO.loc[idx[2], "Country"])
    ISO3.append(country_ISO.loc[idx[2], "alpha-3"])

In [118]:
grouped_by_country["Country Name"] = names
grouped_by_country["Country ISO3"] = ISO3

In [119]:
grouped_by_country

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,A001_-5,A001_-4,A001_-2,A001_-1,A001_1,A001_2,A001_3,A001_4,A002_-5,A002_-4,...,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10,Sample Size,Country Name,Country ISO3
Survey,Wave,Country Numeric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
EVS,1981-1984,56,0.0,1.0,0.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.0,1.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1145,Belgium,BEL
EVS,1981-1984,124,0.0,1.0,0.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.0,1.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1254,Canada,CAN
EVS,1981-1984,208,0.0,1.0,0.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.0,1.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1182,Denmark,DNK
EVS,1981-1984,250,0.0,1.0,0.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.0,1.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1200,France,FRA
EVS,1981-1984,276,0.0,1.0,0.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.0,1.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1305,Germany,DEU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WVS,2010-2014,818,0.0,0.0,0.000000,0.000,0.973568,0.021512,0.004920,0.000000,0.0,0.0,...,0.045724,0.052922,0.037831,0.046540,0.063399,0.017063,0.032774,1523,Egypt,EGY
WVS,2010-2014,840,0.0,0.0,0.003548,0.000,0.909207,0.073387,0.007944,0.005945,0.0,0.0,...,0.043248,0.152357,0.101210,0.100090,0.122997,0.110341,0.136770,2232,United States of America (the),USA
WVS,2010-2014,858,0.0,0.0,0.000000,0.001,0.887000,0.099000,0.006000,0.007000,0.0,0.0,...,0.057000,0.130000,0.037000,0.040000,0.065000,0.045000,0.064000,1000,Uruguay,URY
WVS,2010-2014,860,0.0,0.0,0.000000,0.000,0.975333,0.021333,0.003333,0.000000,0.0,0.0,...,0.075333,0.078667,0.061333,0.031333,0.058000,0.042000,0.094667,1500,Uzbekistan,UZB


### Flat the index

In [120]:
grouped_by_country = grouped_by_country.reset_index()

In [121]:
grouped_by_country

Unnamed: 0,Survey,Wave,Country Numeric,A001_-5,A001_-4,A001_-2,A001_-1,A001_1,A001_2,A001_3,...,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10,Sample Size,Country Name,Country ISO3
0,EVS,1981-1984,56,0.0,1.0,0.000000,0.000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1145,Belgium,BEL
1,EVS,1981-1984,124,0.0,1.0,0.000000,0.000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1254,Canada,CAN
2,EVS,1981-1984,208,0.0,1.0,0.000000,0.000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1182,Denmark,DNK
3,EVS,1981-1984,250,0.0,1.0,0.000000,0.000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1200,France,FRA
4,EVS,1981-1984,276,0.0,1.0,0.000000,0.000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1305,Germany,DEU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,WVS,2010-2014,818,0.0,0.0,0.000000,0.000,0.973568,0.021512,0.004920,...,0.045724,0.052922,0.037831,0.046540,0.063399,0.017063,0.032774,1523,Egypt,EGY
362,WVS,2010-2014,840,0.0,0.0,0.003548,0.000,0.909207,0.073387,0.007944,...,0.043248,0.152357,0.101210,0.100090,0.122997,0.110341,0.136770,2232,United States of America (the),USA
363,WVS,2010-2014,858,0.0,0.0,0.000000,0.001,0.887000,0.099000,0.006000,...,0.057000,0.130000,0.037000,0.040000,0.065000,0.045000,0.064000,1000,Uruguay,URY
364,WVS,2010-2014,860,0.0,0.0,0.000000,0.000,0.975333,0.021333,0.003333,...,0.075333,0.078667,0.061333,0.031333,0.058000,0.042000,0.094667,1500,Uzbekistan,UZB


### Move the last columns to the front

In [122]:
columns = list(grouped_by_country.columns)

In [123]:
grouped_by_country = grouped_by_country[columns[:3] + columns[-2:] + [columns[-3]] + columns[3:-3]]

In [124]:
grouped_by_country

Unnamed: 0,Survey,Wave,Country Numeric,Country Name,Country ISO3,Sample Size,A001_-5,A001_-4,A001_-2,A001_-1,...,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10
0,EVS,1981-1984,56,Belgium,BEL,1145,0.0,1.0,0.000000,0.000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,EVS,1981-1984,124,Canada,CAN,1254,0.0,1.0,0.000000,0.000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,EVS,1981-1984,208,Denmark,DNK,1182,0.0,1.0,0.000000,0.000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,EVS,1981-1984,250,France,FRA,1200,0.0,1.0,0.000000,0.000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,EVS,1981-1984,276,Germany,DEU,1305,0.0,1.0,0.000000,0.000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,WVS,2010-2014,818,Egypt,EGY,1523,0.0,0.0,0.000000,0.000,...,0.080814,0.049620,0.073058,0.045724,0.052922,0.037831,0.046540,0.063399,0.017063,0.032774
362,WVS,2010-2014,840,United States of America (the),USA,2232,0.0,0.0,0.003548,0.000,...,0.106743,0.043508,0.054001,0.043248,0.152357,0.101210,0.100090,0.122997,0.110341,0.136770
363,WVS,2010-2014,858,Uruguay,URY,1000,0.0,0.0,0.000000,0.001,...,0.329000,0.104000,0.062000,0.057000,0.130000,0.037000,0.040000,0.065000,0.045000,0.064000
364,WVS,2010-2014,860,Uzbekistan,UZB,1500,0.0,0.0,0.000000,0.000,...,0.237333,0.094667,0.078667,0.075333,0.078667,0.061333,0.031333,0.058000,0.042000,0.094667


### Save to csv

In [125]:
grouped_by_country.to_csv("out/IVS_grouped_by_country.csv", index = False, encoding = 'utf-16')

# Groupby Country and Region

### For regional aggregation we can not use the weighted answers

In [153]:
one_hot["Survey"] = data_subset["Survey"]
one_hot["Wave"] = data_subset["Wave"]
one_hot["Country Numeric"] = data_subset["Country Numeric"]
one_hot["Region Numeric"] = data_subset["Region Numeric"]

one_hot["NUTS1"] = data_subset["x048a_n1"]
one_hot["NUTS2"] = data_subset["x048b_n2"]
one_hot["NUTS3"] = data_subset["x048c_n3"]

In [154]:
grouped_by_country_region = one_hot.groupby(["Survey", "Wave", "Country Numeric", 
                                             "Region Numeric", "NUTS1", "NUTS2", 
                                             "NUTS3"]).mean().copy()

In [155]:
grouped_by_country_region["Sample Size"] = one_hot.groupby(["Survey", "Wave", "Country Numeric",
                                                            "Region Numeric", "NUTS1", "NUTS2", "NUTS3"]).size()

In [156]:
grouped_by_country_region

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,A001_-5,A001_-4,A001_-2,A001_-1,A001_1,A001_2,A001_3,A001_4,A002_-5,A002_-4,...,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10,Sample Size
Survey,Wave,Country Numeric,Region Numeric,NUTS1,NUTS2,NUTS3,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
EVS,1981-1984,56,56021,-4,-4,-4,0.0,1.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,1.0,...,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,6
EVS,1981-1984,56,56022,-4,-4,-4,0.0,1.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,1.0,...,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,175
EVS,1981-1984,56,56023,-4,-4,-4,0.0,1.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,1.0,...,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,109
EVS,1981-1984,56,56024,-4,-4,-4,0.0,1.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,1.0,...,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,139
EVS,1981-1984,56,56025,-4,-4,-4,0.0,1.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,1.0,...,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,163
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WVS,2010-2014,887,887015,,,,0.0,0.0,0.0,0.0,1.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.00,0.076923,0.230769,0.076923,0.000000,0.076923,0.000000,13
WVS,2010-2014,887,887016,,,,0.0,0.0,0.0,0.0,1.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.115385,0.000000,26
WVS,2010-2014,887,887017,,,,0.0,0.0,0.0,0.0,0.977273,0.000000,0.022727,0.0,0.0,0.0,...,0.022727,0.045455,0.00,0.022727,0.022727,0.045455,0.000000,0.000000,0.068182,44
WVS,2010-2014,887,887018,,,,0.0,0.0,0.0,0.0,1.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.040000,0.040000,0.16,0.000000,0.000000,0.040000,0.000000,0.000000,0.080000,25


### Add the country and region names

In [157]:
region_codes = pd.read_csv("input/Region_codes.csv", index_col="Code")

In [158]:
region_codes

Unnamed: 0_level_0,Region
Code,Unnamed: 1_level_1
-5,Unknown
-4,Not asked
-3,Not applicable
-2,No answer
-1,Don't know
...,...
1701040,CO: Oriental
7360010,SD: white nile
7360011,SD: north darfur
7360012,SD: west kurdufan


In [159]:
region_name = []
r_names = []
r_ISO3 = []


for idx in grouped_by_country_region.index:
    region_name.append(region_codes.loc[idx[3], "Region"])
    r_names.append(country_ISO.loc[idx[2], "Country"])
    r_ISO3.append(country_ISO.loc[idx[2], "alpha-3"])
    

In [160]:
grouped_by_country_region["Country Name"] = r_names
grouped_by_country_region["Country ISO3"] = r_ISO3
grouped_by_country_region["Region Name"] = region_name

### Flat the index

In [161]:
grouped_by_country_region = grouped_by_country_region.reset_index()
grouped_by_country_region

Unnamed: 0,Survey,Wave,Country Numeric,Region Numeric,NUTS1,NUTS2,NUTS3,A001_-5,A001_-4,A001_-2,...,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10,Sample Size,Country Name,Country ISO3,Region Name
0,EVS,1981-1984,56,56021,-4,-4,-4,0.0,1.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,6,Belgium,BEL,BE: unknown
1,EVS,1981-1984,56,56022,-4,-4,-4,0.0,1.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,175,Belgium,BEL,BE: unknown
2,EVS,1981-1984,56,56023,-4,-4,-4,0.0,1.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,109,Belgium,BEL,BE: unknown
3,EVS,1981-1984,56,56024,-4,-4,-4,0.0,1.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,139,Belgium,BEL,BE: unknown
4,EVS,1981-1984,56,56025,-4,-4,-4,0.0,1.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,163,Belgium,BEL,BE: unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3903,WVS,2010-2014,887,887015,,,,0.0,0.0,0.0,...,0.076923,0.230769,0.076923,0.000000,0.076923,0.000000,13,Yemen,YEM,YE: Mareb
3904,WVS,2010-2014,887,887016,,,,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.115385,0.000000,26,Yemen,YEM,YE: Al-Mahwit
3905,WVS,2010-2014,887,887017,,,,0.0,0.0,0.0,...,0.022727,0.022727,0.045455,0.000000,0.000000,0.068182,44,Yemen,YEM,YE: Amran
3906,WVS,2010-2014,887,887018,,,,0.0,0.0,0.0,...,0.000000,0.000000,0.040000,0.000000,0.000000,0.080000,25,Yemen,YEM,YE: Ad-dale'


### Move the last column to the front

In [166]:
columns_r = list(grouped_by_country_region.columns)

In [167]:
grouped_by_country_region = grouped_by_country_region[columns_r[:3] + columns_r[-2:-1]
                                                      + [columns_r[3]]+ [columns_r[-1]]
                                                      + columns_r[4:7]
                                                      + [columns_r[-4]]+ columns_r[7:-4]]

In [168]:
grouped_by_country_region

Unnamed: 0,Survey,Wave,Country Numeric,Country ISO3,Region Numeric,Region Name,NUTS1,NUTS2,NUTS3,Sample Size,...,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10
0,EVS,1981-1984,56,BEL,56021,BE: unknown,-4,-4,-4,6,...,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,EVS,1981-1984,56,BEL,56022,BE: unknown,-4,-4,-4,175,...,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,EVS,1981-1984,56,BEL,56023,BE: unknown,-4,-4,-4,109,...,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,EVS,1981-1984,56,BEL,56024,BE: unknown,-4,-4,-4,139,...,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,EVS,1981-1984,56,BEL,56025,BE: unknown,-4,-4,-4,163,...,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3903,WVS,2010-2014,887,YEM,887015,YE: Mareb,,,,13,...,0.000000,0.000000,0.000000,0.00,0.076923,0.230769,0.076923,0.000000,0.076923,0.000000
3904,WVS,2010-2014,887,YEM,887016,YE: Al-Mahwit,,,,26,...,0.307692,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.115385,0.000000
3905,WVS,2010-2014,887,YEM,887017,YE: Amran,,,,44,...,0.227273,0.022727,0.045455,0.00,0.022727,0.022727,0.045455,0.000000,0.000000,0.068182
3906,WVS,2010-2014,887,YEM,887018,YE: Ad-dale',,,,25,...,0.160000,0.040000,0.040000,0.16,0.000000,0.000000,0.040000,0.000000,0.000000,0.080000


### Save to csv

In [169]:
grouped_by_country_region.to_csv("out/IVS_grouped_by_country_region.csv", index = False, encoding = 'utf-16')

In [171]:
grouped_by_country_region[["Survey", "Wave", "Country Numeric", "Country ISO3", 
                           "Region Numeric", "Region Name", 
                           "NUTS1", "NUTS2", "NUTS3",
                           "Sample Size"]].to_csv("out/regions_wave.csv",
                                                                                   index = False, 
                                                                                   encoding = 'utf-16')