# INFO

This notebook contains all the required code to fetch, process and merge data in order to populate the country_statistics table.

Each section can be run independently since the required files are stored in /temp

<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>

# Integrated Value Survey

In [1]:
import numpy as np
import pandas as pd

## Read the data

In [3]:
%%time
IVS_data = pd.read_csv("input/IVS/Integrated_data.csv", low_memory = False)

CPU times: user 4min 6s, sys: 1min 23s, total: 5min 30s
Wall time: 8min


### Load up the list of variables we want to aggregate

In [4]:
IVS_variables = pd.read_csv("input/IVS/IVS_Variable_List.csv")

In [5]:
# lets keep only the ones we are interested in 
IVS_variables = IVS_variables[IVS_variables.Included == 1]

In [9]:
len(IVS_variables)

86

In [8]:
IVS_variables.head()

Unnamed: 0,Included,Name,Label,Last EVS,Last WVS,Categories
0,1,A001,Important in life: Family,EVS 2008,WVS6 (2010-2012),1:Very important\n2:Rather important\n3:Not ve...
1,1,A002,Important in life: Friends,EVS 2008,WVS6 (2010-2012),1:Very important\n2:Rather important\n3:Not ve...
4,1,A005,Important in life: Work,EVS 2008,WVS6 (2010-2012),1:Very important\n2:Rather important\n3:Not ve...
8,1,A009,State of health (subjective),EVS 2008,WVS6 (2010-2012),1:Very good\n2:Good\n3:Fair\n4:Poor\n5:Very po...
29,1,A029,Important child qualities: independence,EVS 2008,WVS6 (2010-2012),0:Not mentioned\n1:Important\n-5:Missing; Unkn...


## Slice the dataframe and preprocess some columns

In [10]:
IVS_data_subset = IVS_data.loc[: , IVS_data.columns.isin(IVS_variables.Name)].copy()

### Merge S002 with S002EVS

In [11]:
WVS_wave = pd.read_csv("input/IVS/WVS_wave.csv", index_col = "code").wave.to_dict()
EVS_wave = pd.read_csv("input/IVS/EVS_wave.csv", index_col = "code").wave.to_dict()

WVS_wave[-4] = ""
EVS_wave[-4] = ""

In [12]:
for key in WVS_wave:
    IVS_data_subset.loc[IVS_data_subset['S002'] == key, ['S002']] = WVS_wave[key]
    
for key in EVS_wave:
    IVS_data_subset.loc[IVS_data_subset['S002EVS'] == key, ['S002EVS']] = EVS_wave[key]
    
IVS_data_subset.S002 += IVS_data_subset.S002EVS

IVS_data_subset.drop(['S002EVS'], axis = 1, inplace = True)

### Change labels in S001

In [13]:
IVS_data_subset.loc[IVS_data_subset['S001'] == 1, ['S001']] = "EVS"
IVS_data_subset.loc[IVS_data_subset['S001'] == 2, ['S001']] = "WVS"

IVS_data_subset.rename(columns={'S001':'Survey'}, inplace=True)

### Fix an error on X048WVS 

In [14]:
# there are some strange values who do not appear in the list of region codes
IVS_data_subset[(IVS_data_subset.X048WVS > 0) & (IVS_data_subset.X048WVS < 100)].X048WVS.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])

In [15]:
# they all belong to CHILE
IVS_data_subset[(IVS_data_subset.X048WVS > 0) & (IVS_data_subset.X048WVS < 100)].S003.unique()

array([152])

In [16]:
# CHILE has region code 152xxx, lets fix it
IVS_data_subset.loc[(IVS_data_subset.X048WVS > 0) & (IVS_data_subset.X048WVS < 100), "X048WVS"] += 152000

In [17]:
# there is a strange region in Sweden 
IVS_data_subset[ IVS_data_subset.X048WVS == 752208 ].head()

Unnamed: 0,Survey,S002,S003,S017,A001,A002,A005,A009,A029,A032,...,X047,X048,X048WVS,X049,X051,X052,X053,x048a_n1,x048b_n2,x048c_n3
103360,WVS,1994-1998,752,1.0,2,1,1,1,0,0,...,2,-4,752208,-4,-4,-4,-4,,,
103368,WVS,1994-1998,752,1.0,1,1,1,2,1,1,...,7,-4,752208,-4,-4,-4,-4,,,
103370,WVS,1994-1998,752,1.0,1,1,1,2,1,1,...,-1,-4,752208,-4,-4,-4,-4,,,
103371,WVS,1994-1998,752,1.0,1,2,2,1,0,1,...,2,-4,752208,-4,-4,-4,-4,,,
103372,WVS,1994-1998,752,1.0,2,3,2,3,1,1,...,-1,-4,752208,-4,-4,-4,-4,,,


In [18]:
# convert 752208 to 752028
IVS_data_subset.loc[ IVS_data_subset.X048WVS == 752208, "X048WVS" ] = 752028

### Merge X048 with X048WVS

In [19]:
# check whether we have a row with valid values in both X048 and X048WVS
IVS_data_subset[(IVS_data_subset['X048'] > 0) & (IVS_data_subset['X048WVS'] > 0)]

Unnamed: 0,Survey,S002,S003,S017,A001,A002,A005,A009,A029,A032,...,X047,X048,X048WVS,X049,X051,X052,X053,x048a_n1,x048b_n2,x048c_n3


In [20]:
# nope, we can merge the two columns
# convert missing data to 0 in order to be able to sum
IVS_data_subset.loc[IVS_data_subset['X048'] < 0, ['X048']] = 0
IVS_data_subset.loc[IVS_data_subset['X048WVS'] < 0, ['X048WVS']] = 0

IVS_data_subset.X048 += IVS_data_subset.X048WVS
IVS_data_subset.drop(['X048WVS'], axis = 1, inplace = True)

# Reconvert to -5 
IVS_data_subset.loc[IVS_data_subset['X048'] == 0, ['X048']] = -5

### Rename some columns

In [21]:
IVS_data_subset.rename(columns={'S002':'Wave'}, inplace=True)
IVS_data_subset.rename(columns={'S003':'Country Numeric'}, inplace=True)
IVS_data_subset.rename(columns={'X048':'Region Numeric'}, inplace=True)
IVS_data_subset.rename(columns={'S017':'Weight'}, inplace=True)

In [22]:
IVS_data_subset.head()

Unnamed: 0,Survey,Wave,Country Numeric,Weight,A001,A002,A005,A009,A029,A032,...,X045,X047,Region Numeric,X049,X051,X052,X053,x048a_n1,x048b_n2,x048c_n3
0,WVS,1981-1984,32,0.926626,-4,-4,-4,4,0,1,...,-2,-4,32001,-4,-4,-4,-4,,,
1,WVS,1981-1984,32,0.926626,-4,-4,-4,4,0,0,...,2,-4,32001,-4,-4,-4,-4,,,
2,WVS,1981-1984,32,0.926626,-4,-4,-4,3,0,0,...,2,-4,32001,-4,-4,-4,-4,,,
3,WVS,1981-1984,32,1.352874,-4,-4,-4,2,1,1,...,2,-4,32001,-4,-4,-4,-4,,,
4,WVS,1981-1984,32,0.926626,-4,-4,-4,2,1,0,...,3,-4,32001,-4,-4,-4,-4,,,


<br>
<br>
<br>
<br>
<br>

## One-hot encoding

In [23]:
not_to_encode = ["Survey", "Wave", "Country Numeric", "Region Numeric", 
                 "Weight", "x048a_n1", "x048b_n2", "x048c_n3"] 
# the variables we want to groupby and the weights (S017)

In [24]:
one_hot = pd.get_dummies(data=IVS_data_subset.loc[: , [c for c in IVS_data_subset.columns if c not in not_to_encode] ], 
                           columns=[c for c in IVS_data_subset.columns if c not in not_to_encode])

In [25]:
one_hot.head()

Unnamed: 0,A001_-5,A001_-4,A001_-2,A001_-1,A001_1,A001_2,A001_3,A001_4,A002_-5,A002_-4,...,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10
0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### Multiply by the weights

In [26]:
weighted_one_hot = one_hot.mul(IVS_data_subset["Weight"], axis=0).copy()

In [27]:
weighted_one_hot.head()

Unnamed: 0,A001_-5,A001_-4,A001_-2,A001_-1,A001_1,A001_2,A001_3,A001_4,A002_-5,A002_-4,...,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10
0,0.0,0.926626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.926626,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.926626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.926626,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.926626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.926626,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.352874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.352874,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.926626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.926626,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<br>
<br>
<br>
<br>
<br>

## Groupby Country

### For country statistics we can correct the numbers using S007

In [28]:
weighted_one_hot["Survey"] = IVS_data_subset["Survey"]
weighted_one_hot["Wave"] = IVS_data_subset["Wave"]
weighted_one_hot["Country Numeric"] = IVS_data_subset["Country Numeric"]

In [29]:
grouped_by_country = weighted_one_hot.groupby(["Survey", "Wave", "Country Numeric"]).mean().copy()

In [30]:
grouped_by_country["Sample Size"] = weighted_one_hot.groupby(["Survey", "Wave", "Country Numeric"]).size()

In [31]:
grouped_by_country.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,A001_-5,A001_-4,A001_-2,A001_-1,A001_1,A001_2,A001_3,A001_4,A002_-5,A002_-4,...,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10,Sample Size
Survey,Wave,Country Numeric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
EVS,1981-1984,56,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1145
EVS,1981-1984,124,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1254
EVS,1981-1984,208,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1182
EVS,1981-1984,250,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1200
EVS,1981-1984,276,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1305


### Lets add the country names

In [32]:
country_ISO = pd.read_csv("input/IVS/ISO_3611.csv", index_col = "numeric")

In [33]:
country_ISO.head()

Unnamed: 0_level_0,Country,alpha-2,alpha-3
numeric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,Afghanistan,AF,AFG
8,Albania,AL,ALB
12,Algeria,DZ,DZA
16,American Samoa,AS,ASM
20,Andorra,AD,AND


In [34]:
names = []
ISO3 = []

for idx in grouped_by_country.index:
    names.append(country_ISO.loc[idx[2], "Country"])
    ISO3.append(country_ISO.loc[idx[2], "alpha-3"])

In [35]:
grouped_by_country["Country Name"] = names
grouped_by_country["Country ISO3"] = ISO3

### Flat the index

In [36]:
grouped_by_country = grouped_by_country.reset_index()

### Move the last columns to the front

In [37]:
columns = list(grouped_by_country.columns)

In [38]:
grouped_by_country = grouped_by_country[columns[:3] + columns[-2:] + [columns[-3]] + columns[3:-3]]

In [39]:
grouped_by_country.head()

Unnamed: 0,Survey,Wave,Country Numeric,Country Name,Country ISO3,Sample Size,A001_-5,A001_-4,A001_-2,A001_-1,...,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10
0,EVS,1981-1984,56,Belgium,BEL,1145,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,EVS,1981-1984,124,Canada,CAN,1254,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,EVS,1981-1984,208,Denmark,DNK,1182,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EVS,1981-1984,250,France,FRA,1200,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,EVS,1981-1984,276,Germany,DEU,1305,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# ATTENTION! Northern Ireland is separated by the rest of the UK!
grouped_by_country[grouped_by_country["Country ISO3"] == "GBR-NIR"]

Unnamed: 0,Survey,Wave,Country Numeric,Country Name,Country ISO3,Sample Size,A001_-5,A001_-4,A001_-2,A001_-1,...,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10
15,EVS,1981-1984,909,North Ireland,GBR-NIR,312,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44,EVS,1990-1993,909,North Ireland,GBR-NIR,304,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77,EVS,1999-2001,909,North Ireland,GBR-NIR,1000,0.0,0.0,0.003159,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
122,EVS,2008-2010,909,North Ireland,GBR-NIR,500,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Save to pickle

In [41]:
grouped_by_country.to_pickle("temp/IVS_grouped_by_country.pkl")
print("saved to temp/IVS_grouped_by_country.pkl")

saved to temp/IVS_grouped_by_country.pkl


<br>
<br>
<br>
<br>
<br>

## Groupby Country and Region

### For regional aggregation we can not use the weighted answers

In [42]:
one_hot["Survey"] = IVS_data_subset["Survey"]
one_hot["Wave"] = IVS_data_subset["Wave"]
one_hot["Country Numeric"] = IVS_data_subset["Country Numeric"]
one_hot["Region Numeric"] = IVS_data_subset["Region Numeric"]

one_hot["NUTS1"] = IVS_data_subset["x048a_n1"]
one_hot["NUTS2"] = IVS_data_subset["x048b_n2"]
one_hot["NUTS3"] = IVS_data_subset["x048c_n3"]

In [43]:
grouped_by_country_region = one_hot.groupby(["Survey", "Wave", "Country Numeric", 
                                             "Region Numeric", "NUTS1", "NUTS2", 
                                             "NUTS3"]).mean().copy()

In [44]:
grouped_by_country_region["Sample Size"] = one_hot.groupby(["Survey", "Wave", "Country Numeric",
                                                            "Region Numeric", "NUTS1", "NUTS2", "NUTS3"]).size()

### Add the country and region names

In [45]:
region_codes = pd.read_csv("input/IVS/Region_codes.csv", index_col="Code")

In [46]:
region_codes.head()

Unnamed: 0_level_0,Region
Code,Unnamed: 1_level_1
-5,Unknown
-4,Not asked
-3,Not applicable
-2,No answer
-1,Don't know


In [47]:
region_name = []
r_names = []
r_ISO3 = []


for idx in grouped_by_country_region.index:
    region_name.append(region_codes.loc[idx[3], "Region"])
    r_names.append(country_ISO.loc[idx[2], "Country"])
    r_ISO3.append(country_ISO.loc[idx[2], "alpha-3"])
    

In [48]:
grouped_by_country_region["Country Name"] = r_names
grouped_by_country_region["Country ISO3"] = r_ISO3
grouped_by_country_region["Region Name"] = region_name

### Flat the index

In [49]:
grouped_by_country_region = grouped_by_country_region.reset_index()

### Rearrange the columns

In [50]:
columns_r = list(grouped_by_country_region.columns)

In [51]:
grouped_by_country_region = grouped_by_country_region[columns_r[:3] + columns_r[-3:-1]
                                                      + [columns_r[3]]+ [columns_r[-1]]
                                                      + columns_r[4:7]
                                                      + [columns_r[-4]]+ columns_r[7:-4]]

In [52]:
grouped_by_country_region.head()

Unnamed: 0,Survey,Wave,Country Numeric,Country Name,Country ISO3,Region Numeric,Region Name,NUTS1,NUTS2,NUTS3,...,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10
0,EVS,1981-1984,56,Belgium,BEL,56021,BE: unknown,-4,-4,-4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,EVS,1981-1984,56,Belgium,BEL,56022,BE: unknown,-4,-4,-4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,EVS,1981-1984,56,Belgium,BEL,56023,BE: unknown,-4,-4,-4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EVS,1981-1984,56,Belgium,BEL,56024,BE: unknown,-4,-4,-4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,EVS,1981-1984,56,Belgium,BEL,56025,BE: unknown,-4,-4,-4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
# ATTENTION! Northern Ireland is separated by the rest of the UK!
grouped_by_country_region[grouped_by_country_region["Country ISO3"] == "GBR-NIR"].head()

Unnamed: 0,Survey,Wave,Country Numeric,Country Name,Country ISO3,Region Numeric,Region Name,NUTS1,NUTS2,NUTS3,...,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10
111,EVS,1981-1984,909,North Ireland,GBR-NIR,909001,NIR: unknown,-4,-4,-4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112,EVS,1981-1984,909,North Ireland,GBR-NIR,909002,NIR: unknown,-4,-4,-4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
113,EVS,1981-1984,909,North Ireland,GBR-NIR,909003,NIR: unknown,-4,-4,-4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
114,EVS,1981-1984,909,North Ireland,GBR-NIR,909004,NIR: unknown,-4,-4,-4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115,EVS,1981-1984,909,North Ireland,GBR-NIR,909005,NIR: unknown,-4,-4,-4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Save to pickle

In [54]:
grouped_by_country_region.to_pickle("temp/IVS_grouped_by_country_region.pkl")
print("saved to temp/IVS_grouped_by_country_region.pkl")

saved to temp/IVS_grouped_by_country_region.pkl


<br>
<br>
<br>
<br>
<br>

# Properties Dictionary
## We need to have the three tables in the same shape before concatenating them

In [55]:
import numpy as np
import pandas as pd
import json

# IVS Country Data

## Create dictionary with the IVS anwers descriptions

In [56]:
IVS_variables = pd.read_csv("input/IVS/IVS_Variable_List.csv")

In [57]:
IVS_variables = IVS_variables.loc[IVS_variables.Included == 1].copy()
IVS_variables.drop(["Included", "Last EVS", "Last WVS"], axis=1, inplace=True)

In [58]:
IVS_variables

Unnamed: 0,Name,Label,Categories
0,A001,Important in life: Family,1:Very important\n2:Rather important\n3:Not ve...
1,A002,Important in life: Friends,1:Very important\n2:Rather important\n3:Not ve...
4,A005,Important in life: Work,1:Very important\n2:Rather important\n3:Not ve...
8,A009,State of health (subjective),1:Very good\n2:Good\n3:Fair\n4:Poor\n5:Very po...
29,A029,Important child qualities: independence,0:Not mentioned\n1:Important\n-5:Missing; Unkn...
...,...,...,...
1349,x048c_n3,Region: NUTS-3 code,??
1354,X049,Size of town,"1:2,000 and less\n2:2,000-5,000\n3:5,000-10,00..."
1357,X051,Ethnic group,(*) See annexe
1358,X052,Institution of occupation,1:Public institution\n2:Private business\n3:Pr...


In [59]:
cat_dict_column = []

for idx, row in IVS_variables.iterrows():
    codes = {}
    for line in row.Categories.split("\n")[:-1]: # there is always a \n at the end
        try:
            key = line.split(":")[0]
            value = line.split(":")[1]
            codes[key] = value
        except:
            continue
    cat_dict_column.append(codes)

In [60]:
IVS_variables["Categories"] = cat_dict_column

In [61]:
IVS_variables.set_index("Name", inplace=True)

In [62]:
variables_dict = IVS_variables.to_dict(orient="index")

## We need to pop some keys
ZMEN_010 is not present in the dataset

the following ones are the values we aggregated on

S002
S002EVS
S001
S017

X048WVS
X048
x048a_n1
x048b_n2
x048c_n3

In [65]:
len(variables_dict.keys())

86

In [66]:
to_drop = ["S001", "S002", "S002EVS", "S003", "S017",
          "X048WVS", "X048", "x048a_n1", "x048b_n2", "x048c_n3"]

for key in to_drop:
    variables_dict.pop(key)

len(variables_dict.keys())

76

## Load Country data

In [67]:
# load the list of GID at the country level (admn_0)
country_gid = pd.read_csv("input/gid/admn_0.csv")
country_gid.head()

Unnamed: 0,country,countrycode,countrycode_alpha2,adm_level,adm_area_1,adm_area_1_code,adm_area_2,adm_area_2_code,adm_area_3,adm_area_3_code,gid
0,Aruba,ABW,AW,0,,,,,,,ABW
1,Anguilla,AIA,AI,0,,,,,,,AIA
2,Palestina,PSE,PS,0,,,,,,,PSE
3,Cyprus,CYP,CY,0,,,,,,,CYP
4,Micronesia,FSM,FM,0,,,,,,,FSM


In [68]:
# load the IVS data aggregated by country
IVS_country = pd.read_pickle("temp/IVS_grouped_by_country.pkl")
IVS_country.head()

Unnamed: 0,Survey,Wave,Country Numeric,Country Name,Country ISO3,Sample Size,A001_-5,A001_-4,A001_-2,A001_-1,...,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10
0,EVS,1981-1984,56,Belgium,BEL,1145,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,EVS,1981-1984,124,Canada,CAN,1254,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,EVS,1981-1984,208,Denmark,DNK,1182,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EVS,1981-1984,250,France,FRA,1200,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,EVS,1981-1984,276,Germany,DEU,1305,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
# check for strange missing countries
IVS_country[~ IVS_country["Country ISO3"].isin(country_gid.countrycode)]

Unnamed: 0,Survey,Wave,Country Numeric,Country Name,Country ISO3,Sample Size,A001_-5,A001_-4,A001_-2,A001_-1,...,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10
15,EVS,1981-1984,909,North Ireland,GBR-NIR,312,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44,EVS,1990-1993,909,North Ireland,GBR-NIR,304,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77,EVS,1999-2001,909,North Ireland,GBR-NIR,1000,0.0,0.0,0.003159,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87,EVS,2008-2010,197,Northern Cyprus,,500,0.0,0.0,0.00739,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
122,EVS,2008-2010,909,North Ireland,GBR-NIR,500,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
123,EVS,2008-2010,915,Kosovo,,1601,0.0,0.0,0.008117,0.001159,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
160,WVS,1994-1998,101,Republika Srpska,,400,0.0,0.0,0.0,0.005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
207,WVS,1994-1998,914,Bosnia,,800,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
IVS_country_has_gid = IVS_country[IVS_country["Country ISO3"].isin(country_gid.countrycode)].copy()

In [71]:
IVS_country_has_gid['json'] = IVS_country_has_gid.apply(lambda x: x.to_dict(), axis=1)

In [72]:
IVS_country_has_gid

Unnamed: 0,Survey,Wave,Country Numeric,Country Name,Country ISO3,Sample Size,A001_-5,A001_-4,A001_-2,A001_-1,...,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10,json
0,EVS,1981-1984,56,Belgium,BEL,1145,0.0,1.0,0.000000,0.000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,"{'Survey': 'EVS', 'Wave': '1981-1984', 'Countr..."
1,EVS,1981-1984,124,Canada,CAN,1254,0.0,1.0,0.000000,0.000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,"{'Survey': 'EVS', 'Wave': '1981-1984', 'Countr..."
2,EVS,1981-1984,208,Denmark,DNK,1182,0.0,1.0,0.000000,0.000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,"{'Survey': 'EVS', 'Wave': '1981-1984', 'Countr..."
3,EVS,1981-1984,250,France,FRA,1200,0.0,1.0,0.000000,0.000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,"{'Survey': 'EVS', 'Wave': '1981-1984', 'Countr..."
4,EVS,1981-1984,276,Germany,DEU,1305,0.0,1.0,0.000000,0.000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,"{'Survey': 'EVS', 'Wave': '1981-1984', 'Countr..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,WVS,2010-2014,818,Egypt,EGY,1523,0.0,0.0,0.000000,0.000,...,0.049620,0.073058,0.045724,0.052922,0.037831,0.046540,0.063399,0.017063,0.032774,"{'Survey': 'WVS', 'Wave': '2010-2014', 'Countr..."
362,WVS,2010-2014,840,United States of America (the),USA,2232,0.0,0.0,0.003548,0.000,...,0.043508,0.054001,0.043248,0.152357,0.101210,0.100090,0.122997,0.110341,0.136770,"{'Survey': 'WVS', 'Wave': '2010-2014', 'Countr..."
363,WVS,2010-2014,858,Uruguay,URY,1000,0.0,0.0,0.000000,0.001,...,0.104000,0.062000,0.057000,0.130000,0.037000,0.040000,0.065000,0.045000,0.064000,"{'Survey': 'WVS', 'Wave': '2010-2014', 'Countr..."
364,WVS,2010-2014,860,Uzbekistan,UZB,1500,0.0,0.0,0.000000,0.000,...,0.094667,0.078667,0.075333,0.078667,0.061333,0.031333,0.058000,0.042000,0.094667,"{'Survey': 'WVS', 'Wave': '2010-2014', 'Countr..."


In [73]:
IVS_country_has_gid.json

0      {'Survey': 'EVS', 'Wave': '1981-1984', 'Countr...
1      {'Survey': 'EVS', 'Wave': '1981-1984', 'Countr...
2      {'Survey': 'EVS', 'Wave': '1981-1984', 'Countr...
3      {'Survey': 'EVS', 'Wave': '1981-1984', 'Countr...
4      {'Survey': 'EVS', 'Wave': '1981-1984', 'Countr...
                             ...                        
361    {'Survey': 'WVS', 'Wave': '2010-2014', 'Countr...
362    {'Survey': 'WVS', 'Wave': '2010-2014', 'Countr...
363    {'Survey': 'WVS', 'Wave': '2010-2014', 'Countr...
364    {'Survey': 'WVS', 'Wave': '2010-2014', 'Countr...
365    {'Survey': 'WVS', 'Wave': '2010-2014', 'Countr...
Name: json, Length: 358, dtype: object

In [74]:
import copy

# now we have a flat dict, we will need to organize it
columns = IVS_country_has_gid.columns

properties = []

for idx, row in IVS_country_has_gid.iterrows():
    new_dict = {}
    
    # for each row we create a dict
    # {..., 
    #  column_id: {label: 'description',
    #              frequencies: {dict with the anwers onehot encoded}
    #.             categories: {dict with the answers codes}}, 
    #  ...}
    
    # label and categories are from variables_dict
    
    for answer_code in variables_dict.keys():
        sub_dict = {}
        # we already have label and categories
        sub_dict = variables_dict[answer_code]
        
        # we need frequencies
        # in the row json we have a flat dict
        # we want to aggregate by name of the answer
        sub_dict["Frequencies"] = {}
        
        for key, item in row.json.items():
            if key.startswith(answer_code):
                sub_dict["Frequencies"][key] = item
        new_dict[answer_code] = sub_dict
       
    properties.append(copy.deepcopy(new_dict)) 


In [75]:
# add adm_area_* columns
IVS_country_has_gid["adm_area_1"] = None
IVS_country_has_gid["adm_area_2"] = None
IVS_country_has_gid["adm_area_3"] = None

In [76]:
df_to_save = IVS_country_has_gid[ ['Survey', 'Wave', 
                                   'Country Name', 'Country ISO3', 
                                   'adm_area_1', 'adm_area_2', 'adm_area_3',
                                   'Sample Size'] ].copy()

In [77]:
gid_column = []

for level_0_gid in IVS_country_has_gid['Country ISO3']:
    gid_column.append([level_0_gid])
    
df_to_save["gid"] = gid_column

In [78]:
df_to_save["Properties"] = properties

In [79]:
df_to_save.head()

Unnamed: 0,Survey,Wave,Country Name,Country ISO3,adm_area_1,adm_area_2,adm_area_3,Sample Size,gid,Properties
0,EVS,1981-1984,Belgium,BEL,,,,1145,[BEL],{'A001': {'Label': 'Important in life: Family'...
1,EVS,1981-1984,Canada,CAN,,,,1254,[CAN],{'A001': {'Label': 'Important in life: Family'...
2,EVS,1981-1984,Denmark,DNK,,,,1182,[DNK],{'A001': {'Label': 'Important in life: Family'...
3,EVS,1981-1984,France,FRA,,,,1200,[FRA],{'A001': {'Label': 'Important in life: Family'...
4,EVS,1981-1984,Germany,DEU,,,,1305,[DEU],{'A001': {'Label': 'Important in life: Family'...


In [80]:
for idx, row in df_to_save[df_to_save["Country ISO3"] == "ITA"].iterrows():
    print(row.Properties['A001'], "\n")

{'Label': 'Important in life: Family', 'Categories': {'1': 'Very important', '2': 'Rather important', '3': 'Not very important', '4': 'Not at all important', '-5': 'Missing; Unknown', '-4': 'Not asked in survey', '-3': 'Not applicable', '-2': 'No answer', '-1': 'Don´t know'}, 'Frequencies': {'A001_-5': 0.0, 'A001_-4': 0.9999999783424628, 'A001_-2': 0.0, 'A001_-1': 0.0, 'A001_1': 0.0, 'A001_2': 0.0, 'A001_3': 0.0, 'A001_4': 0.0}} 

{'Label': 'Important in life: Family', 'Categories': {'1': 'Very important', '2': 'Rather important', '3': 'Not very important', '4': 'Not at all important', '-5': 'Missing; Unknown', '-4': 'Not asked in survey', '-3': 'Not applicable', '-2': 'No answer', '-1': 'Don´t know'}, 'Frequencies': {'A001_-5': 0.0, 'A001_-4': 0.0, 'A001_-2': 0.0, 'A001_-1': 0.0005334439491662914, 'A001_1': 0.8812247075435845, 'A001_2': 0.10511809376071313, 'A001_3': 0.010436732079521996, 'A001_4': 0.0026869769291339154}} 

{'Label': 'Important in life: Family', 'Categories': {'1': 'V

In [81]:
df_to_save.to_pickle("to_upload/IVS_country_GID.pkl")

<br>
<br>
<br>
<br>
<br>

# IVS Region Data

### Load the IVS data grouped by region

In [82]:
IVS_region = pd.read_pickle("temp/IVS_grouped_by_country_region.pkl")
IVS_region.head()

Unnamed: 0,Survey,Wave,Country Numeric,Country Name,Country ISO3,Region Numeric,Region Name,NUTS1,NUTS2,NUTS3,...,X053_1,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10
0,EVS,1981-1984,56,Belgium,BEL,56021,BE: unknown,-4,-4,-4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,EVS,1981-1984,56,Belgium,BEL,56022,BE: unknown,-4,-4,-4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,EVS,1981-1984,56,Belgium,BEL,56023,BE: unknown,-4,-4,-4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EVS,1981-1984,56,Belgium,BEL,56024,BE: unknown,-4,-4,-4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,EVS,1981-1984,56,Belgium,BEL,56025,BE: unknown,-4,-4,-4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Load the region codes to GID dictionaries and add the list of GID to the IVS data

### WVS/EVS

In [93]:
VWS_to_gid = pd.read_csv("input/gid/region_codes_to_GID_WVS_EVS.csv", low_memory=False)
VWS_to_gid.drop("Region Name", inplace=True, axis=1)
#remove empty lines
VWS_to_gid.dropna(how="all", inplace=True) 
VWS_to_gid['Country Numeric'] = VWS_to_gid['Country Numeric'].astype(np.int64)
VWS_to_gid['Region Numeric'] = VWS_to_gid['Region Numeric'].astype(np.int64)
VWS_to_gid.head()

Unnamed: 0,Survey,Wave,Country Numeric,Country ISO3,Region Numeric,gid
0,WVS,2010-2014,840,USA,840001,USA.20_1
1,WVS,2010-2014,840,USA,840001,USA.46_1
2,WVS,2010-2014,840,USA,840001,USA.30_1
3,WVS,2010-2014,840,USA,840001,USA.22_1
4,WVS,2010-2014,840,USA,840001,USA.40_1


In [94]:
# aggregate the gid 
VWS_to_list_gid = VWS_to_gid.groupby(list(VWS_to_gid.columns[:-1])).agg({'gid':lambda x: list(x)}).reset_index()
VWS_to_list_gid.head()

Unnamed: 0,Survey,Wave,Country Numeric,Country ISO3,Region Numeric,gid
0,WVS,1999-2004,50,BGD,50001,[BGD.3.1_1]
1,WVS,1999-2004,50,BGD,50002,[BGD.2.4_1]
2,WVS,1999-2004,50,BGD,50003,[BGD.1.2_1]
3,WVS,1999-2004,50,BGD,50004,[BGD.3.10_1]
4,WVS,1999-2004,50,BGD,50005,[BGD.3.17_1]


In [95]:
len(VWS_to_list_gid["Region Numeric"].unique())

372

In [96]:
# slice the IVS_region, we consider only the region for witch we have mapping to gids 
IVS_subset_VWS = pd.merge(IVS_region, VWS_to_list_gid, 
                on = ['Survey', 'Wave', 'Country Numeric', 'Country ISO3',
                      'Region Numeric'],
                sort=False)

In [97]:
IVS_subset_VWS.head()

Unnamed: 0,Survey,Wave,Country Numeric,Country Name,Country ISO3,Region Numeric,Region Name,NUTS1,NUTS2,NUTS3,...,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10,gid
0,WVS,1999-2004,50,Bangladesh,BGD,50001,BD: Dhaka,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[BGD.3.1_1]
1,WVS,1999-2004,50,Bangladesh,BGD,50002,BD: Chittagong,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[BGD.2.4_1]
2,WVS,1999-2004,50,Bangladesh,BGD,50003,BD: Barisal,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[BGD.1.2_1]
3,WVS,1999-2004,50,Bangladesh,BGD,50004,BD: Mymensingh,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[BGD.3.10_1]
4,WVS,1999-2004,50,Bangladesh,BGD,50005,BD: Tangail,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[BGD.3.17_1]


In [98]:
# lets check if we left something out
VWS_to_list_gid[~VWS_to_list_gid["Region Numeric"].isin(IVS_subset_VWS["Region Numeric"])]

Unnamed: 0,Survey,Wave,Country Numeric,Country ISO3,Region Numeric,gid


### NUTS

In [104]:
NUTS_to_gid = pd.read_csv("input/gid/region_codes_to_GID_NUTS.csv", low_memory=False)
NUTS_to_gid.drop(["Region Numeric", "NUTS3"], inplace=True, axis=1)
#remove empty lines
NUTS_to_gid.dropna(how="all", inplace=True) 
NUTS_to_gid['Country Numeric'] = NUTS_to_gid['Country Numeric'].astype(np.int64)

In [105]:
len(NUTS_to_gid.NUTS2.unique())

105

In [111]:
NUTS_to_list_gid = NUTS_to_gid.groupby(list(NUTS_to_gid.columns[:-1])).agg({'gid':lambda x: list(x)}).reset_index()
NUTS_to_list_gid.head()

Unnamed: 0,Survey,Wave,Country Numeric,Country ISO3,Region Name,NUTS1,NUTS2,gid
0,EVS,2008-2010,56,BEL,Unknown,BE1,BE10,[BEL.1.1_1]
1,EVS,2008-2010,56,BEL,Unknown,BE2,BE21,[BEL.2.1_1]
2,EVS,2008-2010,56,BEL,Unknown,BE2,BE22,[BEL.2.2_1]
3,EVS,2008-2010,56,BEL,Unknown,BE2,BE23,[BEL.2.3_1]
4,EVS,2008-2010,56,BEL,Unknown,BE2,BE24,[BEL.2.4_1]


In [116]:
# slice the IVS_region, we consider only the region for witch we have mapping to gids 
IVS_subset_NUTS = pd.merge(IVS_region, NUTS_to_list_gid, 
                on = ['Survey', 'Wave', 'Country Numeric', 'Country ISO3',
                      'NUTS1', 'NUTS2'],
                sort=False)
IVS_subset_NUTS.drop(["Region Name_x"], inplace=True, axis=1)
IVS_subset_NUTS.rename(columns={"Region Name_y": "Region Name"}, inplace=True)

In [117]:
IVS_subset_NUTS.head()

Unnamed: 0,Survey,Wave,Country Numeric,Country Name,Country ISO3,Region Numeric,NUTS1,NUTS2,NUTS3,Sample Size,...,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10,Region Name,gid
0,EVS,2008-2010,56,Belgium,BEL,-5,BE1,BE10,-5,127,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Unknown,[BEL.1.1_1]
1,EVS,2008-2010,56,Belgium,BEL,-5,BE2,BE21,-5,225,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Unknown,[BEL.2.1_1]
2,EVS,2008-2010,56,Belgium,BEL,-5,BE2,BE22,-5,114,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Unknown,[BEL.2.2_1]
3,EVS,2008-2010,56,Belgium,BEL,-5,BE2,BE23,-5,165,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Unknown,[BEL.2.3_1]
4,EVS,2008-2010,56,Belgium,BEL,-5,BE2,BE24,-5,123,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Unknown,[BEL.2.4_1]


## Concatenate the two dataframes

In [118]:
IVS_region = pd.concat([IVS_subset_VWS, IVS_subset_NUTS], ignore_index=True)
IVS_region.head()

Unnamed: 0,Survey,Wave,Country Numeric,Country Name,Country ISO3,Region Numeric,Region Name,NUTS1,NUTS2,NUTS3,...,X053_2,X053_3,X053_4,X053_5,X053_6,X053_7,X053_8,X053_9,X053_10,gid
0,WVS,1999-2004,50,Bangladesh,BGD,50001,BD: Dhaka,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[BGD.3.1_1]
1,WVS,1999-2004,50,Bangladesh,BGD,50002,BD: Chittagong,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[BGD.2.4_1]
2,WVS,1999-2004,50,Bangladesh,BGD,50003,BD: Barisal,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[BGD.1.2_1]
3,WVS,1999-2004,50,Bangladesh,BGD,50004,BD: Mymensingh,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[BGD.3.10_1]
4,WVS,1999-2004,50,Bangladesh,BGD,50005,BD: Tangail,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[BGD.3.17_1]


<br>
<br>
<br>

## Create dictionary with the anwers descriptions

In [126]:
IVS_variables = pd.read_csv("input/IVS/IVS_Variable_List.csv")

In [127]:
IVS_variables = IVS_variables.loc[IVS_variables.Included == 1].copy()
IVS_variables.drop(["Included", "Last EVS", "Last WVS"], axis=1, inplace=True)

In [128]:
cat_dict_column = []

for idx, row in IVS_variables.iterrows():
    codes = {}
    for line in row.Categories.split("\n")[:-1]: # there is always a \n at the end
        try:
            key = line.split(":")[0]
            value = line.split(":")[1]
            codes[key] = value
        except:
            continue
    cat_dict_column.append(codes)

In [129]:
IVS_variables["Categories"] = cat_dict_column

In [130]:
IVS_variables.set_index("Name", inplace=True)

In [131]:
variables_dict = IVS_variables.to_dict(orient="index")

## We need to pop some keys
ZMEN_010 is not present in the dataset

the following ones are the values we aggregated on

S002
S002EVS
S001
S017

X048WVS
X048
x048a_n1
x048b_n2
x048c_n3

In [132]:
len(variables_dict.keys())

86

In [133]:
to_drop = ["S002", "S002EVS", "S001", "S017",
          "X048WVS", "X048", "x048a_n1", "x048b_n2", "x048c_n3"]


for key in to_drop:
    variables_dict.pop(key)

len(variables_dict.keys())

77

### Dump the answers in a json

In [134]:
IVS_region['json'] = IVS_region.apply(lambda x: x.to_dict(), axis=1)

In [135]:
import copy

# now we have a flat dict, we will need to organize it
properties = []

for idx, row in IVS_region.iterrows():
    new_dict = {}
    
    # for each row we create a dict
    # {..., 
    #  column_id: {label: 'description',
    #              frequencies: {dict with the anwers onehot encoded}
    #.             categories: {dict with the answers codes}}, 
    #  ...}
    
    # label and categories are from variables_dict
    
    # add the original region labels
    new_dict["original_region_code"]= {
        'Region Numeric': row.json['Region Numeric'],
        'Region Name': row.json['Region Name'],
        'NUTS1': row.json['NUTS1'],
        'NUTS2': row.json['NUTS2'],
        'NUTS3': row.json['NUTS3'],
    }
    
    
    for answer_code in variables_dict.keys():
        
        # we already have label and categories
        sub_dict = variables_dict[answer_code]
        
        # we need frequencies
        # in the row json we have a flat dict
        # we want to aggregate by name of the answer
        sub_dict["Frequencies"] = {}
        
        for key, item in row.json.items():
            if key.startswith(answer_code):
                sub_dict["Frequencies"][key] = item
        new_dict[answer_code] = sub_dict
        
    properties.append(copy.deepcopy(new_dict))    

In [137]:
# add adm_area_* columns
IVS_region["adm_area_1"] = IVS_region["Region Name"]
IVS_region["adm_area_2"] = None
IVS_region["adm_area_3"] = None

In [140]:
df_to_save = IVS_region[['Survey', 'Wave', 
                         'Country Name', 'Country ISO3', 'gid',
                         'adm_area_1', 'adm_area_2', 'adm_area_3',
                         'Sample Size']].copy()
df_to_save["Properties"] = properties

In [145]:
df_to_save.head()

Unnamed: 0,Survey,Wave,Country Name,Country ISO3,gid,adm_area_1,adm_area_2,adm_area_3,Sample Size,Properties
0,WVS,1999-2004,Bangladesh,BGD,[BGD.3.1_1],BD: Dhaka,,,199,{'original_region_code': {'Region Numeric': 50...
1,WVS,1999-2004,Bangladesh,BGD,[BGD.2.4_1],BD: Chittagong,,,121,{'original_region_code': {'Region Numeric': 50...
2,WVS,1999-2004,Bangladesh,BGD,[BGD.1.2_1],BD: Barisal,,,105,{'original_region_code': {'Region Numeric': 50...
3,WVS,1999-2004,Bangladesh,BGD,[BGD.3.10_1],BD: Mymensingh,,,128,{'original_region_code': {'Region Numeric': 50...
4,WVS,1999-2004,Bangladesh,BGD,[BGD.3.17_1],BD: Tangail,,,81,{'original_region_code': {'Region Numeric': 50...


In [146]:
df_to_save.to_pickle("to_upload/IVS_region_GID.pkl")

<br>
<br>
<br>
<br>
<br>

<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>

# Merge the data

In [147]:
import pandas as pd

### IVS country

In [152]:
IVS_country = pd.read_pickle("to_upload/IVS_country_GID.pkl")
IVS_country.head()

Unnamed: 0,Survey,Wave,Country Name,Country ISO3,adm_area_1,adm_area_2,adm_area_3,Sample Size,gid,Properties
0,EVS,1981-1984,Belgium,BEL,,,,1145,[BEL],{'A001': {'Label': 'Important in life: Family'...
1,EVS,1981-1984,Canada,CAN,,,,1254,[CAN],{'A001': {'Label': 'Important in life: Family'...
2,EVS,1981-1984,Denmark,DNK,,,,1182,[DNK],{'A001': {'Label': 'Important in life: Family'...
3,EVS,1981-1984,France,FRA,,,,1200,[FRA],{'A001': {'Label': 'Important in life: Family'...
4,EVS,1981-1984,Germany,DEU,,,,1305,[DEU],{'A001': {'Label': 'Important in life: Family'...


In [153]:
IVS_country = IVS_country.rename(columns={"Survey": "source",
                                          "Wave": "wave",
                                          "Country Name": "country",
                                          "Country ISO3": "countrycode",
                                          "Sample Size": "samplesize",
                                          "Properties": "properties"})
IVS_country.head()

Unnamed: 0,source,wave,country,countrycode,adm_area_1,adm_area_2,adm_area_3,samplesize,gid,properties
0,EVS,1981-1984,Belgium,BEL,,,,1145,[BEL],{'A001': {'Label': 'Important in life: Family'...
1,EVS,1981-1984,Canada,CAN,,,,1254,[CAN],{'A001': {'Label': 'Important in life: Family'...
2,EVS,1981-1984,Denmark,DNK,,,,1182,[DNK],{'A001': {'Label': 'Important in life: Family'...
3,EVS,1981-1984,France,FRA,,,,1200,[FRA],{'A001': {'Label': 'Important in life: Family'...
4,EVS,1981-1984,Germany,DEU,,,,1305,[DEU],{'A001': {'Label': 'Important in life: Family'...


### IVS region

In [154]:
IVS_region = pd.read_pickle("to_upload/IVS_region_GID.pkl")
IVS_region = IVS_region.rename(columns={"Survey": "source",
                                          "Wave": "wave",
                                          "Country Name": "country",
                                          "Country ISO3": "countrycode",
                                          "Sample Size": "samplesize",
                                          "Properties": "properties"})

IVS_region.head()

Unnamed: 0,source,wave,country,countrycode,gid,adm_area_1,adm_area_2,adm_area_3,samplesize,properties
0,WVS,1999-2004,Bangladesh,BGD,[BGD.3.1_1],BD: Dhaka,,,199,{'original_region_code': {'Region Numeric': 50...
1,WVS,1999-2004,Bangladesh,BGD,[BGD.2.4_1],BD: Chittagong,,,121,{'original_region_code': {'Region Numeric': 50...
2,WVS,1999-2004,Bangladesh,BGD,[BGD.1.2_1],BD: Barisal,,,105,{'original_region_code': {'Region Numeric': 50...
3,WVS,1999-2004,Bangladesh,BGD,[BGD.3.10_1],BD: Mymensingh,,,128,{'original_region_code': {'Region Numeric': 50...
4,WVS,1999-2004,Bangladesh,BGD,[BGD.3.17_1],BD: Tangail,,,81,{'original_region_code': {'Region Numeric': 50...


### Merge

In [157]:
surveys = pd.concat([IVS_country, IVS_region], ignore_index=True)
surveys.to_pickle("to_upload/surveys.pkl", protocol = 3)
surveys

Unnamed: 0,source,wave,country,countrycode,adm_area_1,adm_area_2,adm_area_3,samplesize,gid,properties
0,EVS,1981-1984,Belgium,BEL,,,,1145,[BEL],{'A001': {'Label': 'Important in life: Family'...
1,EVS,1981-1984,Canada,CAN,,,,1254,[CAN],{'A001': {'Label': 'Important in life: Family'...
2,EVS,1981-1984,Denmark,DNK,,,,1182,[DNK],{'A001': {'Label': 'Important in life: Family'...
3,EVS,1981-1984,France,FRA,,,,1200,[FRA],{'A001': {'Label': 'Important in life: Family'...
4,EVS,1981-1984,Germany,DEU,,,,1305,[DEU],{'A001': {'Label': 'Important in life: Family'...
...,...,...,...,...,...,...,...,...,...,...
828,EVS,2008-2010,United Kingdom of Great Britain and Northern I...,GBR,Unknown,,,22,"[GBR.4.13_1, GBR.4.15_1, GBR.4.5_1, GBR.4.21_1...",{'original_region_code': {'Region Numeric': -5...
829,EVS,2008-2010,United Kingdom of Great Britain and Northern I...,GBR,Unknown,,,43,"[GBR.3.3_1, GBR.3.5_1, GBR.3.7_1, GBR.3.10_1, ...",{'original_region_code': {'Region Numeric': -5...
830,EVS,2008-2010,United Kingdom of Great Britain and Northern I...,GBR,Unknown,,,43,"[GBR.3.4_1, GBR.3.6_1, GBR.3.8_1, GBR.3.9_1, G...",{'original_region_code': {'Region Numeric': -5...
831,EVS,2008-2010,United Kingdom of Great Britain and Northern I...,GBR,Unknown,,,6,"[GBR.3.4_1, GBR.3.13_1, GBR.3.17_1, GBR.3.20_1...",{'original_region_code': {'Region Numeric': -5...


<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>