# Bayesian Values Guesser

## Retrieving Data 

In [1]:
import pandas as pd
import numpy as np

### Save paths to data files
(You have to download these yourself, from http://www.worldvaluessurvey.org/WVSDocumentationWV6.jsp)

In [2]:
# Excel codebook describing each survey variable (loaded with pd.read_excel below)
CODEBOOK_FILE = 'data/F00003861-WV6_Codebook_v_2014_11_07.xls'
# Delimited response data without a header row (loaded with pd.read_csv below)
RESPONSES_FILE = 'data/WV6_Data_ascii_delimited_v_2016_01_01.dat'

### Load Codebook

In [4]:
# Read the codebook from the workbook's 'master' sheet; the column names sit on
# the fourth row of that sheet (header=3).
# The keyword must be `sheet_name`: `sheet` is not a `read_excel` parameter, so
# current pandas rejects it with a TypeError instead of selecting the sheet.
codebook_raw = pd.read_excel(io=CODEBOOK_FILE, sheet_name='master', header=3)

# Make headers lowercase
codebook_raw.columns = codebook_raw.columns.str.lower()

# Derive the cleaned codebook as a new frame rather than mutating in place:
#   * index the rows by variable name ('var')
#   * drop the unwanted 'filter' and 'length' columns
#   * keep only the first 339 rows -- the responses cell keeps the same number
#     of columns (labels 0..338) and names them from this index, so the two
#     counts have to stay in sync.
codebook = (
    codebook_raw
    .set_index('var')
    .drop(['filter', 'length'], axis=1)
    .iloc[:339]
)

In [7]:
# Preview the first five codebook rows (label, question, categories per variable)
codebook.head(n=5)

Unnamed: 0_level_0,label,question,categories
var,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
V1,Wave,Wave number,1##1981-1984\n2##1989-1993\n3##1994-1999\n4##1...
V2,Country Code,Country code,
V2A,Country/regions [with split ups],Country/regions [with split ups]\r\n\r\nThis i...,8##Albania\n12##Algeria\n16##American Samoa\n2...
V3,Interview number,Interview number,1#99999999#1..999999999\n-5##Missing; Unknown\...
V4,Important in life: Family,"For each of the following, indicate how import...",1##Very important\n2##Rather important\n3##Not...


In [8]:
# When I need the categories, I can parse them
def parse_cat(var):
    """Parse the codebook's category string for one variable into a table.

    The 'categories' cell holds one entry per line, each written as
    ``code##label`` (e.g. ``1##Very important``).

    Parameters
    ----------
    var : str
        Variable name, i.e. a label of the ``codebook`` index such as 'V4'.

    Returns
    -------
    pandas.DataFrame
        One row per entry, split on '##': column 0 holds the code (kept as a
        string) and column 1 the label.  An entry that contains no '##' only
        fills column 0.  When the variable has no categories at all (the cell
        is missing or blank) an empty frame with columns 0 and 1 is returned,
        so lookups such as ``cats[1] == 'Yes'`` still work.
    """
    raw = codebook.loc[var, 'categories']

    # Variables without categories come back as NaN (a float), which has no
    # string methods -- return an empty table instead of raising.
    if not isinstance(raw, str) or not raw.strip():
        return pd.DataFrame(columns=[0, 1])

    # Normalise Windows line endings so labels do not keep a trailing '\r',
    # then drop the trailing newline(s) before splitting into entries.
    entries = raw.replace('\r\n', '\n').rstrip('\n').split('\n')
    return pd.DataFrame([entry.split('##') for entry in entries])

In [9]:
# E.g., list the answer codes and labels the codebook defines for V4
# ("Important in life: Family")
parse_cat('V4')

Unnamed: 0,0,1
0,1,Very important
1,2,Rather important
2,3,Not very important
3,4,Not at all important
4,-5,"BH: Missing; AR,DE,SE: Inapplicable;RU: Inappr..."
5,-4,Not asked in survey
6,-3,Not applicable
7,-2,No answer
8,-1,Don´t know


### Responses

In [10]:
# Parse the headerless delimited file and keep only the columns labelled
# 0 through 338 (label-based slice, so both ends are included)
responses = pd.read_csv(RESPONSES_FILE, header=None).loc[:, :338]

# Name the retained columns after the codebook's variable names
responses.columns = codebook.index.values

In [11]:
# Preview the first ten respondents
responses.head(10)

Unnamed: 0,V1,V2,V2A,V3,V4,V5,V6,V7,V8,V9,...,V256B,V256C,V257,V258,V260,V261,V262,V263,V264,V265
0,6,12,12,1.0,1,1,1,-2,1,1,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126
1,6,12,12,2.0,1,2,3,4,2,2,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126
2,6,12,12,3.0,1,3,2,4,2,1,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126
3,6,12,12,4.0,1,1,3,4,3,1,...,-4,-4,-4,12002,1.0,1.0,5,-4,2013,126
4,6,12,12,5.0,1,1,1,2,1,1,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126
5,6,12,12,6.0,1,2,2,2,4,1,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126
6,6,12,12,7.0,1,1,1,1,1,1,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126
7,6,12,12,8.0,1,1,1,1,2,2,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126
8,6,12,12,9.0,1,1,1,2,2,2,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126
9,6,12,12,10.0,1,1,1,2,1,1,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126


## What percentage of Algerian females believe in God?

In [10]:
# Readable aliases for the survey variables used in this question
my_vars = dict(
    country='V2A',
    sex='V240',
    god='V148',
)

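Look up the numeric codes that the codebook assigns to the values of interest (country "Algeria", sex "Female", and answer "Yes" for the God variable):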

In [20]:
# Find the code the codebook uses for Algeria
country = parse_cat(my_vars['country'])
is_algeria = country[1] == 'Algeria'
country[is_algeria]

Unnamed: 0,0,1
1,12,Algeria


In [21]:
# Find the code the codebook uses for female respondents
sex = parse_cat(my_vars['sex'])
is_female = sex[1] == 'Female'
sex[is_female]

Unnamed: 0,0,1
1,2,Female


In [23]:
# Find the code the codebook uses for a 'Yes' answer to the God variable
god = parse_cat(my_vars['god'])
is_yes = god[1] == 'Yes'
god[is_yes]

Unnamed: 0,0,1
0,1,Yes


In [24]:
# Codes from the lookups above: V2A == 12 (Algeria), V240 == 2 (Female),
# V148 == 1 (Yes).  This only selects the matching respondents.
algerian_female_believers = '(V2A == 12) & (V240 == 2) & (V148 == 1)'
responses.query(algerian_female_believers)

Unnamed: 0,V1,V2,V2A,V3,V4,V5,V6,V7,V8,V9,...,V256B,V256C,V257,V258,V260,V261,V262,V263,V264,V265
1,6,12,12,2.0,1,2,3,4,2,2,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126
2,6,12,12,3.0,1,3,2,4,2,1,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126
3,6,12,12,4.0,1,1,3,4,3,1,...,-4,-4,-4,12002,1.0,1.0,5,-4,2013,126
4,6,12,12,5.0,1,1,1,2,1,1,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126
7,6,12,12,8.0,1,1,1,1,2,2,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126
9,6,12,12,10.0,1,1,1,2,1,1,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126
10,6,12,12,11.0,1,2,3,4,1,1,...,-4,-4,-4,19,1.0,1.0,5,-4,2013,126
13,6,12,12,14.0,1,1,1,1,1,1,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126
15,6,12,12,16.0,1,2,2,3,3,4,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126
17,6,12,12,18.0,1,1,1,2,1,1,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126


## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn import datasets

In [None]:
# Load scikit-learn's bundled iris dataset
# NOTE(review): appears to be a scratch example for trying out GaussianNB --
# nothing visible in this notebook uses it with the survey data; confirm.
iris = datasets.load_iris()

In [None]:
# Show the dataset's feature array
iris.data

In [None]:
# Show the dataset's target (class label) array
iris.target

In [None]:
# Preview the first five respondents
responses.head(n=5)

In [25]:
# Draw a random 80% of the respondents.  `random_state` pins the draw so that
# Restart & Run All reproduces the same rows; without it every run samples a
# different subset.
responses.sample(frac=.80, random_state=0)

Unnamed: 0,V1,V2,V2A,V3,V4,V5,V6,V7,V8,V9,...,V256B,V256C,V257,V258,V260,V261,V262,V263,V264,V265
66503,6,702,702,3320.0,1,1,3,2,1,1,...,-4,-4,-4,128,1.195181,1.195181,3,-4,2012,7026
31718,6,368,368,160.0,1,1,3,4,1,1,...,4,4,-4,1088,1.000000,1.000000,5,20131201,2012,3686
1180,6,12,12,1181.0,1,1,2,3,2,1,...,-4,-4,-4,12001,1.000000,1.000000,5,-4,2013,126
81545,6,792,792,1533.0,1,1,1,3,1,1,...,-4,-4,-4,1165,1.008543,1.008543,2,20110714,2011,7926
17285,6,818,818,275.0,1,3,3,2,1,1,...,-4,8180098,-4,19,0.101296,0.101296,4,20130309,2013,8186
2625,6,51,51,396.0,1,1,1,4,1,1,...,-4,-4,51007,25,0.710284,0.710284,2,20111004,2011,516
48764,6,528,528,812683.0,1,3,1,3,2,4,...,528002,-4,-4,120,1.000000,1.000000,4,20121203,2012,5286
43792,6,434,434,2176.0,1,1,1,2,1,1,...,-4,-4,-4,19,0.401586,0.401586,4,20140128,2014,4346
26428,6,356,356,529.0,1,2,3,4,1,1,...,-4,-4,-4,179,1.000000,1.000000,4,-4,2012,3566
5139,6,31,31,333.0,1,1,3,4,3,3,...,31012,-4,-4,40,0.388018,0.388018,3,20111218,2011,316
