# Bayesian Values Guesser

> Dustin Michels<br>
> August 2017<br>
> Creative Commons

---
## I. Retrieving Data 

In [1]:
import pandas as pd
import numpy as np
import ipywidgets as ipyw

### Save paths to data files
(You have to download these yourself, from http://www.worldvaluessurvey.org/WVSDocumentationWV6.jsp)

In [2]:
# Both survey files are expected in the local data directory
DATA_DIR = 'data'
CODEBOOK_FILE = DATA_DIR + '/' + 'F00003861-WV6_Codebook_v_2014_11_07.xls'
RESPONSES_FILE = DATA_DIR + '/' + 'WV6_Data_ascii_delimited_v_2016_01_01.dat'

### Load Codebook

In [3]:
# Read the 'master' sheet of the codebook workbook; header=3 takes the column
# names from the fourth row. `sheet_name` is the keyword read_excel accepts
# (`sheet=` is not one of its parameters).
codebook = pd.read_excel(io=CODEBOOK_FILE, sheet_name='master', header=3)
# Make headers lowercase
codebook.columns = codebook.columns.str.lower()
# Index by variable name and drop columns that are not needed.
# Plain reassignment (no inplace=True) keeps this cell safe to re-run.
codebook = codebook.set_index('var').drop(['filter', 'length'], axis=1)
# Keep only the first 339 rows
codebook = codebook.iloc[:339]

In [4]:
# Preview the first rows of the trimmed codebook
codebook.head()

Unnamed: 0_level_0,label,question,categories
var,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
V1,Wave,Wave number,1##1981-1984\n2##1989-1993\n3##1994-1999\n4##1...
V2,Country Code,Country code,
V2A,Country/regions [with split ups],Country/regions [with split ups]\r\n\r\nThis i...,8##Albania\n12##Algeria\n16##American Samoa\n2...
V3,Interview number,Interview number,1#99999999#1..999999999\n-5##Missing; Unknown\...
V4,Important in life: Family,"For each of the following, indicate how import...",1##Very important\n2##Rather important\n3##Not...


In [5]:
# When I need the categories, I can parse them
def parse_cat(var, book=None):
    """Parse one variable's category string into a two-column DataFrame.

    The 'categories' cell holds newline-separated entries, each written as
    ``code##label``. Every entry becomes one row, split on ``'##'``; the
    pieces are kept as strings.

    Parameters
    ----------
    var : str
        Variable name to look up in the codebook index (e.g. 'V4').
    book : pandas.DataFrame, optional
        Codebook with a 'categories' column, indexed by variable name.
        Defaults to the notebook-level ``codebook``.

    Returns
    -------
    pandas.DataFrame
        One row per category entry. Empty (with columns 0 and 1) when the
        variable has no category text.
    """
    if book is None:
        book = codebook
    raw = book.loc[var, 'categories']
    # Variables with no listed categories hold NaN rather than a string;
    # return an empty frame instead of failing on .rstrip().
    if not isinstance(raw, str):
        return pd.DataFrame(columns=[0, 1])
    entries = raw.rstrip('\n').split('\n')
    return pd.DataFrame([entry.split('##') for entry in entries])

In [6]:
# Example: the parsed categories for variable V4
parse_cat('V4')

Unnamed: 0,0,1
0,1,Very important
1,2,Rather important
2,3,Not very important
3,4,Not at all important
4,-5,"BH: Missing; AR,DE,SE: Inapplicable;RU: Inappr..."
5,-4,Not asked in survey
6,-3,Not applicable
7,-2,No answer
8,-1,Don´t know


### Load Responses

In [7]:
# Read responses from csv (the file has no header row)
responses = pd.read_csv(RESPONSES_FILE, header=None)
# Keep only as many leading columns as the codebook has variables, so the
# column count cannot drift out of sync with the codebook
responses = responses.iloc[:, :len(codebook)]
# Label the columns with the codebook variable names
responses.columns = codebook.index.values

In [8]:
# Preview the first rows of the labelled responses
responses.head()

Unnamed: 0,V1,V2,V2A,V3,V4,V5,V6,V7,V8,V9,...,V256B,V256C,V257,V258,V260,V261,V262,V263,V264,V265
0,6,12,12,1.0,1,1,1,-2,1,1,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126
1,6,12,12,2.0,1,2,3,4,2,2,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126
2,6,12,12,3.0,1,3,2,4,2,1,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126
3,6,12,12,4.0,1,1,3,4,3,1,...,-4,-4,-4,12002,1.0,1.0,5,-4,2013,126
4,6,12,12,5.0,1,1,1,2,1,1,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,126


---
## II. Selecting Categories of Interest

### What percentage of Algerian females believe in God?

In [6]:
# Readable names for the survey variable codes used in this question
my_vars = dict(country='V2A', sex='V240', god='V148')

Find the codes the codebook uses for each value of interest:

In [7]:
# Country: look up the code for Algeria
# (parse_cat is the helper defined earlier; `parse` does not exist)
country = parse_cat(my_vars['country'])
country.loc[country[1] == 'Algeria']

Unnamed: 0,0,1
1,12,Algeria


In [8]:
# Sex: look up the code for Female
# (parse_cat is the helper defined earlier; `parse` does not exist)
sex = parse_cat(my_vars['sex'])
sex.loc[sex[1] == 'Female']

Unnamed: 0,0,1
1,2,Female


In [9]:
# Belief in God: look up the code for Yes
# (parse_cat is the helper defined earlier; `parse` does not exist)
god = parse_cat(my_vars['god'])
god.loc[god[1] == 'Yes']

Unnamed: 0,0,1
0,1,Yes


In [14]:
# Select respondents coded Algeria (V2A == 12), female (V240 == 2) and
# answering Yes on V148 (== 1). This lists the matching rows only.
in_algeria = responses['V2A'] == 12
is_female = responses['V240'] == 2
says_yes = responses['V148'] == 1
responses[in_algeria & is_female & says_yes]

Unnamed: 0,V1,V2,V2A,V3,V4,V5,V6,V7,V8,V9,...,MN_237B7,MN_237C1,MN_237C2,MN_237C3,MN_237C4,MN_237C5,MN_237C6,MN_249A1,MN_249A2,MN_249A3
1,6,12,12,2.0,1,2,3,4,2,2,...,-3,-3,-3,-3,-3,-3,-3,-3,-3,2
2,6,12,12,3.0,1,3,2,4,2,1,...,-3,-3,-3,-3,-3,-3,-3,-3,-3,1
3,6,12,12,4.0,1,1,3,4,3,1,...,-3,-3,-3,-3,-3,-3,-3,-3,-3,1
4,6,12,12,5.0,1,1,1,2,1,1,...,-3,-3,-3,-3,-3,-3,-3,-3,-3,1
7,6,12,12,8.0,1,1,1,1,2,2,...,-3,-3,-3,-3,-3,-3,-3,-3,-3,2
9,6,12,12,10.0,1,1,1,2,1,1,...,0,0,0,0,1,0,0,0,0,-3
10,6,12,12,11.0,1,2,3,4,1,1,...,-3,-3,-3,-3,-3,-3,-3,-3,-3,2
13,6,12,12,14.0,1,1,1,1,1,1,...,0,0,0,1,0,0,0,0,0,1
15,6,12,12,16.0,1,2,2,3,3,4,...,0,0,0,0,0,1,0,0,0,2
17,6,12,12,18.0,1,1,1,2,1,1,...,0,0,0,1,0,0,0,0,0,1


## Naive Bayes

In [17]:
from sklearn.naive_bayes import GaussianNB
from sklearn import datasets

In [23]:
# Load the iris dataset that ships with scikit-learn
iris = datasets.load_iris()

In [25]:
# Show the iris feature array (note: this displays the whole array)
iris.data

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2],
       [ 5.4,  3.9,  1.7,  0.4],
       [ 4.6,  3.4,  1.4,  0.3],
       [ 5. ,  3.4,  1.5,  0.2],
       [ 4.4,  2.9,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 5.4,  3.7,  1.5,  0.2],
       [ 4.8,  3.4,  1.6,  0.2],
       [ 4.8,  3. ,  1.4,  0.1],
       [ 4.3,  3. ,  1.1,  0.1],
       [ 5.8,  4. ,  1.2,  0.2],
       [ 5.7,  4.4,  1.5,  0.4],
       [ 5.4,  3.9,  1.3,  0.4],
       [ 5.1,  3.5,  1.4,  0.3],
       [ 5.7,  3.8,  1.7,  0.3],
       [ 5.1,  3.8,  1.5,  0.3],
       [ 5.4,  3.4,  1.7,  0.2],
       [ 5.1,  3.7,  1.5,  0.4],
       [ 4.6,  3.6,  1. ,  0.2],
       [ 5.1,  3.3,  1.7,  0.5],
       [ 4.8,  3.4,  1.9,  0.2],
       [ 5. ,  3. ,  1.6,  0.2],
       [ 5. ,  3.4,  1.6,  0.4],
       [ 5.2,  3.5,  1.5,  0.2],
       [ 5.2,  3.4,  1.4,  0.2],
       [ 4.7,  3.2,  1.6,  0.2],
       [ 4

In [26]:
# Show the integer class label of every iris sample
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [29]:
# Preview the responses again before modelling.
# NOTE(review): the recorded output for this cell shows MN_* columns, which
# the column trimming in the loading cell would have removed. It looks like
# output from an earlier kernel state; re-run from a fresh kernel to confirm.
responses.head()

Unnamed: 0,V1,V2,V2A,V3,V4,V5,V6,V7,V8,V9,...,MN_237B7,MN_237C1,MN_237C2,MN_237C3,MN_237C4,MN_237C5,MN_237C6,MN_249A1,MN_249A2,MN_249A3
0,6,12,12,1.0,1,1,1,-2,1,1,...,-3,-3,-3,-3,-3,-3,-3,-3,-3,1
1,6,12,12,2.0,1,2,3,4,2,2,...,-3,-3,-3,-3,-3,-3,-3,-3,-3,2
2,6,12,12,3.0,1,3,2,4,2,1,...,-3,-3,-3,-3,-3,-3,-3,-3,-3,1
3,6,12,12,4.0,1,1,3,4,3,1,...,-3,-3,-3,-3,-3,-3,-3,-3,-3,1
4,6,12,12,5.0,1,1,1,2,1,1,...,-3,-3,-3,-3,-3,-3,-3,-3,-3,1


In [37]:
# Draw a random 80% of the rows; the fixed seed makes the draw repeatable
# across kernel restarts
responses.sample(frac=.80, random_state=0)

Unnamed: 0,V1,V2,V2A,V3,V4,V5,V6,V7,V8,V9,...,MN_237B7,MN_237C1,MN_237C2,MN_237C3,MN_237C4,MN_237C5,MN_237C6,MN_249A1,MN_249A2,MN_249A3
72936,6,710,710,3492.0,1,1,1,1,1,1,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
38245,6,414,414,1149230.0,1,2,-2,3,1,1,...,0,0,0,0,0,0,0,1,0,1
89503,6,716,716,654.0,1,2,1,2,1,1,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
27650,6,356,356,1751.0,1,1,1,3,1,1,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
3292,6,51,51,1063.0,1,1,2,1,2,1,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
59802,6,642,642,129.0,1,2,2,3,1,1,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
4640,6,36,36,86277.0,1,2,1,4,2,4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
74949,6,752,752,934.0,1,1,3,3,4,2,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
63656,6,643,643,2480.0,1,2,2,3,2,2,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
12473,6,196,196,932.0,1,1,1,1,2,3,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
