In [1]:
import pandas as pd

In [2]:
from data_prep import *

In [3]:
# load_files() is not defined in the visible notebook; presumably it is
# provided by the star import from data_prep above — confirm. Its result is
# unpacked into exactly two names: `responses` (indexed with .loc below, so
# it is used as a DataFrame) and `codebook` (not used in the visible cells).
responses, codebook = load_files()

In [37]:
# Variables included in the analysis.
# Keys are the column labels selected from `responses`; values are short
# human-readable descriptions that end up in the `des` column of the results
# table. Insertion order determines the row order (and integer index) of
# that table, so do not reorder entries casually.
# NOTE(review): keys look like survey question codes — confirm against `codebook`.
all_vars = {
    'V2A': 'country code',
    'V4': 'import in life: family',
    'V6': 'import in life: leisure time',
    'V8': 'import in life: work',
    'V10': 'feeling of happiness',
    'V23': 'life satisfaction',
    'V26': 'active member: sports',
    'V29': 'active member: political party',
    'V51': 'men better political leaders',
    'V52': 'university edu for boy more important',
    'V57': 'marital status',
    'V66': 'willingness to fight for country',
    'V79': 'Tradition important',
    'V80': 'most serious world problem',
    'V95': 'self positioning on political scale',
    'V97': 'private vs state ownership',
    'V100': 'Hard work brings success',
    'V148': 'Believe in God',
    'V160F': 'Outgoing, sociable',
    'V160J': 'Active imagination',
    'V179': 'victim of crime',
    'V178': 'carried knife for security',
    'V187': 'war is necessary',
    'V209': 'Justifiable parents beat children',
    'V211': 'proud of nationality',
    'V225': 'how often use personal computer',
    'V227': 'how often vote',
    'V238': 'social class subjective',
    'V248': 'highest educational level',
}

In [38]:
# Keep only the analysed columns. The labels are passed as a list because
# recent pandas versions reject a dict as a .loc indexer (deprecated in 1.5,
# TypeError from 2.0); list(all_vars) yields the keys in insertion order.
df = responses.loc[:, list(all_vars)]

# Drop negative responses: blank out every negative value, then discard each
# row that has at least one blank, so only fully answered rows remain.
# NOTE(review): assumes negative codes mean "no valid answer" — confirm
# against the codebook.
df = df.mask(df < 0).dropna()

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.dummy import DummyClassifier

In [40]:
def bayes(df, all_vars, test_var, test_size=0.33, random_state=42):
    """Score a multinomial naive Bayes classifier against a majority-class baseline.

    The column ``test_var`` is used as the target and every other column of
    ``df`` as a feature. The data is split once into train and test parts,
    both classifiers are fitted on the train part and scored on the test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the target column and all feature columns.
    all_vars : dict
        Not used by this function; kept so existing calls keep working.
    test_var : str
        Label of the column in ``df`` to predict.
    test_size : float, default 0.33
        Passed to ``train_test_split``; share of rows held out for scoring.
    random_state : int, default 42
        Passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(model_score, base_score)``: the values returned by ``.score`` on
        the held-out rows for the naive Bayes model and for the
        ``most_frequent`` dummy classifier, in that order.

    Raises
    ------
    KeyError
        If ``test_var`` is not a column of ``df``.
    """
    y = df[test_var]
    X = df.drop(test_var, axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # The estimator is MultinomialNB (not GaussianNB), so name it neutrally.
    model = MultinomialNB()
    model.fit(X_train, y_train)
    model_score = model.score(X_test, y_test)

    # The 'most_frequent' strategy is deterministic, so no random_state is
    # needed for the baseline.
    baseline = DummyClassifier(strategy='most_frequent')
    baseline.fit(X_train, y_train)
    base_score = baseline.score(X_test, y_test)

    return model_score, base_score

In [41]:
# Use every variable in turn as the prediction target and collect one record
# per variable with the model score and the baseline score.
res = []
for var_code, description in all_vars.items():
    model_acc, baseline_acc = bayes(df, all_vars, test_var=var_code)
    res.append({
        'var': var_code,
        'des': description,
        'score': model_acc,
        'baseline': baseline_acc,
    })

In [42]:
# Tabulate the collected records; the explicit column list fixes the column order.
res_df = pd.DataFrame(
    data=res,
    columns=['var', 'des', 'score', 'baseline'],
)

In [43]:
# Difference between model score and baseline score; a negative value means
# the model scored below the baseline for that variable.
res_df['improve'] = res_df['score'].sub(res_df['baseline'])

In [48]:
# Show the results ordered from the highest baseline score to the lowest.
res_df.sort_values('baseline', ascending=False)

Unnamed: 0,var,des,score,baseline,improve
20,V179,victim of crime,0.728721,0.90117,-0.172449
21,V178,carried knife for security,0.891489,0.897943,-0.006454
1,V4,import in life: family,0.723276,0.866478,-0.143203
17,V148,Believe in God,0.671238,0.832594,-0.161355
7,V29,active member: political party,0.602662,0.799919,-0.197257
6,V26,active member: sports,0.546995,0.69161,-0.144615
11,V66,willingness to fight for country,0.585518,0.666599,-0.081081
3,V8,import in life: work,0.47862,0.615974,-0.137354
26,V227,how often vote,0.46309,0.604679,-0.141589
24,V211,proud of nationality,0.403388,0.590157,-0.186769


In [7]:
# df_nan = df.where(df > 0)
# df_nan.isnull().sum()

In [26]:
# NOTE(review): `gnb`, `X` and `y` are not defined at notebook level in the
# visible cells — they appear only as local names inside bayes() — so this
# cell raises NameError on a fresh kernel unless they are defined elsewhere.
# TODO: define the estimator, feature table and target explicitly before
# this call (and move the import to the top import cell).
from sklearn.model_selection import cross_val_score

# Cross-validate the estimator with cv=5; the result is shown as the cell output.
scores = cross_val_score(gnb, X, y, cv=5)
scores

array([0.812074  , 0.8247191 , 0.85198502, 0.84089888, 0.81138577])

In [107]:
# Extended variable set. This rebinds `all_vars`: it contains every entry of
# the earlier 29-entry mapping plus seven additional keys (V5, V7, V9, V25,
# V76, V197, V207A). Keys and values play the same roles as before (column
# label -> short description).
# NOTE(review): objects built from the earlier mapping (e.g. `df`, `res_df`)
# are not updated by this assignment; the dependent cells must be re-run.
all_vars = {
    'V2A': 'country code',
    'V4': 'import in life: family',
    'V5': 'import in life: friends',
    'V6': 'import in life: leisure time',
    'V7': 'import in life: politics',
    'V8': 'import in life: work',
    'V9': 'import in life: religion',
    'V10': 'feeling of happiness',
    'V23': 'life satisfaction',
    'V25': 'active member: church',
    'V26': 'active member: sports',
    'V29': 'active member: political party',
    'V51': 'men better political leaders',
    'V52': 'university edu for boy more important',
    'V57': 'marital status',
    'V66': 'willingness to fight for country',
    'V76': 'Adventure and risks important',
    'V79': 'Tradition important',
    'V80': 'most serious world problem',
    'V95': 'self positioning on political scale',
    'V97': 'private vs state ownership',
    'V100': 'Hard work brings success',
    'V148': 'Believe in God',
    'V160F': 'Outgoing, sociable',
    'V160J': 'Active imagination',
    'V179': 'victim of crime',
    'V178': 'carried knife for security',
    'V187': 'war is necessary',
    'V197': 'science is good',
    'V207A': 'Euthanasia justifiable',
    'V209': 'Justifiable parents beat children',
    'V211': 'proud of nationality',
    'V225': 'how often use personal computer',
    'V227': 'how often vote',
    'V238': 'social class subjective',
    'V248': 'highest educational level',
}