In [1]:
import pandas as pd
import numpy as np

from data_prep import (
    all_vars, load_files, about_q, parse_categories,
    locate_col_lables, get_col_labels)

In [2]:
def head_and_tail(df, n=4):
    """Show first and last n items in DF"""
    return df.iloc[np.r_[0:n, -n:0]]

## Load / View data fies

In [3]:
responses, codebook = load_files()

In [4]:
# responses has (90350) individual responses to 339 questions,
# identified by id

head_and_tail(responses, n=3)

Unnamed: 0,V1,V2,V2A,V3,V4,V5,V6,V7,V8,V9,...,V256B,V256C,V256_MAP,V257,V258,V258A,V260,V261,V262,V265
0,6,12,12,1.0,1,1,1,-2,1,1,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,-4
1,6,12,12,2.0,1,2,3,4,2,2,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,-4
2,6,12,12,3.0,1,3,2,4,2,1,...,-4,-4,-4,12001,1.0,1.0,5,-4,2013,-4
90347,6,716,716,1498.0,1,2,1,3,1,1,...,716014,-4,-4,405,2.2801,2.2801,3,20120115,2012,-4
90348,6,716,716,1499.0,1,2,2,3,1,1,...,716011,-4,-4,405,2.2801,2.2801,3,20120114,2012,-4
90349,6,716,716,1500.0,1,2,2,2,2,1,...,716011,-4,-4,405,2.2801,2.2801,3,20120114,2012,-4


In [5]:
# codebook attaches a label and question to each var,
# and the categories that answers correspond to

head_and_tail(codebook, n=2)

Unnamed: 0,label,question,categories
V1,Wave,Wave number,1##1981-1984\n2##1989-1993\n3##1994-1999\n4##1...
V2,Country Code,Country code,
V264,Nation Year,Nation Year,81998##Albania (1998)\n82002##Albania (2002)\n...
V265,Respondent's occupation,Respondent's occupation,1##Employer/ manager of establishment with 10 ...


#### Some helper methods

In [6]:
# print out info
about_q(codebook, 'V225')

Label: How often use of a personal computer

Question: How often, if ever, do you use a personal computer?

Categories: 
 1. Never
 2. Occasionally
 3. Frequently
 4. Don´t know what a computer is
-5. DE,EC:Inapplicable; BH: Missing; RU: Inappropriate response{Inappropriate}
-4. Not asked in survey
-3. Not applicable
-2. No answer
-1. Don´t know
 


In [7]:
# render categories as DF
parse_categories(codebook, 'V25')

Unnamed: 0,0,1
0,0,Not a member
1,1,Inactive member
2,2,Active member
3,-5,SE: Inapplicable;RU: Inappropriate response{In...
4,-4,Not asked in survey
5,-3,Not applicable
6,-2,No answer
7,-1,Don´t know


## Experiment with hand-selected subset of vars

In [8]:
all_vars

{'V10': 'feeling of happiness',
 'V100': 'Hard work brings success',
 'V148': 'Believe in God',
 'V178': 'carried knife for security',
 'V179': 'victim of crime',
 'V187': 'war is necessary',
 'V209': 'Justifiable parents beat children',
 'V211': 'proud of nationality',
 'V225': 'how often use personal computer',
 'V238': 'social class subjective',
 'V248': 'highest educational level',
 'V2A': 'country code',
 'V4': 'import in life: family',
 'V80': 'most serious world problem'}

In [9]:
# Drop negative responses
df = responses.loc[:,all_vars]
df = df.mask(df < 0).dropna()

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.dummy import DummyClassifier

def bayes(df, test_var, NB=MultinomialNB, return_clf=False):
    """Given df and test_var, runs Bayesian and baseline classifier.
    Returns results as (clf_score, base_score) tuple."""
    
    # y is test var, X is all other vars
    y = df[test_var]
    y=y.astype('float')
    X = df.drop(test_var, axis=1)
    
    # split into test/training
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    
    # create, fit, and score bayesian classifier
    clf = NB()
    clf.fit(X_train, y_train)
    clf_score = clf.score(X_test, y_test)
    
    # create, fit, and score baseline classifier
    base = DummyClassifier(strategy='most_frequent', random_state=0)
    base.fit(X_train, y_train)
    base_score = base.score(X_test, y_test)
    
    # return either classifier or scores
    if return_clf:
        return clf
    return clf_score, base_score

In [11]:
# iteratve over all keys, making list of dicts with score data
res = []
for k, v in all_vars.items():
    score, base_score = bayes(df, test_var=k)
    res.append(
        dict(var=k, des=v, score=score, baseline=base_score))

In [12]:
res[0]

{'baseline': 0.06242055565643726,
 'des': 'country code',
 'score': 0.1208007989831124,
 'var': 'V2A'}

In [13]:
# convert list of dicts to DF
res_df = pd.DataFrame(res, columns=['var','des','baseline','score'])

# compute "improve" column (score - baseline)
res_df['improve'] = res_df['score'] - res_df['baseline']

# sort DF by improvement
res_df.sort_values(by=['improve'], ascending=False)

Unnamed: 0,var,des,baseline,score,improve
11,V225,how often use personal computer,0.388506,0.499818,0.111313
0,V2A,country code,0.062421,0.120801,0.05838
13,V248,highest educational level,0.199383,0.215725,0.016343
7,V178,carried knife for security,0.928591,0.928591,0.0
9,V209,Justifiable parents beat children,0.515072,0.514981,-9.1e-05
5,V148,Believe in God,0.869484,0.869167,-0.000318
1,V4,import in life: family,0.920056,0.919103,-0.000953
6,V179,victim of crime,0.912657,0.907663,-0.004994
12,V238,social class subjective,0.357136,0.351507,-0.005629
4,V100,Hard work brings success,0.267432,0.261485,-0.005947


## Data Prep

### Some cleaning

In [14]:
def remove_negatives(responses, cutoff_percent=0.1):
    """Remove all rows with any negative responses.
    Pre-removes columns with more negatives than cutoff_percent"""
    
    # set value based off cutoff_percent and # of responses
    cutoff = cutoff_percent * len(responses)
    
    # count negative responses in each column (s is a Series)
    s = (responses < 0).sum()

    # keep_vars: vars where % negative under cutoff
    keep_vars = s[s <= cutoff]
    keep_vars = list(keep_vars.index)

    # filter df down to keep_vars
    df = responses[keep_vars]

    # Drop negative responses
    df = df.mask(df < 0).dropna()
    
    return df

In [15]:
df = remove_negatives(responses, cutoff_percent=0.1)
df.shape

(16644, 223)

In [16]:
def test_all_vars(df, all_vars, NB=MultinomialNB, verbose=False):
    
    # iteratve over all keys, making list of dicts with score data
    res = []
    for k, v in all_vars.items():
        print(k) if verbose else None
        
        score, base_score = bayes(df, test_var=k, NB=NB)
        res.append(dict(var=k, des=v, score=score, baseline=base_score))
        
    # convert list of dicts to DF
    res_df = pd.DataFrame(res, columns=['var','des','baseline','score'])

    # compute "improve" column (score - baseline)
    res_df['improve'] = res_df['score'] - res_df['baseline']

    # sort DF by improvement
    res_df.sort_values(by=['improve'], ascending=False)
    
    return res_df

In [24]:
drop_vars = {'V1', 'V258', 'V258A', 'V125_16'}

use_df = df.drop(drop_vars, axis=1)
all_vars = codebook.reindex(use_df.columns)['label'].to_dict()

In [25]:
test_all_vars(use_df, all_vars, verbose=False)

  self.class_log_prior_ = (np.log(self.class_count_) -


Unnamed: 0,var,des,baseline,score,improve
0,V2,Country Code,0.084653,0.517204,0.432551
1,V2A,Country/regions [with split ups],0.084653,0.517204,0.432551
2,V3,Interview number,0.002731,0.001274,-0.001456
3,V4,Important in life: Family,0.939559,0.022756,-0.916803
4,V5,Important in life: Friends,0.452394,0.426179,-0.026215
5,V6,Important in life: Leisure time,0.408156,0.359184,-0.048971
6,V7,Important in life: Politics,0.330967,0.321318,-0.009649
7,V8,Important in life: Work,0.734571,0.115420,-0.619152
8,V9,Important in life: Religion,0.606772,0.106499,-0.500273
9,V10,Feeling of happiness,0.435827,0.441289,0.005461


### Question: What's wrong with V258?

- https://stackoverflow.com/questions/43608007/valueerror-unknown-label-type-in-scikit-learn

In [26]:
# test_var = 'V257'
test_var = 'V258'
# test_var = 'V2A'

# y is test var, X is all other vars
y = df[test_var]
y = y.astype('float64')
X = df.drop(test_var, axis=1)

# split into test/training
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

y_train.head()

20579    1.000000
72380    1.439769
57196    0.762832
55349    1.002858
74273    0.647469
Name: V258, dtype: float64

In [27]:
# create, fit, and score bayesian classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)
clf_score = clf.score(X_test, y_test)

ValueError: Unknown label type: (array([1.        , 1.43976859, 0.76283156, ..., 0.77011494, 1.0364008 ,
       1.        ]),)

In [31]:
# clf_score

In [32]:
# base = DummyClassifier(strategy='most_frequent', random_state=0)
# base.fit(X_train, y_train)
# base_score = base.score(X_test, y_test)

In [33]:
# base_score

### Pre-processing with sklearn

In [35]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [42]:
# remove cols with low variance

sel = VarianceThreshold(threshold=0.1)
df_new = sel.fit_transform(use_df)

var_df = locate_col_lables(df_new, use_df)

var_df.shape

  if np.issubdtype(mask.dtype, np.int):


(16644, 214)

In [43]:
all_vars = codebook.reindex(var_df.columns)['label'].to_dict()

In [44]:
test_all_vars(var_df, all_vars, verbose=False)

Unnamed: 0,var,des,baseline,score,improve
0,V2,Country Code,0.084653,0.517204,0.432551
1,V2A,Country/regions [with split ups],0.084653,0.517204,0.432551
2,V3,Interview number,0.002731,0.001274,-0.001456
3,V5,Important in life: Friends,0.452394,0.426179,-0.026215
4,V6,Important in life: Leisure time,0.408156,0.359184,-0.048971
5,V7,Important in life: Politics,0.330967,0.321318,-0.009649
6,V8,Important in life: Work,0.734571,0.115420,-0.619152
7,V9,Important in life: Religion,0.606772,0.106499,-0.500273
8,V10,Feeling of happiness,0.435827,0.441289,0.005461
9,V11,State of health (subjective),0.417987,0.397415,-0.020572


In [26]:
# X_new = SelectKBest(chi2, k=10).fit_transform(X, y)
# X_new.shape

In [27]:
# named_df = locate_col_lables(X_new, df)

In [28]:
# get_col_labels(named_df, codebook)

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.dummy import DummyClassifier

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.33, random_state=42)

clf = MultinomialNB()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.517203713817586

### Working

In [45]:
from sklearn import metrics

test_var = 'V2A'

y = use_df[test_var]
X = use_df.drop(test_var, axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

clf = GaussianNB()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

# expected = y_test
# predicted = clf.predict(X_test)

# print(metrics.classification_report(expected, predicted))
# print(metrics.confusion_matrix(expected, predicted))

1.0

In [46]:
X.shape

(16644, 218)

In [47]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

X_new = SelectKBest(chi2, k=10).fit_transform(X, y)
X_new.shape

  if np.issubdtype(mask.dtype, np.int):


(16644, 10)

In [48]:
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.33, random_state=42)

clf = MultinomialNB()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.3007464045148371