# Coding Discussion 5

In [1]:
#importing packages
import pandas as pd
import pprint as pp
import scipy.stats as st

In [2]:
#importing data
turnout=pd.read_csv("/Users/carolineadams/coding_discussions_ppol564_fall2021/05_coding_discussion/turnout.csv")

In [3]:
#displaying the first five rows of the data
turnout.head()

Unnamed: 0,id,age,educate,income,vote,white
0,1,60,14.0,3.3458,1,1
1,2,51,10.0,1.8561,0,1
2,3,24,12.0,0.6304,0,1
3,4,38,8.0,3.4183,1,1
4,5,25,12.0,2.7852,1,1


In [4]:
#creating the train data (80% of the original dataset)
train = turnout.sample(frac=.8).reset_index(drop=True)
#creating the test data (20% of the original dataset)
test = turnout.drop(train.index).reset_index(drop=True)

# Print off the split count 
print("Training Data:",train.shape[0],
      "\nTest Data:",test.shape[0])

#displaying the first five rows of the training data
train.head()

Training Data: 1600 
Test Data: 400


Unnamed: 0,id,age,educate,income,vote,white
0,331,62,6.0,2.917,1,1
1,739,45,19.0,7.8949,1,1
2,1839,40,13.0,0.4607,0,1
3,481,41,9.0,4.0702,1,1
4,1351,57,8.0,1.7723,1,1


In [5]:
#calculating class probabilities

#counting the number of observations
N=train.shape[0]

##subset the data by class
vote1 = train.query("vote == 1")
vote0 = train.query("vote == 0")

#calculate the probability for each class
pr_vote_1 = vote1.shape[0]/N
pr_vote_0 = vote0.shape[0]/N

#Print the probabilities
print(
f"""
Pr(vote = 1): {pr_vote_1}
Pr(vote = 0): {pr_vote_0}
""")



Pr(vote = 1): 0.7525
Pr(vote = 0): 0.2475



## Part 1. Discrete Data
A naive Bayes classifier that predicts whether or not a person voted based on discrete outcome contained in the dataset was created. The discrete variable was whether or not a person was white. The classifier is based on an assumption that each variable is independent of the others in the model. Class probabilities were calculated; the probability that someone voted was 0.739375 and the probability that someone did not vote was 0.260625. Conditional probabilities were calculated for each class; whether or not a person was white given that they did or did not vote.

The probabilities were multiplied together for each outcome to predict whether a given person would vote. This was done for each observation in the data. When run on the training data, the classifer was accurate approximately 75% of the time. When run on the test data, the classifer was accurate approximately 70% of the time, which was slightly less accurate than the training data, as expected. However, the classifer worked much better than a coin flip.

In [17]:
#defining a function that takes in a dataset and outcome variable and returns the class and conditional probabilities in the binary data
def calc_probs(data,outcome_var=""):
    '''
    Takes in a dataset and an outcome variable;
    returns the class and conditional probabilities in the binary data.
   
    Arguments
    --------------
    data : a dataset
    outcome_var: a string referring to the outcome variable
    
    Return
    --------------
    Returns the class and conditional probabilities in the binary data    
    '''
    # Generate empty dictionary containers.
    class_probs = {};cond_probs = {}
    # Locate all variables that are not the outcome.
    vars = [v for v in data.columns if v == 'white']
    # iterate through the class outcomes
    for y, d in data.groupby(outcome_var): 
        # calculate the class probabilities
        class_probs.update({y: d.shape[0]/data.shape[0]})
        for v in vars:
            # calculate the conditional probabilities for each variable given the class.
            pr = d[v].sum()/d.shape[0]
            cond_probs[(v,1,y)] = pr 
            cond_probs[(v,0,y)] = 1 - pr
    return class_probs, cond_probs


#running the function on the training data
class_probs, cond_probs = calc_probs(train,outcome_var="vote")

#printing the class and conditional probabilities
print("class probabilities",end="\n\n")
pp.pprint(class_probs)
print("\n")
print("conditional probabilities",end="\n\n")
pp.pprint(cond_probs)

class probabilities

{0: 0.2475, 1: 0.7525}


conditional probabilities

{('white', 0, 0): 0.202020202020202,
 ('white', 0, 1): 0.12624584717607978,
 ('white', 1, 0): 0.797979797979798,
 ('white', 1, 1): 0.8737541528239202}


In [18]:
#creating a subset of the train data with only discrete variables
discrete_dta_train=train[['vote', "white"]]

#defining a function that predicts the most likely class based on conditional probabilities
def predict(data,class_probs,cond_probs):
    '''
    Takes in a dataset, class and conditional probabilities;
    calculates the conditional probability for membership in each class;
    returns both the probabilities and the most likely class.
   
    Arguments
    --------------
    data : a dataset
    class_probs: class probabilities for binary data
    cond_probs: conditional probabilities for binary data
    
    Return
    --------------
    Returns the conditional probability for membership in each class 
    and the most likely class.   
    '''
    store_preds = [] #create an empty list
    for i,row in data.iterrows():  #iterate through rows in dataset and calculate conditional probabilities
        pr_1 = 1; pr_0 = 1
        for j in range(1,len(row.index)):
            pr_0 *= cond_probs[(row.index[j],row.values[j],0)]
            pr_1 *= cond_probs[(row.index[j],row.values[j],1)]     
        pr_0 *= class_probs[0]
        pr_1 *= class_probs[1]
        store_preds.append([pr_0,pr_1,max([(pr_0,0),(pr_1,1)])[1]])  #append probabilities and class with highest probability to list
    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])  #return list information as a dataframe

#Run the function on the train data
preds = predict(discrete_dta_train, class_probs, cond_probs)
#display the first five rows of the training data predictions
preds.head()

Unnamed: 0,pr_0,pr_1,pred
0,0.1975,0.6575,1
1,0.1975,0.6575,1
2,0.1975,0.6575,1
3,0.1975,0.6575,1
4,0.1975,0.6575,1


In [19]:
#calculating the accuracy of the classifer on the training data
train_accuracy_discrete = sum(discrete_dta_train.vote == preds.pred)/discrete_dta_train.shape[0]
train_accuracy_discrete

0.7525

In [20]:
#creating a subset of the test data for discrete variables only
discrete_dta_test=test[['vote', "white"]]
#testing the classifer on the test data
test_preds_discrete = predict(discrete_dta_test, class_probs, cond_probs)
#calculating the accuracy of the classifer on the test data
test_accuracy_discrete = sum(discrete_dta_test.vote == test_preds_discrete.pred)/discrete_dta_test.shape[0]
test_accuracy_discrete

0.7

## Part 2: Continuous Variables
A naive Bayes classifier that predicts whether or not a person voted based on the continuous variables in the dataset (age, education level, income) was created. Again, the classifier is based on an assumption that each variable is independent of the others in the model. Class probabilities calculated above were used; the probability that someone voted was 0.739375 and the probability that someone did not vote was 0.260625. The probability density function for the normal distribution was used to convert the continuous variables into probabilities. The conditional means and standard deviations were calculated for each value of the outcomes. These were then used to calculate predictions for all observations in the dataset.

The probabilities were multiplied together for each outcome to predict whether a given person would vote. This was done for each observation in the data. When run on the training data, the classifer was accurate approximately 75% of the time. When run on the test data, the classifer was accurate approximately 72% of the time, which was slightly less accurate than the training data, as expected. However, the classifer worked much better than a coin flip.

In [9]:
#creating a subset of the data with only continuous variables and the outcome variable
cont_dta_train=train[['vote', "age", "educate", "income"]]

In [10]:
#Collecting the mean and standard deviation of each conditional distribution
dist_locs = \
{("age",1):{'mean':vote1.age.mean(),'sd':vote1.age.std()},
 ("age",0):{'mean':vote0.age.mean(),'sd':vote0.age.std()},
 ("educate",1):{'mean':vote1.educate.mean(),'sd':vote1.educate.std()},
 ("educate",0):{'mean':vote0.educate.mean(),'sd':vote0.educate.std()},
 ("income",1):{'mean':vote1.income.mean(),'sd':vote1.income.std()},
 ("income",0):{'mean':vote0.income.mean(),'sd':vote0.income.std()}
}

#printing the mean and standard deviation and each conditional distribution
pp.pprint(dist_locs)

{('age', 0): {'mean': 42.98232323232323, 'sd': 19.451652252403164},
 ('age', 1): {'mean': 46.41860465116279, 'sd': 16.756111359048045},
 ('educate', 0): {'mean': 10.723484848484848, 'sd': 3.191353663390057},
 ('educate', 1): {'mean': 12.568936877076412, 'sd': 3.298060872523699},
 ('income', 0): {'mean': 2.9112669191919194, 'sd': 2.2969743546380554},
 ('income', 1): {'mean': 4.289075249169438, 'sd': 2.95348997768988}}


In [21]:
#defining a function that predicts the most likely class based on conditional probabilities, using continuous variables
def predict(data,dist_locs):
    '''
    Takes in a dataset, and a dictionary of mean and standard deviations for conditional distributions;
    calculates the conditional probability for membership in each class;
    returns both the probabilities and the most likely class.
   
    Arguments
    --------------
    data : a dataset
    dist_locs: dictionary of meanas and standard deviations for conditional distributions
    
    Return
    --------------
    Returns the conditional probability for membership in each class 
    and the most likely class.   
    '''
    store_preds = []  #create an empty list
    for i,row in data.iterrows():  #iterate through rows in data
        
        #get the predictions using a Gaussan distribution
        pr_0 = 1; pr_1 = 1
        for j in range(1,len(row)):
            pr_0 *= st.norm(dist_locs[(row.index[j],0)]['mean'],
                            dist_locs[(row.index[j],0)]['sd']).pdf(row.values[j])
            pr_1 *= st.norm(dist_locs[(row.index[j],1)]['mean'], 
                            dist_locs[(row.index[j],1)]['sd']).pdf(row.values[j])
        pr_0 *= pr_vote_0
        pr_1 *= pr_vote_1
        
        #assign the class designation to the highest probability
        if pr_0 >= pr_1:
            class_pred = 0
        else:
            class_pred = 1
            
        store_preds.append([pr_0,pr_1,class_pred])  #append probabilities and prediction to list
        
    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","pred"])  #return list as dataframe

#Running the classifer on the continuous training data
preds_train_cont = predict(cont_dta_train,dist_locs)

In [22]:
#displaying the head of continuous training prediction data
preds_train_cont.head()

Unnamed: 0,pr_0,pr_1,pred
0,2.285359e-05,2.3e-05,1
1,3.607385e-07,2.1e-05,1
2,4.780346e-05,0.000116,1
3,8.343578e-05,0.000154,1
4,5.222912e-05,6.4e-05,1


In [23]:
#calculating the percentage of the time that the classifer accurately predicted whether or not someone voted
accuracy_train_cont = sum(train.vote == preds_train_cont.pred)/cont_dta_train.shape[0]
accuracy_train_cont

0.755625

In [24]:
#creating a subset of the test data with only continuous variables and the outcome variable
cont_dta_test=test[['vote', "age", "educate", "income"]]

In [25]:
#running the predict function on the test data
preds_test_cont = predict(cont_dta_test,dist_locs)
#calculating the percentage of the time that the classifer accurately predicted whether or not someone voted
accuracy_test = sum(cont_dta_test.vote == preds_test_cont.pred)/cont_dta_test.shape[0]
accuracy_test

0.7225