Peijin_coding discussion5

In [1]:
import pandas as pd
import numpy as np
import pprint as pp # for printing
import scipy.stats as st # for Normal PDF
# Silence warnings 
import warnings
warnings.filterwarnings("ignore")

In [2]:
turnout_data = pd.read_csv("turnout.csv")# read in data

In [3]:
# Train-Test split (just using Pandas)
train = turnout_data.sample(frac=.8).reset_index(drop=True)
test = turnout_data.drop(train.index).reset_index(drop=True)

In [4]:
# Print off the split count 
print("Training Data:",train.shape[0],
      "\nTest Data:",test.shape[0])
# Look at the head of the data
train.head()

Training Data: 1600 
Test Data: 400


Unnamed: 0,id,age,educate,income,vote,white
0,1239,50,12.0,11.8939,0,1
1,251,23,12.0,2.522,1,1
2,93,30,15.0,7.7751,1,1
3,756,55,13.0,8.7545,1,1
4,574,26,8.0,2.1309,1,1


Coding below follows these five steps:
1.	Calculating class probabilities, the proportion of people vote and not vote;
2.	Calculating conditional probabilities, including the probabilities of three continuous variables(age, educate, income) and one dichotomous variable(white);
3.	Multiplying conditional probabilities and predicting a single observation to test;
4.	Conducting a function to predicting multiple observations;
5.	Examining the predictive accuracy. 


Calculate Class Probabilities: Pr(class)

In [5]:
N = train.shape[0]##count the number of the obs
vote1 = train.query("vote == 1")# Subset the data by class
vote0 = train.query("vote == 0")
pr_vote1 = vote1.shape[0]/N# Calculate the probability for each class
pr_vote0 = vote0.shape[0]/N

# Print the probabilities
print(
f"""
Pr(vote = 1): {pr_vote1}
Pr(vote = 0): {pr_vote0}
""")


Pr(vote = 1): 0.74125
Pr(vote = 0): 0.25875



Calculate the Conditional Probabilities Pr(data|class)

In [6]:
##Calculate the Conditional Probabilities of each category 
# Given vote == 1
w1_vote1 = vote1.query("white == 1").shape[0]/vote1.shape[0]
w0_vote1 = vote1.query("white == 0").shape[0]/vote1.shape[0]

# Given vote == 0
w1_vote0 = vote0.query("white == 1").shape[0]/vote0.shape[0]
w0_vote0 = vote0.query("white == 0").shape[0]/vote0.shape[0]

print(
f"""
Pr(white  = 1 |vote = 1): {w1_vote1}
Pr(white  = 0 |vote = 1): {w0_vote1}
Pr(white  = 1 |vote = 0): {w1_vote0}
Pr(white  = 0 |vote = 0): {w0_vote0}
""")


Pr(white  = 1 |vote = 1): 0.8768971332209107
Pr(white  = 0 |vote = 1): 0.12310286677908938
Pr(white  = 1 |vote = 0): 0.7922705314009661
Pr(white  = 0 |vote = 0): 0.20772946859903382



Calculate the conditional means/standard deviations

In [7]:
# Collect the mean and standard dev. of each conditional distribution
dist_locs = \
{("age",1):{'mean':vote1.age.mean(),'sd':vote1.age.std()},
 ("age",0):{'mean':vote0.age.mean(),'sd':vote1.age.std()},
 ("educate",1):{'mean':vote1.educate.mean(),'sd':vote1.educate.std()},
 ("educate",0):{'mean':vote0.educate.mean(),'sd':vote0.educate.std()},
 ("income",1):{'mean':vote1.income.mean(),'sd':vote1.income.std()},
 ("income",0):{'mean':vote0.income.mean(),'sd':vote0.income.std()}
}
pp.pprint(dist_locs)# Print

{('age', 0): {'mean': 42.78019323671498, 'sd': 16.978818227174425},
 ('age', 1): {'mean': 46.42664418212479, 'sd': 16.978818227174425},
 ('educate', 0): {'mean': 10.785024154589372, 'sd': 3.2227126579837133},
 ('educate', 1): {'mean': 12.603709949409781, 'sd': 3.322058189262507},
 ('income', 0): {'mean': 2.8050637681159425, 'sd': 2.233623471467791},
 ('income', 1): {'mean': 4.279906492411462, 'sd': 2.9359841296004734}}


Predicting a single observation

In [8]:
code,age,educate,income,vote,white  = train.iloc[1,:]##take the first ob as an example 
print(code,age,educate,income,vote,white )

251.0 23.0 12.0 2.522 1.0 1.0


In [9]:
# Prediction for the 1 class
a = st.norm(dist_locs[("age",1)]['mean'], dist_locs[("age",1)]['sd']).pdf(age)
b = st.norm(dist_locs[("educate",1)]['mean'], dist_locs[("educate",1)]['sd']).pdf(educate)
c = st.norm(dist_locs[("income",1)]['mean'], dist_locs[("income",1)]['sd']).pdf(income)
d = w1_vote1 
e = pr_vote1
pr_1 = a * b * c * d * e

# Prediction for the 0 class
a = st.norm(dist_locs[("age",0)]['mean'], dist_locs[("age",0)]['sd']).pdf(age)
b = st.norm(dist_locs[("educate",0)]['mean'], dist_locs[("educate",0)]['sd']).pdf(educate)
c = st.norm(dist_locs[("income",0)]['mean'], dist_locs[("income",0)]['sd']).pdf(income)
d = w1_vote0
e = pr_vote0
pr_0 =  a * b * c * d * e

print(
f'''
    Pr(vote == 1| X): {pr_1}
    Pr(vote == 0| X): {pr_0}
''')


    Pr(vote == 1| X): 7.909843320187016e-05
    Pr(vote == 0| X): 4.9920653367261156e-05



 0.00011 is greater than 4.3575944751434353e-05, so we predict that vote=1

Predicting multiple observations

In [10]:
def predict(data,dist_locs):
    '''
    Function multiplys the conditional probabilities of the continuous variable with the conditional probabilities of the binary variable to predicte. The results is defined by the higher probability.
    -----
    Arguments:
    data is the dataset(with three continuous variables in the front of row ) need working on
    dist_locs is the dictionary containing the conditional probabilities of the continuous variables
    -----
    Results:
    A df containing the predicted result of each respondent will be returned 
    
    '''
    store_preds = []##create a null list to store data 
    for i,row in data.iterrows():
        
        # Get the predictions using a Gaussan distribution
        pr_0 = 1; pr_1 = 1##assign  pr_0 and pr_1 to a value 
        for j in range(1,4):##interate through the three continuous variables

            pr_0 *= st.norm(dist_locs[(row.index[j],0)]['mean'],
                            dist_locs[(row.index[j],0)]['sd']).pdf(row.values[j])
            pr_1 *= st.norm(dist_locs[(row.index[j],1)]['mean'], 
                            dist_locs[(row.index[j],1)]['sd']).pdf(row.values[j])
        #take the race(white==1) into consideration
        if(row["white"]==1):
            pr_0*=w1_vote0
        else:
            pr_0*=w0_vote0
    
        if(row["white"]==1):
            pr_1*=w1_vote1
        else:
            pr_1*=w0_vote1
        pr_0 *= pr_vote0##multiply conditional probabilities of all variables 
        pr_1 *= pr_vote1
        # Assign the class designation to the highest probability
        if pr_0 >= pr_1:
            class_pred = 0
        else:
            class_pred = 1
        ## iterate through each observation 
        store_preds.append([pr_0,pr_1,class_pred])
        ##store the list as a df 
    return pd.DataFrame(store_preds,columns=["pr_0","pr_1","prediction"])

In [11]:
# Run this function on train dataset
preds_train = predict(train,dist_locs)

In [12]:
# Predicted output
preds_train.head(10)

Unnamed: 0,pr_0,pr_1,prediction
0,2.300313e-08,8e-06,1
1,4.992065e-05,7.9e-05,1
2,2.869141e-06,5.9e-05,1
3,1.869407e-06,6.8e-05,1
4,4.298307e-05,3.5e-05,0
5,1.658771e-05,1.4e-05,0
6,8.397865e-07,4.3e-05,1
7,4.765175e-05,0.000144,1
8,2.063641e-05,9.8e-05,1
9,1.501618e-08,1e-06,1


In [13]:
# Run this function on test dataset
preds_test = predict(test,dist_locs)

In [14]:
# Predicted output
preds_test.head(10)

Unnamed: 0,pr_0,pr_1,prediction
0,5.209828e-06,1.363811e-05,1
1,2.373622e-05,0.0001780097,1
2,2.847919e-05,6.752376e-05,1
3,8.391783e-05,0.0001789943,1
4,1.955891e-06,4.258618e-05,1
5,5.342983e-05,7.97875e-05,1
6,5.233246e-08,1.469802e-08,0
7,4.042522e-05,7.248681e-05,1
8,9.388557e-06,7.243149e-06,0
9,3.607851e-05,0.0001034091,1


Examine the predictive accuracy of the training data.

In [15]:
accuracy_train = sum(train.vote == preds_train.prediction)/train.shape[0]##Examine the predictive accuracy of the training data.
accuracy_train

0.73625

Examine the predictive accuracy on the test data.

In [16]:
accuracy_test = sum(test.vote == preds_test.prediction)/test.shape[0]##Examine the predictive accuracy on the test data.
accuracy_test

0.705

In a nut shell, with an accuracy of 70.5% on the test data, the prediction performs better than chance(coin flip), which would be 50%.