In [1]:
# Import modules
import numpy as np
import pandas as pd

import sklearn.tree
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the voting-records CSV (expected next to the notebook) into a DataFrame
# and preview its first five rows.
df_votes = pd.read_csv(filepath_or_buffer='house-votes-84.data.csv')
df_votes.head(n=5)

Unnamed: 0,Class name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [3]:
# Encode the votes numerically: 'n' -> 0.0, 'y' -> 1.0, '?' -> NaN (the cells
# below treat NaN as a missing value).
#
# Only the vote columns are converted; column 0 (the class label) is carried
# over unchanged. Mapping each column explicitly and casting to float yields
# float64 columns directly, whereas a frame-wide `replace` depends on pandas
# silently downcasting object columns, which recent pandas versions deprecate
# with a FutureWarning.
# Note: any token other than 'n', 'y' or '?' also becomes NaN.
vote_encoding = {'n': 0.0, 'y': 1.0, '?': np.nan}
label_column = df_votes.iloc[:, :1]
encoded_votes = (
    df_votes.iloc[:, 1:]
    .apply(lambda votes: votes.map(vote_encoding))
    .astype(float)
)
df_votes_replace = pd.concat([label_column, encoded_votes], axis=1)
df_votes_replace.head()

Unnamed: 0,Class name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,republican,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,,1.0,1.0,1.0,0.0,1.0
1,republican,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,
2,democrat,,1.0,1.0,,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3,democrat,0.0,1.0,1.0,0.0,,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,democrat,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,,1.0,1.0,1.0,1.0


In [4]:
# Strategy 1 - discard: keep only the rows in which no value is missing.
df_votes1 = df_votes_replace.dropna(axis=0, how='any')
df_votes1.head()

Unnamed: 0,Class name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
5,democrat,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
8,republican,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
19,democrat,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
23,democrat,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25,democrat,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [5]:
# Strategy 2 - treat "missing" as a value of its own: every NaN becomes 2, so a
# binary feature turns into a ternary (three-valued) feature.
df_votes2 = df_votes_replace.replace(to_replace=np.nan, value=2)
df_votes2.head()

Unnamed: 0,Class name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,republican,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0
1,republican,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,2.0
2,democrat,2.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3,democrat,0.0,1.0,1.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,democrat,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,1.0,1.0


In [6]:
# Strategy 3 - impute: for each column, fill the missing entries with that
# column's most common value so nothing is left missing or unknown.
def _fill_with_most_common(column):
    """Return `column` with NaN replaced by its most frequent non-missing value."""
    # value_counts() lists values by descending frequency, so the first index
    # entry is the most common one.
    most_common = column.value_counts().index[0]
    return column.fillna(most_common)


df_votes3 = df_votes_replace.apply(_fill_with_most_common)
df_votes3.head()

Unnamed: 0,Class name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,republican,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
1,republican,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
2,democrat,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3,democrat,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,democrat,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0


In [7]:
# Estimators and splitter shared by the three missing-value experiments.

# Decision tree classifier with a fixed random_state.
# The class is looked up through `sklearn.tree` instead of the bare name: this
# cell rebinds the bare name `DecisionTreeClassifier` to the instance (the
# evaluation cells pass that name to cross_val_predict), so calling the bare
# name would fail with "'DecisionTreeClassifier' object is not callable" as
# soon as the cell is executed a second time.
decision_tree_clf = sklearn.tree.DecisionTreeClassifier(random_state=0)
DecisionTreeClassifier = decision_tree_clf

# Gaussian Naive Bayes classifier with default settings.
NaiveBayesclassifier = GaussianNB()

# 5-fold cross-validation splitter (all other KFold options left at defaults).
FiveFold = KFold(n_splits=5)

In [8]:
# Experiment 1: rows with any missing feature value were discarded (df_votes1).
# Column 0 holds the labels; every remaining column is a feature.
y1 = df_votes1.iloc[:, 0].to_numpy()
X1 = df_votes1.iloc[:, 1:].to_numpy()

# Cross-validated predictions for every row, using the shared 5-fold splitter.
DecisionTreeClassifier_predicts1 = cross_val_predict(DecisionTreeClassifier, X1, y1, cv=FiveFold)
NaiveBayesclassifier_predicts1 = cross_val_predict(NaiveBayesclassifier, X1, y1, cv=FiveFold)

# Report precision, recall and F1 with 'democrat' as the positive class.
print('Discard instances that have missing feature values:')
report_lines = (
    ("Decision tree Precision: {:.2f}", precision_score, DecisionTreeClassifier_predicts1),
    ("Decision tree Recall: {:.2f}", recall_score, DecisionTreeClassifier_predicts1),
    ("Decision tree F1-score: {:.2f}", f1_score, DecisionTreeClassifier_predicts1),
    ("Naïve Bayes Precision: {:.2f}", precision_score, NaiveBayesclassifier_predicts1),
    ("Naïve Bayes Recall: {:.2f}", recall_score, NaiveBayesclassifier_predicts1),
    ("Naïve Bayes F1-score: {:.2f}", f1_score, NaiveBayesclassifier_predicts1),
)
for template, metric, predicted in report_lines:
    print(template.format(metric(y1, predicted, pos_label='democrat')))

Discard instances that have missing feature values:
Decision tree Precision: 0.96
Decision tree Recall: 0.94
Decision tree F1-score: 0.95
Naïve Bayes Precision: 0.95
Naïve Bayes Recall: 0.96
Naïve Bayes F1-score: 0.96


In [9]:
# Experiment 2: "missing" is kept as a third feature value (df_votes2).
# Column 0 holds the labels; every remaining column is a feature.
y2 = df_votes2.iloc[:, 0].to_numpy()
X2 = df_votes2.iloc[:, 1:].to_numpy()

# Cross-validated predictions for every row, using the shared 5-fold splitter.
DecisionTreeClassifier_predicts2 = cross_val_predict(DecisionTreeClassifier, X2, y2, cv=FiveFold)
NaiveBayesclassifier_predicts2 = cross_val_predict(NaiveBayesclassifier, X2, y2, cv=FiveFold)

# Report precision, recall and F1 with 'democrat' as the positive class.
print('Treat “missing” as if it is a value (and thus a binary feature becomes a ternary, or three-valued, feature:')
report_lines = (
    ("Decision tree Precision: {:.2f}", precision_score, DecisionTreeClassifier_predicts2),
    ("Decision tree Recall: {:.2f}", recall_score, DecisionTreeClassifier_predicts2),
    ("Decision tree F1-score: {:.2f}", f1_score, DecisionTreeClassifier_predicts2),
    ("Naïve Bayes Precision: {:.2f}", precision_score, NaiveBayesclassifier_predicts2),
    ("Naïve Bayes Recall: {:.2f}", recall_score, NaiveBayesclassifier_predicts2),
    ("Naïve Bayes F1-score: {:.2f}", f1_score, NaiveBayesclassifier_predicts2),
)
for template, metric, predicted in report_lines:
    print(template.format(metric(y2, predicted, pos_label='democrat')))

Treat “missing” as if it is a value (and thus a binary feature becomes a ternary, or three-valued, feature:
Decision tree Precision: 0.96
Decision tree Recall: 0.97
Decision tree F1-score: 0.96
Naïve Bayes Precision: 0.95
Naïve Bayes Recall: 0.95
Naïve Bayes F1-score: 0.95


In [10]:
# Experiment 3: missing values were imputed with each feature's most common
# value (df_votes3), so nothing is missing or unknown any more.
# Column 0 holds the labels; every remaining column is a feature.
y3 = df_votes3.iloc[:, 0].to_numpy()
X3 = df_votes3.iloc[:, 1:].to_numpy()

# Cross-validated predictions for every row, using the shared 5-fold splitter.
DecisionTreeClassifier_predicts3 = cross_val_predict(DecisionTreeClassifier, X3, y3, cv=FiveFold)
NaiveBayesclassifier_predicts3 = cross_val_predict(NaiveBayesclassifier, X3, y3, cv=FiveFold)

# Report precision, recall and F1 with 'democrat' as the positive class.
print('Impute missing values:')
report_lines = (
    ("Decision tree Precision: {:.2f}", precision_score, DecisionTreeClassifier_predicts3),
    ("Decision tree Recall: {:.2f}", recall_score, DecisionTreeClassifier_predicts3),
    ("Decision tree F1-score: {:.2f}", f1_score, DecisionTreeClassifier_predicts3),
    ("Naïve Bayes Precision: {:.2f}", precision_score, NaiveBayesclassifier_predicts3),
    ("Naïve Bayes Recall: {:.2f}", recall_score, NaiveBayesclassifier_predicts3),
    ("Naïve Bayes F1-score: {:.2f}", f1_score, NaiveBayesclassifier_predicts3),
)
for template, metric, predicted in report_lines:
    print(template.format(metric(y3, predicted, pos_label='democrat')))

Impute missing values:
Decision tree Precision: 0.96
Decision tree Recall: 0.94
Decision tree F1-score: 0.95
Naïve Bayes Precision: 0.95
Naïve Bayes Recall: 0.93
Naïve Bayes F1-score: 0.94
