In [3]:
#SVM
#Boosting
#Decision Tree
#Neural Network

In [4]:
import numpy as np 
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn import svm 
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

In [5]:
#Encoding the raw party / vote tokens as numbers
def type_to_numeric(x):
    """Translate one raw cell of the voting table into its numeric code.

    Party names and vote tokens share one encoding: 'democrat' and 'n'
    become 0, 'republican' and 'y' become 1.

    Any other value matches nothing and yields None; read_data() calls
    dropna() after applying this function to every column.
    """
    # Checked in order, first match wins.
    encoding = (('democrat', 0), ('republican', 1), ('n', 0), ('y', 1))
    for token, code in encoding:
        if x == token:
            return code
    return None

In [6]:
#Loading the voting records and encoding them numerically
def read_data(file):
    """Load the voting table and return it as (features, labels) NumPy arrays.

    The file is read as tab-separated text without a header row. Every column
    is encoded with type_to_numeric(), rows that still contain a missing value
    afterwards are dropped, and the remaining rows are split into the 16 vote
    columns (features) and the "Party" column (labels).

    NOTE(review): `file` is currently ignored -- the path below is hard-coded
    to 'congressional_data.txt'. Callers pass other names today, so honouring
    the argument has to be coordinated with them.
    """
    feature_names = [
        "handicapped_infants",
        "water_project_cost_sharing",
        "adoption_of_the_budget_resolution",
        "physician_fee_freeze",
        "el_salvador_aid",
        "religious_groups_in_schools",
        "nti_satellite_test_ban",
        "aid_to_nicaraguan_contras",
        "mx_missile",
        "immigration",
        "synfuels_corporation_cutback",
        "education_spending",
        "superfund_right_to_sue",
        "crime",
        "duty_free_exports",
        "export_administration_act_sa",
    ]
    # The label column comes first in the file, followed by the 16 votes.
    all_names = ["Party"] + feature_names

    votes_raw = pd.read_csv('congressional_data.txt', sep="\t", header = None)
    votes_raw.columns = all_names

    # Encode party names and y/n votes; unrecognised tokens become missing.
    for name in all_names:
        votes_raw[name] = votes_raw[name].apply(type_to_numeric)

    # Keep only the rows in which every column could be encoded.
    votes_complete = votes_raw.dropna()

    features = np.array(votes_complete[feature_names])
    labels = np.array(votes_complete["Party"])
    return features, labels


In [7]:
#Accuracy of the results
def accuracy(true_class, pred_class):
    """Return the percentage (0.0 - 100.0) of predictions that match the truth.

    Parameters
    ----------
    true_class : sequence
        Ground-truth labels.
    pred_class : sequence
        Predicted labels, position-aligned with `true_class`.

    Returns
    -------
    float
        100 * (number of equal positions) / (number of samples).

    Raises
    ------
    ValueError
        If the two sequences differ in length (previously extra predictions
        were silently ignored and missing ones surfaced as an IndexError), or
        if they are empty (previously a ZeroDivisionError).
    """
    n_samples = len(true_class)
    if n_samples != len(pred_class):
        raise ValueError(
            "true_class and pred_class must have the same length "
            "(got %d and %d)" % (n_samples, len(pred_class))
        )
    if n_samples == 0:
        raise ValueError("cannot compute accuracy of an empty set of labels")

    # Count with plain Python ints so the result is a built-in float even
    # when the inputs are NumPy arrays.
    correct = sum(1 for actual, predicted in zip(true_class, pred_class)
                  if actual == predicted)
    return (correct / float(n_samples)) * 100.0

In [8]:
# initialize runtime
# time.clock() was removed in Python 3.8. Keep using it wherever it still
# exists (so the clock source is unchanged on older interpreters) and fall
# back to time.perf_counter() otherwise.
start = time.clock() if hasattr(time, "clock") else time.perf_counter()

#Reading the data from the txt file( Two variables)
# read_data() ignores its argument and always loads 'congressional_data.txt',
# so pass the name of the file that is actually read.
data, label = read_data("congressional_data.txt")

#Splitting the data
# First split: 80% training, 20% held out.
dtrain, data_tc, dtr_label, dtc_label = train_test_split(data, label, test_size=0.20, random_state=1)
# Second split of the held-out 20%: 75% test, 25% validation.
dtest, dval, dtest_label, dval_label = train_test_split(data_tc,dtc_label, test_size=0.25, random_state=1)



In [9]:
#Support Vector Machine Algorithm

#Validation Step
# Grid-search the regularisation parameter C (1..4) and the kernel type,
# keeping the combination with the highest validation accuracy. Ties keep the
# earliest candidate because the comparison is strict.
max_score = float("-inf")   # below any real score, so the first candidate is always recorded
max_reg = None
max_kernel = None
kernels = ["linear","rbf","poly"]
for v in range(1,5):
    for k in kernels:
        SVM = svm.SVC(C= v, kernel = k)
        SVM.fit(dtrain, dtr_label)
        val_score = SVM.score(dval,dval_label)   # score once and reuse the value
        if val_score > max_score:
            max_score = val_score
            max_reg = v
            max_kernel = k


#Fitting the selected model on the training data
SVM = svm.SVC(C= max_reg, kernel = max_kernel)
fit = SVM.fit(dtrain,dtr_label)

#Predicting the data test into class
true_class_svm = dtest_label
pred_class_svm = SVM.predict(dtest)

#Assessing the accuracy of the predicted class
svm_acc = accuracy(true_class_svm,pred_class_svm)

#Printing the accuracy of prediction
print( "The accuracy of the prediction when SVM is used: ", svm_acc )
print("Error Performance when SVM is used: ", 100-svm_acc)


('The accuracy of the prediction when SVM is used: ', 100.0)
('Error Performance when SVM is used: ', 0.0)


In [10]:
#Adaboost Algorithm

#Validation Step
# Grid-search the number of estimators (50, 100, ..., 550) and the learning
# rate (1, 2, 3), keeping the pair with the highest validation accuracy. Ties
# keep the earliest candidate because the comparison is strict.
max_score = float("-inf")   # below any real score, so the first candidate is always recorded
max_estimators = None
max_learningrate = None
for e in range(50,600,50):
    for l in range(1,4,1):
        # Fixed seed (same value as the train/test split) so the search is
        # repeatable from run to run.
        adaboost = AdaBoostClassifier(n_estimators= e, learning_rate= l, random_state=1)
        adaboost.fit(dtrain, dtr_label)
        val_score = adaboost.score(dval,dval_label)   # score once and reuse the value
        if val_score > max_score:
            max_score = val_score
            max_estimators = e
            max_learningrate = l

#Fitting the selected model on the training data
# Same seed as during the search, so the refit reproduces the selected model.
adaboost = AdaBoostClassifier(n_estimators = max_estimators, learning_rate = max_learningrate, random_state=1)
fit = adaboost.fit(dtrain,dtr_label)


#Predicting the data test into class
true_class_adaboost = dtest_label
pred_class_adaboost = adaboost.predict(dtest)

#Assessing the accuracy of the predicted class
adaboost_acc = accuracy(true_class_adaboost,pred_class_adaboost)

#Printing the accuracy of prediction
print( "The accuracy of the prediction when Adaboost is used: ", adaboost_acc )
print("Error Performance when Adaboost is used: ", 100-adaboost_acc)



('The accuracy of the prediction when Adaboost is used: ', 94.28571428571428)
('Error Performance when Adaboost is used: ', 5.714285714285722)


In [12]:
#Decision Tree Algorithm

#Validation Step
# Try maximum depths 1..9 and keep the one with the highest validation
# accuracy. Ties keep the shallowest tree because the comparison is strict.
max_score = float("-inf")   # below any real score, so the first candidate is always recorded
max_depth = None
for d in range(1,10):
    # Fixed seed (same value as the train/test split) so the search is
    # repeatable from run to run.
    dtc = DecisionTreeClassifier(max_depth = d, random_state=1)
    dtc.fit(dtrain, dtr_label)
    val_score = dtc.score(dval,dval_label)   # score once and reuse the value
    if val_score > max_score:
        max_score = val_score
        max_depth = d


#Fitting the selected model on the training data
# Same seed as during the search, so the refit reproduces the selected tree.
dtc = DecisionTreeClassifier(max_depth= max_depth, random_state=1)
fit = dtc.fit(dtrain,dtr_label)

#Predicting the data test into class
true_class_dtc = dtest_label
pred_class_dtc = dtc.predict(dtest)

#Assessing the accuracy of the predicted class
dtc_acc = accuracy(true_class_dtc,pred_class_dtc)

#Printing the accuracy of prediction
print ("The accuracy of the prediction when Decision Tree Classifier is used: ", dtc_acc) 
print ("Error Performance when Decision Tree Classifier is used: ", 100-dtc_acc)


('The accuracy of the prediction when Decision Tree Classifier is used: ', 100.0)
('Error Performance when Decision Tree Classifier is used: ', 0.0)


In [13]:
#Neural Network

#Validation Step
# Grid-search the solver and the width of a single hidden layer
# (5, 10, ..., 95 units), keeping the pair with the highest validation
# accuracy. Ties keep the earliest candidate because the comparison is strict.
max_score = float("-inf")   # below any real score, so the first candidate is always recorded
max_hidden_layers = None    # number of units in the one hidden layer
max_alpha = 0               # alpha is not tuned below; name kept for compatibility
max_solver = None
solvers = ["lbfgs", "sgd", "adam"] 
for h in range(5,100,5):
    for s in solvers:
        # Fixed seed (same value as the train/test split): without it every
        # fit starts from different random weights, so the search result and
        # the final refit would not be comparable or repeatable.
        neural = MLPClassifier(solver= s, hidden_layer_sizes=(h,), random_state=1)
        neural.fit(dtrain, dtr_label)
        val_score = neural.score(dval,dval_label)   # score once and reuse the value
        if val_score > max_score:
            max_score = val_score
            max_hidden_layers = h
            max_solver = s


#Fitting the selected model on the training data
# Same seed as during the search, so the refit reproduces the selected network.
neural = MLPClassifier(solver = max_solver, hidden_layer_sizes=(max_hidden_layers,), random_state=1)
fit = neural.fit(dtrain,dtr_label)

#Predicting the data test into class
true_class_neural = dtest_label
pred_class_neural = neural.predict(dtest)

#Assessing the accuracy of the predicted class
neural_acc = accuracy(true_class_neural,pred_class_neural)

#Printing the accuracy of prediction
print( "The accuracy of the prediction when Neural Network Classifier is used: ", neural_acc )
print("Error Performance when Neural Network Classifier is used: ", 100-neural_acc)





('The accuracy of the prediction when Neural Network Classifier is used: ', 91.42857142857143)
('Error Performance when Neural Network Classifier is used: ', 8.57142857142857)


In [None]:
"""
Inferences and Conclusions:
    
The Congressional dataset consists of 435 instances with 16 variables. It contains missing values, and after
dropping every instance that has a missing value we end up with 232 instances.

We have used four different models to learn the data and perform the prediction task. 
They are:

Support Vector Machines(SVM)
Boosting(Adaboost Algorithm)
CART (Decision Tree Algorithm)
Neural Network

For the Support Vector Machine, we varied the regularisation parameter (C) and the type of kernel in the
validation step. Comparing the results, we observed that the prediction accuracy is highest for the linear kernel
with a unit regularisation parameter (C = 1); it decreased as we moved to the other kernels and to higher values
of the regularisation parameter.

For the AdaBoost model, we varied the learning rate and the number of estimators. The base estimator was left at
its default (a decision tree classifier) and the boosting algorithm at its default as well (reported here as SAMME;
the default depends on the installed scikit-learn version). We observed that the prediction accuracy is lower for
higher values of the number of estimators and the learning rate, so the highest prediction accuracy is obtained
with 50 estimators and a learning rate of 1.

For the decision tree classifier, we varied the maximum depth and found that the highest prediction accuracy is
obtained for the smaller values of the maximum depth.

Finally, for the neural network we varied the type of solver and the number of units in the single hidden layer
(hidden_layer_sizes=(h,)), and we observed that the "lbfgs" solver with 10 hidden units gives the highest accuracy.

"""