In [12]:
import numpy as np
import pandas as pd
from fancyimpute import KNN, IterativeImputer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Input CSV files: labelled training data and the test data to predict.
train_path, test_path = (
    "res/CongressionalVotingID.shuf.train.csv",
    "res/CongressionalVotingID.shuf.test.csv",
)

# Prefix for exported prediction files (concatenated directly with a file name,
# hence the trailing slash).
result_folder = "results/"

# Intended hold-out fraction.
# NOTE(review): the train_test_split calls in this notebook pass test_size=.2
# literally and do not read this value — confirm which one is intended.
test_set_ratio = 0.1

In [4]:
# Votes are stored as the strings 'y' / 'n' / 'unknown'; map them to booleans
# and NaN so that missing votes can be imputed later. Cells holding any other
# value (e.g. the class label) are left unchanged by the mapping.
vote_encoding = {'y': True, 'n': False, 'unknown': np.nan}

df_train = pd.read_csv(train_path).replace(vote_encoding)
df_test = pd.read_csv(test_path).replace(vote_encoding)

# Preview the first rows of the encoded training frame.
df_train.head(10)

Unnamed: 0,ID,class,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-crporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,134,republican,False,False,False,True,True,True,False,False,False,True,False,True,True,True,False,True
1,224,republican,False,False,False,True,True,False,False,False,False,False,False,True,True,True,False,True
2,32,democrat,True,True,True,False,False,False,True,True,True,False,True,False,False,False,True,
3,171,democrat,True,False,True,False,False,False,,True,True,,False,False,False,False,True,
4,333,democrat,True,False,True,False,False,False,True,True,True,False,True,False,False,False,True,
5,148,democrat,False,False,False,False,False,False,True,True,True,True,False,True,True,True,True,True
6,83,republican,False,False,False,True,True,True,False,False,False,True,False,True,True,True,False,True
7,24,democrat,True,True,True,False,False,False,True,True,True,False,False,False,False,False,True,True
8,281,democrat,False,False,True,False,False,False,True,True,True,True,False,False,False,True,False,True
9,416,democrat,False,True,True,False,False,True,True,True,,True,False,False,False,False,False,True


### Initialize training data

In [14]:
# Split the training frame into the feature matrix (every column except the
# label) and the label vector.
# NOTE(review): the `ID` column stays in the feature matrix (as column 0)
# because the prediction cell reads the IDs back out of column 0 of the imputed
# test array. An identifier is unlikely to be a meaningful predictor — consider
# dropping it here and in the prediction cell together.
df_vars = df_train.loc[:, df_train.columns != 'class']
df_class = df_train['class']

# Fill in missing votes. The imputer is fitted on the training features only
# and then reused for the test set, so both sets are imputed by the same fitted
# model instead of a second imputer being estimated from the test data.
# Assumes df_test has the same columns, in the same order, as df_vars.
# (SimpleImputer with strategy='mean' was tried earlier as an alternative.)
imputer = IterativeImputer()
df_vars = imputer.fit_transform(df_vars)
df_test = imputer.transform(df_test)

# Hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(df_vars, df_class, test_size=.2, random_state=3124132)

# Classifier under evaluation. Alternatives tried previously:
#   RandomForestClassifier(n_estimators=100, max_features=1)
#   SVC(kernel='linear', random_state=123093234)
# GaussianNB must be imported before this cell runs (sklearn.naive_bayes).
clf = GaussianNB()

### Simple cross-validation for well-founded results

In [15]:
# 10-fold cross-validation of the selected classifier on the full imputed
# training data (features were imputed before the folds are formed).
scores = cross_validate(estimator=clf, X=df_vars, y=df_class, cv=10)

# Show the per-fold test scores.
scores['test_score']

array([0.91304348, 0.95454545, 0.90909091, 1.        , 0.95454545,
       0.95454545, 0.95454545, 0.9047619 , 1.        , 1.        ])

### Model fitting and sample testing

In [16]:
# Fit the selected classifier on the training part of the hold-out split;
# `model` is reused by the prediction cell.
model = clf.fit(X_train, y_train)

# Score the fitted model on the held-out part and display the result.
holdout_score = model.score(X_test, y_test)
holdout_score

0.9318181818181818

### Final prediction

In [17]:
# Assemble the submission frame in one step: column 0 of the imputed test
# array carries the IDs (cast back to int after imputation), and the fitted
# model predicts a class for every test row.
test_ids = df_test[:, 0].astype(int)
predicted_class = pd.Series(model.predict(df_test))

df_res = pd.DataFrame({'ID': test_ids, 'class': predicted_class})
df_res.head(10)

Unnamed: 0,ID,class
0,368,democrat
1,15,republican
2,94,democrat
3,107,republican
4,285,democrat
5,53,democrat
6,138,democrat
7,265,democrat
8,419,democrat
9,226,republican


### Export

In [11]:
# Write the predictions as CSV without the index column.
# NOTE(review): the file name mentions "svm" and "knn", while the classifier
# and imputer visible in this notebook are GaussianNB and IterativeImputer —
# confirm the name matches the model that produced df_res.
export_file_name = f"{result_folder}svm_knn_imputed.csv"
df_res.to_csv(export_file_name, index=False)

### Easy comparison

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Rebuild the feature matrix and labels from df_train and impute missing votes,
# so the comparison starts from the same inputs regardless of which classifier
# cell was run before.
df_vars = df_train.loc[:, df_train.columns != 'class']
df_class = df_train['class']

df_vars = IterativeImputer().fit_transform(df_vars)

# Display names and estimators are parallel lists: names[i] labels classifiers[i].
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest",  "Random Forest (Max 1 Feature)", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(),
    SVC(kernel="linear"),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=100),
    RandomForestClassifier(n_estimators=100, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

# zip() silently truncates to the shorter list, which would mislabel or drop
# classifiers if the two lists ever drift apart.
assert len(names) == len(classifiers), "names and classifiers must line up one-to-one"

X_train, X_test, y_train, y_test = train_test_split(df_vars, df_class, test_size=.2, random_state=3124132)

# 10-fold cross-validation for every candidate.
# NOTE: the loop rebinds the notebook-level names `clf` and `scores`; after this
# cell they refer to the last classifier in the list and its results.
for name, clf in zip(names, classifiers):
    try:
        scores = cross_validate(clf, df_vars, df_class, cv=10)
    except Exception as exc:
        # Keep comparing the remaining classifiers, but report why this one
        # failed. Catching Exception (not a bare except) lets KeyboardInterrupt
        # still stop the cell.
        print("Classification failed for", name, "--", repr(exc))
        continue
    test_scores = scores['test_score']
    print(name, "Mean", test_scores.mean(), "-- Min", test_scores.min(), "-- Max", test_scores.max())

Nearest Neighbors Mean 0.5684453227931489 -- Min 0.45454545454545453 -- Max 0.6818181818181818
Linear SVM Mean 0.9590532655750048 -- Min 0.9090909090909091 -- Max 1.0
RBF SVM Mean 0.6284020327498588 -- Min 0.6086956521739131 -- Max 0.6363636363636364
Gaussian Process Mean 0.6284020327498588 -- Min 0.6086956521739131 -- Max 0.6363636363636364
Decision Tree Mean 0.9588556371165067 -- Min 0.8636363636363636 -- Max 1.0
Random Forest Mean 0.9635987201204592 -- Min 0.9047619047619048 -- Max 1.0
Random Forest (Max 1 Feature) Mean 0.9540749105966497 -- Min 0.9047619047619048 -- Max 1.0




Neural Net Mean 0.8050254093732354 -- Min 0.6086956521739131 -- Max 1.0
AdaBoost Mean 0.9820158102766798 -- Min 0.9090909090909091 -- Max 1.0
Naive Bayes Mean 0.9545078110295503 -- Min 0.9047619047619048 -- Max 1.0
QDA Mean 0.9406549971767364 -- Min 0.9047619047619048 -- Max 1.0
