In [34]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC

train_path = "res/CongressionalVotingID.shuf.train.csv"
test_path = "res/CongressionalVotingID.shuf.test.csv"
result_folder = "results/"

test_set_ratio = 0.1 

In [30]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

df_train = df_train.replace({'y':True, 'n':False, 'unknown':np.nan})
df_test = df_test.replace({'y':True, 'n':False, 'unknown':np.nan})

df_train.head(10)

Unnamed: 0,ID,class,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-crporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,134,republican,False,False,False,True,True,True,False,False,False,True,False,True,True,True,False,True
1,224,republican,False,False,False,True,True,False,False,False,False,False,False,True,True,True,False,True
2,32,democrat,True,True,True,False,False,False,True,True,True,False,True,False,False,False,True,
3,171,democrat,True,False,True,False,False,False,,True,True,,False,False,False,False,True,
4,333,democrat,True,False,True,False,False,False,True,True,True,False,True,False,False,False,True,
5,148,democrat,False,False,False,False,False,False,True,True,True,True,False,True,True,True,True,True
6,83,republican,False,False,False,True,True,True,False,False,False,True,False,True,True,True,False,True
7,24,democrat,True,True,True,False,False,False,True,True,True,False,False,False,False,False,True,True
8,281,democrat,False,False,True,False,False,False,True,True,True,True,False,False,False,True,False,True
9,416,democrat,False,True,True,False,False,True,True,True,,True,False,False,False,False,False,True


### Initialize Training data

In [38]:
df_vars = df_train.loc[:, df_train.columns != 'class']
df_class = df_train['class']

# impute nans
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(df_vars)
df_vars = imp.transform(df_vars)
df_test = imp.transform(df_test)

X_train, X_test, y_train, y_test = train_test_split(df_vars, df_class, test_size=test_set_ratio, random_state=3124132)
clf = RandomForestClassifier(n_estimators=200, random_state=123093234)
#clf = SVC(kernel='linear', random_state=123093234)

### Simple cross validation for well-founded results

In [39]:
scores = cross_validate(clf, df_vars, df_class, cv=10)
scores['test_score']

array([0.91304348, 1.        , 0.86363636, 0.95454545, 0.95454545,
       0.95454545, 1.        , 0.9047619 , 1.        , 1.        ])

### Model fitting and sample testing

In [40]:
model = clf.fit(X_train, y_train)
model.score(X_test, y_test)

0.8636363636363636

### Final prediction

In [13]:
df_res = pd.DataFrame()
df_res['ID'] = df_test[:,0].astype(int)
df_res['class'] = pd.Series(model.predict(df_test))
df_res.head(10)

Unnamed: 0,ID,class
0,368,democrat
1,15,republican
2,94,democrat
3,107,republican
4,285,democrat
5,53,democrat
6,138,democrat
7,265,democrat
8,419,democrat
9,226,republican


### Export

In [14]:
export_file_name = result_folder + "svm_poly_mean_imputed.csv"
df_res.to_csv(export_file_name, index=False)

### Easy comparison

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest",  "Random Forest (Max 1 Feature)", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(),
    SVC(kernel="linear"),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    RandomForestClassifier(max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

X_train, X_test, y_train, y_test = train_test_split(df_vars, df_class, test_size=.2, random_state=3124132)

for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print(name, score)

Nearest Neighbors 0.5227272727272727
Linear SVM 0.9772727272727273
RBF SVM 0.5
Gaussian Process 0.5
Decision Tree 0.9318181818181818
Random Forest 0.9090909090909091
Random Forest (Max 1 Feature) 0.8409090909090909
Neural Net 0.5




AdaBoost 0.9772727272727273
Naive Bayes 0.9318181818181818
QDA 0.9090909090909091
