In [274]:
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNN
import numpy as np
from sklearn.tree import DecisionTreeClassifier as DTC
import statsmodels.api as sm
from pandas.core import datetools

In [275]:
def read_train_val_test(train_path, val_path, test_path):
    train_data = pd.read_csv(train_path).reset_index()
    val_data = pd.read_csv(val_path).reset_index()
    test_data = pd.read_csv(test_path).reset_index()
    return train_data, val_data, test_data

In [276]:
senate_name_dict = {}
house_name_dict = {}
def enumerate_districts(df, type):
    names = df['District']
    count = 0
    for name in names:
        if type == "Senate":
            if name not in senate_name_dict:
                senate_name_dict[name] = count
                count += 1
        elif type == "House":
            if name not in house_name_dict:
                house_name_dict[name] = count
                count += 1

In [277]:
def replace_district(x, type):
    if type == "Senate":
        return senate_name_dict[x]
    elif type == "House":
        return house_name_dict[x]

In [278]:
def clean_data(df, type):
    # make all text binary
    df = df.drop("name", axis=1)
    df['sex'].replace('f', 1, inplace=True)
    df['sex'].replace('m', 0, inplace=True)
    df['party'].replace('Democratic', 1, inplace=True)
    df['party'].replace('Republican', 0, inplace=True)
    
    # fill NaN's with mean from column
    df['sex'] = df['sex'].fillna(round(df['sex'].mean()))
    df['party'] = df['party'].fillna(df['party'].mean())
    df['Amount'] = df['Amount'].fillna(df['Amount'].mean())
    
    df['vote_count'] = df['vote_count'].apply(lambda x: str(x).replace(",", "").replace('nan', 'NaN')).astype(float)
    df['vote_count'] = df['vote_count'].fillna(df['vote_count'].mean())
    df['vote_percent'] = df['vote_percent'].fillna(df['vote_percent'].mean())
    df['District'] = df["District"].apply(lambda x: replace_district(x, type))
    
    return df

In [300]:
def classify_logistic_regression(train_df, val_df, test_df=None):
    y_train = train_df['sex']
    X_train = train_df.drop('sex', axis=1)
    
    y_val = val_df['sex']
    X_val = val_df.drop('sex', axis=1)
    
    classifier = LR()
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_val)
    
    if test_df is not None:
        y_test = test_df['sex']
        X_test = test_df.drop('sex', axis=1)
        test_pred = classifier.predict(X_test)
        print("Test accuracy: {}".format(accuracy_score(y_test, test_pred)))
    
    return accuracy_score(y_val, pred)

In [301]:
def classify_svm(train_df, val_df, test_df=None):
    y_train = train_df['sex']
    X_train = train_df.drop('sex', axis=1)
    
    y_val = val_df['sex']
    X_val = val_df.drop('sex', axis=1)
    
    classifier = SVC()
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_val)
    
    if test_df is not None:
        y_test = test_df['sex']
        X_test = test_df.drop('sex', axis=1)
        test_pred = classifier.predict(X_test)
        print("Test accuracy: {}".format(accuracy_score(y_test, test_pred)))
    
    
    return accuracy_score(y_val, pred)

In [310]:
def classify_knn(train_df, val_df, test_df=None):
    y_train = train_df['sex']
    X_train = train_df.drop('sex', axis=1)
    
    y_val = val_df['sex']
    X_val = val_df.drop('sex', axis=1)
    accuracy = [0]*15
    for i in range(1, 16):
        classifier = KNN(n_neighbors=i)
        classifier.fit(X_train, y_train)
        pred = classifier.predict(X_val)
        accuracy[i-1] = accuracy_score(y_val, pred)
        
    if test_df is not None:
        classifier = KNN(n_neighbors=accuracy.index(max(accuracy)))
        classifier.fit(X_train, y_train)
        
        y_test = test_df['sex']
        X_test = test_df.drop('sex', axis=1)
        test_pred = classifier.predict(X_test)
        print("Test accuracy: {}".format(accuracy_score(y_test, test_pred)))
    return accuracy

In [303]:
def classify_dtc(train_df, val_df, test_df=None):
    y_train = train_df['sex']
    X_train = train_df.drop('sex', axis=1)
    
    y_val = val_df['sex']
    X_val = val_df.drop('sex', axis=1)
    
    classifier_gini = DTC(random_state=40)
    classifier_entropy = DTC(criterion='entropy', random_state=40)
    
    classifier_gini.fit(X_train, y_train)
    classifier_entropy.fit(X_train, y_train)
    
    pred_gini = classifier_gini.predict(X_val)
    pred_entropy = classifier_entropy.predict(X_val)
    
    if test_df is not None:
        y_test = test_df['sex']
        X_test = test_df.drop('sex', axis=1)
        test_pred = classifier_gini.predict(X_test)
        print("Test accuracy: {}".format(accuracy_score(y_test, test_pred)))
        
    return accuracy_score(y_val, pred_gini), accuracy_score(y_val, pred_entropy)

In [304]:
def classify_ols(train_df, val_df):
    y_train = train_df['sex']
    X_train = train_df.drop('sex', axis=1)
    
    y_val = val_df['sex']
    X_val = val_df.drop('sex', axis=1)
    
    classifier = sm.OLS(y_train, X_train)
    results = classifier.fit()
    pred = round(results.predict(X_val))
    return accuracy_score(y_val, pred)

In [305]:
senate_train_data, senate_val_data, senate_test_data = read_train_val_test("train_data/merged_senate_districts_2009_2012.csv", "valid_data/merged_senate_districts_2013_2014.csv", "test_data/merged_senate_districts_2015_2016.csv")
enumerate_districts(senate_train_data, "Senate")
enumerate_districts(senate_val_data, "Senate")
enumerate_districts(senate_test_data, "Senate")
print(senate_name_dict)
    
cleaned_train_senate = clean_data(senate_train_data,"Senate")
cleaned_val_senate = clean_data(senate_val_data,"Senate")
cleaned_test_senate = clean_data(senate_test_data,"Senate")

print("Logistic Regression: ")
print(classify_logistic_regression(cleaned_train_senate, cleaned_val_senate))
print("SVM: ")
print(classify_svm(cleaned_train_senate, cleaned_val_senate))
print("K-Nearest Neighbors: ")
print(classify_knn(cleaned_train_senate, cleaned_val_senate))
print("Decision Tree Classifier: ")
print(classify_dtc(cleaned_train_senate, cleaned_val_senate))
print("OLS Logistic Regression: ")
print(classify_ols(cleaned_train_senate, cleaned_val_senate))

{'Berkshire, Hampshire & Franklin': 0, 'Bristol & Norfolk': 1, 'Cape & Islands': 2, 'First Bristol & Plymouth': 3, 'First Essex': 4, 'First Essex & Middlesex': 5, 'First Hampden & Hampshire': 6, 'First Middlesex': 7, 'First Middlesex & Norfolk': 8, 'First Plymouth & Bristol': 9, 'First Suffolk': 10, 'First Suffolk & Middlesex': 11, 'First Worcester': 12, 'Fourth Middlesex': 13, 'Hampden': 14, 'Hampshire & Franklin': 15, 'Middlesex & Essex': 16, 'Middlesex & Worcester': 17, 'Middlesex, Suffolk & Essex': 18, 'Norfolk & Plymouth': 19, 'Norfolk, Bristol & Middlesex': 20, 'Norfolk, Bristol & Plymouth': 21, 'Plymouth & Barnstable': 22, 'Plymouth & Norfolk': 23, 'Second Bristol & Plymouth': 24, 'Second Essex': 25, 'Second Essex & Middlesex': 26, 'Second Hampden & Hampshire': 27, 'Second Middlesex': 28, 'Second Middlesex & Norfolk': 29, 'Second Plymouth & Bristol': 30, 'Second Suffolk': 31, 'Second Suffolk & Middlesex': 32, 'Second Worcester': 33, 'Suffolk & Norfolk': 34, 'Third Essex & Middle

In [306]:
house_train_data, house_val_data, house_test_data = read_train_val_test("train_data/merged_house_districts_2009_2012.csv", "valid_data/merged_house_districts_2013_2014.csv", "test_data/merged_house_districts_2015_2016.csv")
enumerate_districts(house_train_data, "House")
enumerate_districts(house_val_data, "House")
enumerate_districts(house_test_data, "House")
print(house_name_dict)
    
cleaned_train_house = clean_data(house_train_data,"House")
cleaned_val_house = clean_data(house_val_data,"House")
cleaned_test_house = clean_data(house_test_data,"House")

print("Logistic Regression: ")
print(classify_logistic_regression(cleaned_train_house, cleaned_val_house))
print("SVM: ")
print(classify_svm(cleaned_train_house, cleaned_val_house))
print("K-Nearest Neighbors: ")
print(classify_knn(cleaned_train_house, cleaned_val_house))
print("Decision Tree Classifier: ")
print(classify_dtc(cleaned_train_house, cleaned_val_house))
print("OLS Logistic Regression: ")
print(classify_ols(cleaned_train_house, cleaned_val_house))


{'10th Bristol': 0, '10th Essex': 1, '10th Hampden': 2, '10th Middlesex': 3, '10th Norfolk': 4, '10th Plymouth': 5, '10th Suffolk': 6, '10th Worcester': 7, '11th Bristol': 8, '11th Essex': 9, '11th Hampden': 10, '11th Middlesex': 11, '11th Norfolk': 12, '11th Plymouth': 13, '11th Suffolk': 14, '11th Worcester': 15, '12th Bristol': 16, '12th Essex': 17, '12th Hampden': 18, '12th Middlesex': 19, '12th Norfolk': 20, '12th Plymouth': 21, '12th Suffolk': 22, '12th Worcester': 23, '13th Bristol': 24, '13th Essex': 25, '13th Middlesex': 26, '13th Norfolk': 27, '13th Suffolk': 28, '13th Worcester': 29, '14th Bristol': 30, '14th Essex': 31, '14th Middlesex': 32, '14th Norfolk': 33, '14th Suffolk': 34, '14th Worcester': 35, '15th Essex': 36, '15th Middlesex': 37, '15th Norfolk': 38, '15th Suffolk': 39, '15th Worcester': 40, '16th Essex': 41, '16th Middlesex': 42, '16th Suffolk': 43, '16th Worcester': 44, '17th Essex': 45, '17th Middlesex': 46, '17th Suffolk': 47, '17th Worcester': 48, '18th Esse

In [307]:
print("Test Senate Decision Tree Classifier: ")
print(classify_dtc(cleaned_train_senate, cleaned_val_senate, cleaned_test_senate))

Test Senate Decision Tree Classifier: 
Test accuracy: 0.7176470588235294
(0.76829268292682928, 0.69512195121951215)


In [311]:
print("Test House KNN Classifier: ")
print(classify_knn(cleaned_train_house, cleaned_val_house, cleaned_test_house))

Test House KNN Classifier: 
Test accuracy: 0.7650602409638554
[0.72093023255813948, 0.76162790697674421, 0.70348837209302328, 0.75, 0.7441860465116279, 0.75, 0.73837209302325579, 0.76162790697674421, 0.76162790697674421, 0.76162790697674421, 0.7558139534883721, 0.76162790697674421, 0.76162790697674421, 0.76744186046511631, 0.76744186046511631]
