In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNN
import numpy as np
from sklearn.tree import DecisionTreeClassifier as DTC
import statsmodels.api as sm
from pandas.core import datetools

  from pandas.core import datetools


In [2]:
def read_train_val_test(train_path, val_path, test_path):
    train_data = pd.read_csv(train_path).reset_index()
    val_data = pd.read_csv(val_path).reset_index()
    test_data = pd.read_csv(test_path).reset_index()
    return train_data, val_data, test_data

In [3]:
senate_name_dict = {}
house_name_dict = {}
name_dict = {}
def enumerate_districts(df, type=None):
    names = df['District']
    count = 0
    for name in names:
        if type == "Senate":
            if name not in senate_name_dict:
                senate_name_dict[name] = count
                count += 1
        elif type == "House":
            if name not in house_name_dict:
                house_name_dict[name] = count
                count += 1
        else:
            if name not in name_dict:
                name_dict[name] = count
                count += 1

In [4]:
def replace_district(x, type=None):
    if type == "Senate":
        return senate_name_dict[x]
    elif type == "House":
        return house_name_dict[x]
    else:
        return name_dict[x]

In [5]:
def clean_data(df, type=None):
    # make all text binary
    df = df.drop("name", axis=1)
    df['sex'].replace('f', 1, inplace=True)
    df['sex'].replace('m', 0, inplace=True)
    df['party'].replace('Democratic', 1, inplace=True)
    df['party'].replace('Republican', 0, inplace=True)
    
    # fill NaN's with mean from column
    df['sex'] = df['sex'].fillna(round(df['sex'].mean()))
    df['party'] = df['party'].fillna(df['party'].mean())
    df['Amount'] = df['Amount'].fillna(df['Amount'].mean())
    
    df['vote_count'] = df['vote_count'].apply(lambda x: str(x).replace(",", "").replace('nan', 'NaN')).astype(float)
    df['vote_count'] = df['vote_count'].fillna(df['vote_count'].mean())
    df['vote_percent'] = df['vote_percent'].fillna(df['vote_percent'].mean())
    df['District'] = df["District"].apply(lambda x: replace_district(x, type))
    
    # add indicator for female democrat
    df['female_dem'] = 0
    for index, row in df.iterrows():
        if row.sex == 1 and row.party == 1:
            df.set_value(index, 'female_dem', 1)
            
    df = df.iloc[:, [index for index, x in enumerate(df.columns) if 'Margin' not in x]].set_index("index")
    
    percent_cols = [col for index, col in enumerate(df.columns) if 'Percent' in col and df[col].mean() < 0.05]
    for col in percent_cols:
        df = df.drop(col, axis=1)
        df = df.drop(col.replace("Percent", "Estimate"), axis=1)
        
    df.to_csv("removed_columns.csv", index=False)
    return df

In [6]:
def classify_logistic_regression(train_df, val_df, test_df=None):
    y_train = train_df['female_dem']
    X_train = train_df.drop(['female_dem', 'party','sex'], axis=1)
    
    y_val = val_df['female_dem']
    X_val = val_df.drop(['female_dem', 'party','sex'], axis=1)
    
    classifier = LR()
    classifier.fit(X_train, y_train)
    pred = classifier.predict_proba(X_val)
    accuracy_pred = classifier.predict(X_val)
    
    if test_df is not None:
        y_test = test_df['female_dem']
        X_test = test_df.drop(['female_dem', 'party','sex'], axis=1)
        test_pred = classifier.predict_proba(X_test)
        accuracy_test_pred = classifier.predict(X_test)
        print("Test accuracy: {}".format(accuracy_score(y_test, accuracy_test_pred)))
    
    return accuracy_score(y_val, accuracy_pred)

In [7]:
def classify_svm(train_df, val_df, test_df=None):
    y_train = train_df['sex']
    X_train = train_df.drop('sex', axis=1)
    
    y_val = val_df['sex']
    X_val = val_df.drop('sex', axis=1)
    
    classifier = SVC()
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_val)
    
    if test_df is not None:
        y_test = test_df['sex']
        X_test = test_df.drop('sex', axis=1)
        test_pred = classifier.predict(X_test)
        print("Test accuracy: {}".format(accuracy_score(y_test, test_pred)))

    return accuracy_score(y_val, pred)

In [8]:
def classify_knn(train_df, val_df, test_df=None):
    y_train = train_df['sex']
    X_train = train_df.drop('sex', axis=1)
    
    y_val = val_df['sex']
    X_val = val_df.drop('sex', axis=1)
    accuracy = [0]*15
    for i in range(1, 16):
        classifier = KNN(n_neighbors=i)
        classifier.fit(X_train, y_train)
        pred = classifier.predict(X_val)
        accuracy[i-1] = accuracy_score(y_val, pred)
        
    if test_df is not None:
        classifier = KNN(n_neighbors=accuracy.index(max(accuracy)))
        classifier.fit(X_train, y_train)
        
        y_test = test_df['sex']
        X_test = test_df.drop('sex', axis=1)
        test_pred = classifier.predict(X_test)
        print("Test accuracy: {}".format(accuracy_score(y_test, test_pred)))
    return accuracy

In [9]:
def classify_dtc(train_df, val_df, test_df=None):
    y_train = train_df['sex']
    X_train = train_df.drop('sex', axis=1)
    
    y_val = val_df['sex']
    X_val = val_df.drop('sex', axis=1)
    
    classifier_gini = DTC(random_state=40)
    classifier_entropy = DTC(criterion='entropy', random_state=40)
    
    classifier_gini.fit(X_train, y_train)
    classifier_entropy.fit(X_train, y_train)
    
    pred_gini = classifier_gini.predict(X_val)
    pred_entropy = classifier_entropy.predict(X_val)
    
    if test_df is not None:
        y_test = test_df['sex']
        X_test = test_df.drop('sex', axis=1)
        test_pred = classifier_entropy.predict(X_test)
        print("Test accuracy: {}".format(accuracy_score(y_test, test_pred)))
        
    return accuracy_score(y_val, pred_gini), accuracy_score(y_val, pred_entropy)

In [10]:
def independent_columns(A, tol =0): #= 1e-05):
    Q, R = np.linalg.qr(A)
    independent = np.where(np.abs(R.diagonal()) > tol)[0]
    
    return independent

In [11]:
def classify_ols(train_df, val_df):
    y_train = train_df['sex']
    X_train = train_df.drop('sex', axis=1)
    
    y_val = val_df['sex']
    X_val = val_df.drop('sex', axis=1)
    
    independent = independent_columns(X_train)
    X_train = X_train.iloc[:, independent]
    X_val = X_val.iloc[:, independent]
    print("Rank is {}".format(X_train.shape[1]))
    X_train.to_csv("X_train.csv")

    classifier = sm.Logit(y_train, X_train)
    results = classifier.fit(method='ncg')
    pred = round(results.predict(X_val))
    return accuracy_score(y_val, pred)

In [12]:
def classify_glm(train_df, val_df):
    y_train = train_df['female_dem']
    X_train = train_df.drop(['female_dem', 'party','sex'], axis=1)
    
    y_val = val_df['female_dem']
    X_val = val_df.drop(['female_dem', 'party','sex'], axis=1)
    
    independent = independent_columns(X_train)
    X_train = X_train.iloc[:, independent]
    X_val = X_val.iloc[:, independent]
    print("Rank is {}".format(X_train.shape[1]))

    classifier = sm.GLM(y_train, X_train)
    results = classifier.fit()
    print(results.summary())
    pred = round(results.predict(X_val))
    return accuracy_score(y_val, pred)

In [15]:
senate_train_data, senate_val_data, senate_test_data = read_train_val_test("train_data/merged_senate_districts_2009_2012.csv", "valid_data/merged_senate_districts_2013_2014.csv", "test_data/merged_senate_districts_2015_2016.csv")
enumerate_districts(senate_train_data, "Senate")
enumerate_districts(senate_val_data, "Senate")
enumerate_districts(senate_test_data, "Senate")
    
cleaned_train_senate = clean_data(senate_train_data,"Senate")
cleaned_val_senate = clean_data(senate_val_data,"Senate")
cleaned_test_senate = clean_data(senate_test_data,"Senate")

print("Logistic Regression: ")
print(classify_logistic_regression(cleaned_train_senate, cleaned_val_senate))
print("SVM: ")
print(classify_svm(cleaned_train_senate, cleaned_val_senate))
print("K-Nearest Neighbors: ")
print(classify_knn(cleaned_train_senate, cleaned_val_senate))
print("Decision Tree Classifier: ")
print(classify_dtc(cleaned_train_senate, cleaned_val_senate))
# print("OLS Logistic Regression: ")
# print(classify_ols(cleaned_train_senate, cleaned_val_senate))
# print("OLS Generalized Linear Model: ")
# print(classify_glm(cleaned_train_senate, cleaned_val_senate))

Logistic Regression: 
0.658536585366
SVM: 
0.69512195122
K-Nearest Neighbors: 
[0.56097560975609762, 0.68292682926829273, 0.56097560975609762, 0.67073170731707321, 0.65853658536585369, 0.69512195121951215, 0.68292682926829273, 0.67073170731707321, 0.65853658536585369, 0.69512195121951215, 0.69512195121951215, 0.69512195121951215, 0.69512195121951215, 0.69512195121951215, 0.69512195121951215]
Decision Tree Classifier: 
(1.0, 1.0)


In [25]:
house_train_data, house_val_data, house_test_data = read_train_val_test("train_data/merged_house_districts_2009_2012.csv", "valid_data/merged_house_districts_2013_2014.csv", "test_data/merged_house_districts_2015_2016.csv")
enumerate_districts(house_train_data, "House")
enumerate_districts(house_val_data, "House")
enumerate_districts(house_test_data, "House")
    
cleaned_train_house = clean_data(house_train_data,"House")
cleaned_val_house = clean_data(house_val_data,"House")
cleaned_test_house = clean_data(house_test_data,"House")

print("Logistic Regression: ")
print(classify_logistic_regression(cleaned_train_house, cleaned_val_house))
# print("SVM: ")
# print(classify_svm(cleaned_train_house, cleaned_val_house))
# print("K-Nearest Neighbors: ")
# print(classify_knn(cleaned_train_house, cleaned_val_house))
# print("Decision Tree Classifier: ")
# print(classify_dtc(cleaned_train_house, cleaned_val_house))
# print("OLS Logistic Regression: ")
# print(classify_ols(cleaned_train_house, cleaned_val_house))
# print("OLS Generalized Linear Model: ")
# print(classify_glm(cleaned_train_house, cleaned_val_house))



(325, 164)
(172, 164)
(166, 164)
Logistic Regression: 
0.773255813953


In [23]:
merged_train = pd.concat([senate_train_data, house_train_data], axis=0)
merged_val = pd.concat([senate_val_data, house_val_data], axis=0)
merged_test = pd.concat([senate_test_data, house_test_data], axis=0)

enumerate_districts(merged_train)
enumerate_districts(merged_val)
enumerate_districts(merged_test)
    
cleaned_train = clean_data(merged_train)
cleaned_val = clean_data(merged_val)
cleaned_test = clean_data(house_test_data)

print("Logistic Regression: ")
print(classify_logistic_regression(cleaned_train, cleaned_val))
# print("SVM: ")
# print(classify_svm(cleaned_train, cleaned_val))
# print("K-Nearest Neighbors: ")
# print(classify_knn(cleaned_train, cleaned_val))
# print("Decision Tree Classifier: ")
# print(classify_dtc(cleaned_train, cleaned_val))
# print("OLS Logistic Regression: ")
# print(classify_ols(cleaned_train, cleaned_val))
# print("OLS Generalized Linear Model: ")
# print(classify_glm(cleaned_train, cleaned_val))




(415, 164)
(254, 164)
(166, 164)
Logistic Regression: 
0.590551181102


In [27]:
print("Test Senate Logisitic Regression Classifier: ")
print(classify_logistic_regression(cleaned_train_senate, cleaned_val_senate, cleaned_test_senate))

Test Senate Logisitic Regression Classifier: 
[[  9.99965390e-01   3.46099948e-05]
 [  6.37149962e-03   9.93628500e-01]
 [  6.37149962e-03   9.93628500e-01]
 [  9.88530634e-01   1.14693664e-02]
 [  9.88530634e-01   1.14693664e-02]
 [  1.57166108e-02   9.84283389e-01]
 [  3.89462606e-02   9.61053739e-01]
 [  1.57166108e-02   9.84283389e-01]
 [  3.89462606e-02   9.61053739e-01]
 [  9.99999999e-01   5.15041153e-10]
 [  9.99999999e-01   5.15041153e-10]
 [  2.90487903e-06   9.99997095e-01]
 [  2.90487903e-06   9.99997095e-01]
 [  9.95303352e-01   4.69664764e-03]
 [  9.95303352e-01   4.69664764e-03]
 [  1.74324351e-06   9.99998257e-01]
 [  1.74324351e-06   9.99998257e-01]
 [  9.26194287e-08   9.99999907e-01]
 [  9.26194287e-08   9.99999907e-01]
 [  5.33283046e-01   4.66716954e-01]
 [  5.33283046e-01   4.66716954e-01]
 [  9.99726419e-01   2.73581353e-04]
 [  9.99726419e-01   2.73581353e-04]
 [  9.99889705e-01   1.10295457e-04]
 [  9.99961645e-01   3.83548385e-05]
 [  9.99961645e-01   3.835483

TypeError: type numpy.ndarray doesn't define __round__ method

In [17]:
print("Test House KNN Classifier: ")
print(classify_knn(cleaned_train_house, cleaned_val_house, cleaned_test_house))

Test House KNN Classifier: 
Test accuracy: 0.7650602409638554
[0.71511627906976749, 0.7558139534883721, 0.69767441860465118, 0.75, 0.7441860465116279, 0.7558139534883721, 0.73837209302325579, 0.76162790697674421, 0.7558139534883721, 0.76162790697674421, 0.7558139534883721, 0.76162790697674421, 0.76162790697674421, 0.76744186046511631, 0.76744186046511631]
