In [158]:
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNN
import numpy as np
from sklearn.tree import DecisionTreeClassifier as DTC
import statsmodels.api as sm
from pandas.core import datetools
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [192]:
def read_train_val_test(train_path, val_path, test_path):
    train_data = pd.read_csv(train_path)
    val_data = pd.read_csv(val_path)
    test_data = pd.read_csv(test_path)
    return train_data, val_data, test_data

In [189]:
senate_name_dict = {}
house_name_dict = {}
name_dict = {}

def enumerate_districts(df, type=None):
    names = df['District']
    count = 0
    for name in names:
        if type == 'senate':
            if name not in senate_name_dict:
                senate_name_dict[name] = count
                count += 1
        elif type == 'house':
            if name not in house_name_dict:
                house_name_dict[name] = count
                count += 1
        else:
            if name not in name_dict:
                name_dict[name] = count
                count += 1

def replace_district(x, type=None):
    if type == "senate":
        return senate_name_dict[x]
    elif type == "house":
        return house_name_dict[x]
    else:
        return name_dict[x]

In [219]:
def clean_data(df, phase, type=None):
    
    # replace district with number 
    df['District'] = df["District"].apply(lambda x: replace_district(x, type))
    
    # code gender and party
    df = df.drop("name", axis=1)
    df['sex'].replace('f', 1, inplace=True)
    df['sex'].replace('m', 0, inplace=True)
    df['party'].replace('Democratic', 1, inplace=True)
    df['party'].replace('Republican', 0, inplace=True)

    # fill NaN's with mean from column
    df['sex'] = df['sex'].fillna(df['sex'].mean())
    df['party'] = df['party'].fillna(df['party'].mean())
    df['Amount'] = df['Amount'].fillna(df['Amount'].mean())   
    df['vote_count'] = df['vote_count'].apply(lambda x: str(x).replace(",", "").replace('nan', 'NaN')).astype(float)
    df['vote_count'] = df['vote_count'].fillna(df['vote_count'].mean())
    df['vote_percent'] = df['vote_percent'].fillna(df['vote_percent'].mean())
        
    # add indicator for female democrat
    df['female_dem'] = 0
    for index, row in df.iterrows():
        if row.sex == 1 and row.party == 1:
            df.set_value(index, 'female_dem', 1)
    
    # remove "(percent) margin of error" columns
    df = df.iloc[:, [index for index, x in enumerate(df.columns) if 'Margin' not in x]]
    
    # remove columns with low percent contributions
    percent_cols = [col for index, col in enumerate(df.columns) if 'Percent' in col and df[col].mean() < 0.05]
    for col in percent_cols:
        df = df.drop(col, axis=1)
        df = df.drop(col.replace("Percent", "Estimate"), axis=1)
        
    if type == None:
        df.to_csv('cleaned_data/cleaned_data_merged_' + phase + '.csv', index = False)
    
    else:
        df.to_csv('cleaned_data/cleaned_data_' + str(type) + '_' + phase + '.csv', index=False)
                  
    return df

In [220]:
senate_train_data, senate_val_data, senate_test_data = read_train_val_test("train_data/merged_senate_districts_2009_2012.csv", "valid_data/merged_senate_districts_2013_2014.csv", "test_data/merged_senate_districts_2015_2016.csv")
enumerate_districts(senate_train_data, 'senate')
enumerate_districts(senate_val_data, 'senate')
enumerate_districts(senate_test_data, 'senate')

house_train_data, house_val_data, house_test_data = read_train_val_test("train_data/merged_house_districts_2009_2012.csv", "valid_data/merged_house_districts_2013_2014.csv", "test_data/merged_house_districts_2015_2016.csv")
enumerate_districts(house_train_data, 'house')
enumerate_districts(house_val_data, 'house')
enumerate_districts(house_test_data, 'house')

merged_train = pd.concat([senate_train_data, house_train_data], axis=0)
merged_val = pd.concat([senate_val_data, house_val_data], axis=0)
merged_test = pd.concat([senate_test_data, house_test_data], axis=0)
enumerate_districts(merged_train)
enumerate_districts(merged_val)
enumerate_districts(merged_test)

cleaned_train_senate = clean_data(senate_train_data, 'train', 'senate')
cleaned_val_senate = clean_data(senate_val_data, 'val', 'senate')
cleaned_test_senate = clean_data(senate_test_data, 'test', 'senate')

cleaned_train_house = clean_data(house_train_data, 'train', 'house')
cleaned_val_house = clean_data(house_val_data, 'val', 'house')
cleaned_test_house = clean_data(house_test_data, 'test', 'house')

cleaned_train_merged = clean_data(merged_train, 'train')
cleaned_val_merged = clean_data(merged_val, 'test')
cleaned_test_merged = clean_data(merged_test, 'val')

In [221]:
def classify_logistic_regression(train_df, val_df, indicator, test_df=None):
    
    if indicator == 'female':
        y_train = train_df['sex']
        X_train = train_df.drop(['sex', 'female_dem'], axis=1)

        y_val = val_df['sex']
        X_val = val_df.drop(['sex', 'female_dem'], axis=1)
    
    elif indicator == 'female_dem':
        y_train = train_df['female_dem']
        X_train = train_df.drop(['sex', 'female_dem'], axis=1)

        y_val = val_df['female_dem']
        X_val = val_df.drop(['sex', 'female_dem'], axis=1)
        
    # scale data    
    X_train = scale(X_train)
    X_val = scale(X_val)
    
#     std_scaler = StandardScaler()
#     X_train = std_scaler.fit_transform(X_train)
#     X_val = std_scaler.fit_transform(X_val)
    
#     mm_scaler = MinMaxScaler()
#     X_train = mm_scaler.fit_transform(X_train)
#     X_val = mm_scaler.fit_transform(X_val)
        
    classifier = LR()
    classifier.fit(X_train, y_train)
    pred = classifier.predict_proba(X_val)
    pred_round = [round(x) for x in pred]
        
    if test_df is not None:
        
        if indicator == 'female':
            y_test = test_def['sex']
            X_test = test_def.drop(['sex', 'female_dem'], axis=1)
    
        elif indicator == 'female_dem':
            y_test = test_df['female_dem']
            X_test = test_df.drop(['sex', 'female_dem'], axis=1)
            
        test_pred = classifier.predict_proba(X_test)
        test_pred_round = [round(x) for x in test_pred]

        return test_pred_round, accuracy_score(y_test, test_pred_round)
    
    return pred, accuracy_score(y_val, pred_round)

In [109]:
def classify_svm(train_df, val_df, test_df=None):

    if indicator == 'female':
    y_train = train_df['sex']
    X_train = train_df.drop(['sex', 'female_dem'], axis=1)

    y_val = val_df['sex']
    X_val = val_df.drop(['sex', 'female_dem'], axis=1)
    
    elif indicator == 'female_dem':
        y_train = train_df['female_dem']
        X_train = train_df.drop(['sex', 'female_dem'], axis=1)

        y_val = val_df['female_dem']
        X_val = val_df.drop(['sex', 'female_dem'], axis=1)
        
    # scale data    
    X_train = scale(X_train)
    X_val = scale(X_val)
    
#     std_scaler = StandardScaler()
#     X_train = std_scaler.fit_transform(X_train)
#     X_val = std_scaler.fit_transform(X_val)
    
#     mm_scaler = MinMaxScaler()
#     X_train = mm_scaler.fit_transform(X_train)
#     X_val = mm_scaler.fit_transform(X_val)
    
    classifier = SVC()
    classifier.fit(X_train, y_train)
      
    pred = classifier.predict_proba(X_val)
    pred_round = [round(x) for x in pred]
    
    if test_df is not None:
        
        if indicator == 'female':
            y_test = test_def['sex']
            X_test = test_def.drop(['sex', 'female_dem'], axis=1)
    
        elif indicator == 'female_dem':
            y_test = test_df['female_dem']
            X_test = test_df.drop(['sex', 'female_dem'], axis=1)

        test_pred = classifier.predict_proba(X_test)
        test_pred_round = [round(x) for x in test_pred]

        return test_pred_round, accuracy_score(y_test, test_pred_round)
    
    return pred, accuracy_score(y_val, pred_round)

In [127]:
def classify_knn(train_df, val_df, test_df=None):

    if indicator == 'female':
        y_train = train_df['sex']
        X_train = train_df.drop(['sex', 'female_dem'], axis=1)

        y_val = val_df['sex']
        X_val = val_df.drop(['sex', 'female_dem'], axis=1)

    elif indicator == 'female_dem':
        y_train = train_df['female_dem']
        X_train = train_df.drop(['sex', 'female_dem'], axis=1)

        y_val = val_df['female_dem']
        X_val = val_df.drop(['sex', 'female_dem'], axis=1)
        
    # scale data    
    X_train = scale(X_train)
    X_val = scale(X_val)
    
#     std_scaler = StandardScaler()
#     X_train = std_scaler.fit_transform(X_train)
#     X_val = std_scaler.fit_transform(X_val)
    
#     mm_scaler = MinMaxScaler()
#     X_train = mm_scaler.fit_transform(X_train)
#     X_val = mm_scaler.fit_transform(X_val)
    
    neigbors = list(range(1,16))
    accuracy = []
    preds = []
        
    for k in neighbors:
        classifier = KNN(n_neighbors=k)
        classifier.fit(X_train, y_train)
        preds.append(classifier.predict_proba(X_val))
        accuracy.append(accuracy_score(y_val, pred))
        
    best_k, best_accuracy = max(accuracy)
    pred = preds[best_k]
    pred_round = [round(x) for x in pred]
    
    classifier = KNN(n_neighbors=best_k)
    classifier.fit(X_train, y_train)
        
    if test_df is not None:
        
        if indicator == 'female':
            y_test = test_def['sex']
            X_test = test_def.drop(['sex', 'female_dem'], axis=1)
    
        elif indicator == 'female_dem':
            y_test = test_df['female_dem']
            X_test = test_df.drop(['sex', 'female_dem'], axis=1)

        test_pred = classifier.predict_proba(X_test)
        test_pred_round = [round(x) for x in test_pred]
        
        return test_pred_round, accuracy_score(y_test, test_pred_round)

    return pred, accuracy_score(y_val, pred_round)

In [128]:
def classify_dtc(train_df, val_df, test_df=None):
    
    if indicator == 'female':
        y_train = train_df['sex']
        X_train = train_df.drop(['sex', 'female_dem'], axis=1)

        y_val = val_df['sex']
        X_val = val_df.drop(['sex', 'female_dem'], axis=1)

    elif indicator == 'female_dem':
        y_train = train_df['female_dem']
        X_train = train_df.drop(['sex', 'female_dem'], axis=1)

        y_val = val_df['female_dem']
        X_val = val_df.drop(['sex', 'female_dem'], axis=1)
        
    # scale data    
    X_train = scale(X_train)
    X_val = scale(X_val)
    
#     std_scaler = StandardScaler()
#     X_train = std_scaler.fit_transform(X_train)
#     X_val = std_scaler.fit_transform(X_val)
    
#     mm_scaler = MinMaxScaler()
#     X_train = mm_scaler.fit_transform(X_train)
#     X_val = mm_scaler.fit_transform(X_val)
    
    classifier_gini = DTC(random_state=40)
    classifier_entropy = DTC(criterion='entropy', random_state=40)
    
    classifier_gini.fit(X_train, y_train)
    classifier_entropy.fit(X_train, y_train)
    
    pred_gini = classifier_gini.predict_proba(X_val)
    pred_entropy = classifier_entropy.predict_proba(X_val)
    
    pred_round_gini = [round(x) for x in pred_gini]
    pred_round_entropy = [round(x) for x in pred_entropy]
    
    accuracy_score_gini = accuracy_score(y_val, pred_round_gini)
    accuracy_score_entropy = accuracy_score(y_val, pred_round_entropy)
    
    classifer = None
    best_accuracy = None
    best_preds = None
    
    if accuracy_score_gini > accuracy_score_entropy:
        classifer = DTC(random_state=40)
        best_accuracy = accuracy_score_gini
        best_preds = pred_round_gini
    
    else:
        classifier = DTC(criterion='entropy', random_state=40)
        best_accuracy = accuracy_score_entropy
        best_preds = pred_round_entropy        
    
    if test_df is not None:
        
        if indicator == 'female':
            y_test = test_def['sex']
            X_test = test_def.drop(['sex', 'female_dem'], axis=1)
    
        elif indicator == 'female_dem':
            y_test = test_df['female_dem']
            X_test = test_df.drop(['sex', 'female_dem'], axis=1)

        test_pred = classifier.predict_proba(X_test)
        test_pred_round = [round(x) for x in test_pred]
        
        return test_pred_round, accuracy_score(y_test, test_pred_round)
        
    return best_preds, best_accuracy

In [129]:
def independent_columns(A, tol =0): #= 1e-05):
    Q, R = np.linalg.qr(A)
    independent = np.where(np.abs(R.diagonal()) > tol)[0]
    
    return independent

In [130]:
def classify_ols(train_df, val_df):
    y_train = train_df['sex']
    X_train = train_df.drop('sex', axis=1)
    
    y_val = val_df['sex']
    X_val = val_df.drop('sex', axis=1)
    
    independent = independent_columns(X_train)
    X_train = X_train.iloc[:, independent]
    X_val = X_val.iloc[:, independent]
    print("Rank is {}".format(X_train.shape[1]))
    X_train.to_csv("X_train.csv")

    classifier = sm.Logit(y_train, X_train)
    results = classifier.fit(method='ncg')
    pred = round(results.predict(X_val))
    return accuracy_score(y_val, pred)

In [131]:
def classify_glm(train_df, val_df):
    y_train = train_df['sex']
    X_train = train_df.drop('sex', axis=1)
    
    y_val = val_df['sex']
    X_val = val_df.drop('sex', axis=1)
    
    independent = independent_columns(X_train)
    X_train = X_train.iloc[:, independent]
    X_val = X_val.iloc[:, independent]
    print("Rank is {}".format(X_train.shape[1]))

    classifier = sm.GLM(y_train, X_train)
    results = classifier.fit()
    #print(results.summary())
    pred = round(results.predict(X_val))
    return accuracy_score(y_val, pred)

In [132]:
print("Logistic Regression: ")
print(classify_logistic_regression(cleaned_train_senate, cleaned_val_senate))
print("SVM: ")
print(classify_svm(cleaned_train_senate, cleaned_val_senate))
print("K-Nearest Neighbors: ")
print(classify_knn(cleaned_train_senate, cleaned_val_senate))
print("Decision Tree Classifier: ")
print(classify_dtc(cleaned_train_senate, cleaned_val_senate))
print("OLS Logistic Regression: ")
print(classify_ols(cleaned_train_senate, cleaned_val_senate))
print("OLS Generalized Linear Model: ")
print(classify_glm(cleaned_train_senate, cleaned_val_senate))

{'Berkshire, Hampshire & Franklin': 0, 'Bristol & Norfolk': 1, 'Cape & Islands': 2, 'First Bristol & Plymouth': 3, 'First Essex': 4, 'First Essex & Middlesex': 5, 'First Hampden & Hampshire': 6, 'First Middlesex': 7, 'First Middlesex & Norfolk': 8, 'First Plymouth & Bristol': 9, 'First Suffolk': 10, 'First Suffolk & Middlesex': 11, 'First Worcester': 12, 'Fourth Middlesex': 13, 'Hampden': 14, 'Hampshire & Franklin': 15, 'Middlesex & Essex': 16, 'Middlesex & Worcester': 17, 'Middlesex, Suffolk & Essex': 18, 'Norfolk & Plymouth': 19, 'Norfolk, Bristol & Middlesex': 20, 'Norfolk, Bristol & Plymouth': 21, 'Plymouth & Barnstable': 22, 'Plymouth & Norfolk': 23, 'Second Bristol & Plymouth': 24, 'Second Essex': 25, 'Second Essex & Middlesex': 26, 'Second Hampden & Hampshire': 27, 'Second Middlesex': 28, 'Second Middlesex & Norfolk': 29, 'Second Plymouth & Bristol': 30, 'Second Suffolk': 31, 'Second Suffolk & Middlesex': 32, 'Second Worcester': 33, 'Suffolk & Norfolk': 34, 'Third Essex & Middle



In [133]:
house_train_data, house_val_data, house_test_data = read_train_val_test("train_data/merged_house_districts_2009_2012.csv", "valid_data/merged_house_districts_2013_2014.csv", "test_data/merged_house_districts_2015_2016.csv")
enumerate_districts(house_train_data, "house")
enumerate_districts(house_val_data, "house")
enumerate_districts(house_test_data, "house")
print(house_name_dict)
    
cleaned_train_house = clean_data(house_train_data,"house")
cleaned_val_house = clean_data(house_val_data,"house")
cleaned_test_house = clean_data(house_test_data,"house")

print("Logistic Regression: ")
print(classify_logistic_regression(cleaned_train_house, cleaned_val_house))
print("SVM: ")
print(classify_svm(cleaned_train_house, cleaned_val_house))
print("K-Nearest Neighbors: ")
print(classify_knn(cleaned_train_house, cleaned_val_house))
print("Decision Tree Classifier: ")
print(classify_dtc(cleaned_train_house, cleaned_val_house))
print("OLS Logistic Regression: ")
print(classify_ols(cleaned_train_house, cleaned_val_house))
print("OLS Generalized Linear Model: ")
print(classify_glm(cleaned_train_house, cleaned_val_house))

{'10th Bristol': 0, '10th Essex': 1, '10th Hampden': 2, '10th Middlesex': 3, '10th Norfolk': 4, '10th Plymouth': 5, '10th Suffolk': 6, '10th Worcester': 7, '11th Bristol': 8, '11th Essex': 9, '11th Hampden': 10, '11th Middlesex': 11, '11th Norfolk': 12, '11th Plymouth': 13, '11th Suffolk': 14, '11th Worcester': 15, '12th Bristol': 16, '12th Essex': 17, '12th Hampden': 18, '12th Middlesex': 19, '12th Norfolk': 20, '12th Plymouth': 21, '12th Suffolk': 22, '12th Worcester': 23, '13th Bristol': 24, '13th Essex': 25, '13th Middlesex': 26, '13th Norfolk': 27, '13th Suffolk': 28, '13th Worcester': 29, '14th Bristol': 30, '14th Essex': 31, '14th Middlesex': 32, '14th Norfolk': 33, '14th Suffolk': 34, '14th Worcester': 35, '15th Essex': 36, '15th Middlesex': 37, '15th Norfolk': 38, '15th Suffolk': 39, '15th Worcester': 40, '16th Essex': 41, '16th Middlesex': 42, '16th Suffolk': 43, '16th Worcester': 44, '17th Essex': 45, '17th Middlesex': 46, '17th Suffolk': 47, '17th Worcester': 48, '18th Esse



In [134]:
merged_train = pd.concat([senate_train_data, house_train_data], axis=0)
merged_val = pd.concat([senate_val_data, house_val_data], axis=0)
merged_test = pd.concat([senate_test_data, house_test_data], axis=0)

enumerate_districts(merged_train)
enumerate_districts(merged_val)
enumerate_districts(merged_test)
    
cleaned_train = clean_data(merged_train)
cleaned_val = clean_data(merged_val)
cleaned_test = clean_data(house_test_data)

print("Logistic Regression: ")
print(classify_logistic_regression(cleaned_train, cleaned_val))
print("SVM: ")
print(classify_svm(cleaned_train, cleaned_val))
print("K-Nearest Neighbors: ")
print(classify_knn(cleaned_train, cleaned_val))
print("Decision Tree Classifier: ")
print(classify_dtc(cleaned_train, cleaned_val))
print("OLS Logistic Regression: ")
print(classify_ols(cleaned_train, cleaned_val))
print("OLS Generalized Linear Model: ")
print(classify_glm(cleaned_train, cleaned_val))


(415, 163)
(254, 163)
(166, 163)
Logistic Regression: 
0.759842519685
SVM: 
0.763779527559
K-Nearest Neighbors: 
-8.31238824446e-17
0.913385826772
Decision Tree Classifier: 
(0.74409448818897639, 0.75196850393700787)
OLS Logistic Regression: 
Rank is 162
Optimization terminated successfully.
         Current function value: 0.594055
         Iterations: 1
         Function evaluations: 2
         Gradient evaluations: 2
         Hessian evaluations: 1
0.744094488189
OLS Generalized Linear Model: 
Rank is 162
0.657480314961




In [138]:
print(enumerate_districts(house_test_data))
      
# enumerate_districts(merged_train)
# enumerate_districts(merged_val)
# enumerate_districts(merged_test)
    
# cleaned_train = clean_data(merged_train)
# cleaned_val = clean_data(merged_val)
# cleaned_test = clean_data(house_test_data)

# print("Logistic Regression: ")
# print(classify_logistic_regression(cleaned_train, cleaned_val))
# print("SVM: ")
# print(classify_svm(cleaned_train, cleaned_val))
# print("K-Nearest Neighbors: ")
# print(classify_knn(cleaned_train, cleaned_val))
# print("Decision Tree Classifier: ")
# print(classify_dtc(cleaned_train, cleaned_val))
# print("OLS Logistic Regression: ")
# print(classify_ols(cleaned_train, cleaned_val))
# print("OLS Generalized Linear Model: ")
# print(classify_glm(cleaned_train, cleaned_val))


None


In [125]:
print("Test senate Decision Tree Classifier: ")
print(classify_dtc(cleaned_train_senate, cleaned_val_senate, cleaned_test_senate))

Test Senate Decision Tree Classifier: 
Test accuracy: 0.7647058823529411
(0.76829268292682928, 0.68292682926829273)


In [126]:
print("Test house KNN Classifier: ")
print(classify_knn(cleaned_train_house, cleaned_val_house, cleaned_test_house))

Test House KNN Classifier: 
-8.90708842548e-18


ValueError: Expected n_neighbors > 0. Got 0