In [53]:
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNN
import numpy as np
from sklearn.tree import DecisionTreeClassifier as DTC
import statsmodels.api as sm
from pandas.core import datetools
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [54]:
def read_train_val_test(train_path, val_path, test_path):
    train_data = pd.read_csv(train_path)
    val_data = pd.read_csv(val_path)
    test_data = pd.read_csv(test_path)
    return train_data, val_data, test_data

In [55]:
senate_name_dict = {}
house_name_dict = {}
name_dict = {}

def enumerate_districts(df, type=None):
    names = df['District']
    count = 0
    for name in names:
        if type == 'senate':
            if name not in senate_name_dict:
                senate_name_dict[name] = count
                count += 1
        elif type == 'house':
            if name not in house_name_dict:
                house_name_dict[name] = count
                count += 1
        else:
            if name not in name_dict:
                name_dict[name] = count
                count += 1

def replace_district(x, type=None):
    if type == "senate":
        return senate_name_dict[x]
    elif type == "house":
        return house_name_dict[x]
    else:
        return name_dict[x]

In [56]:
def clean_data(df, phase, type=None):
    
    # replace district with number 
    df['District'] = df["District"].apply(lambda x: replace_district(x, type))
    
    # code gender and party
    df = df.drop("name", axis=1)
    df['sex'] = df['sex'].fillna(1) #(int(round((df['sex'].mean()))))
    df['sex'].replace('f', 1, inplace=True)
    df['sex'].replace('m', 0, inplace=True)
    df['sex'] = df['sex'].astype(int)
    df['party'].replace('Democratic', 1, inplace=True)
    df['party'].replace('Republican', 0, inplace=True)

    # fill NaN's with mean from column
    df['party'] = df['party'].fillna(df['party'].mean())
    df['Amount'] = df['Amount'].fillna(df['Amount'].mean())   
    df['vote_count'] = df['vote_count'].apply(lambda x: str(x).replace(",", "").replace('nan', 'NaN')).astype(float)
    df['vote_count'] = df['vote_count'].fillna(df['vote_count'].mean())
    df['vote_percent'] = df['vote_percent'].fillna(df['vote_percent'].mean())
        
    # add indicator for female democrat
    df['female_dem'] = 0
    for index, row in df.iterrows():
        if row.sex == 1 and row.party == 1:
            df.set_value(index, 'female_dem', 1)
    
    # remove "(percent) margin of error" columns
    df = df.iloc[:, [index for index, x in enumerate(df.columns) if 'Margin' not in x]]
    
    # remove columns with low percent contributions
    percent_cols = [col for index, col in enumerate(df.columns) if 'Percent' in col and df[col].mean() < 0.05]
    for col in percent_cols:
        df = df.drop(col, axis=1)
        df = df.drop(col.replace("Percent", "Estimate"), axis=1)
        
    if type == None:
        df.to_csv('cleaned_data/cleaned_data_merged_' + phase + '.csv', index = False)
    
    else:
        df.to_csv('cleaned_data/cleaned_data_' + str(type) + '_' + phase + '.csv', index=False)
                  
    return df

In [58]:
senate_train_data, senate_val_data, senate_test_data = read_train_val_test("train_data/merged_senate_districts_2009_2012.csv", "valid_data/merged_senate_districts_2013_2014.csv", "test_data/merged_senate_districts_2015_2016.csv")
enumerate_districts(senate_train_data, 'senate')
enumerate_districts(senate_val_data, 'senate')
enumerate_districts(senate_test_data, 'senate')

house_train_data, house_val_data, house_test_data = read_train_val_test("train_data/merged_house_districts_2009_2012.csv", "valid_data/merged_house_districts_2013_2014.csv", "test_data/merged_house_districts_2015_2016.csv")
enumerate_districts(house_train_data, 'house')
enumerate_districts(house_val_data, 'house')
enumerate_districts(house_test_data, 'house')

merged_train = pd.concat([senate_train_data, house_train_data], axis=0)
merged_val = pd.concat([senate_val_data, house_val_data], axis=0)
merged_test = pd.concat([senate_test_data, house_test_data], axis=0)
enumerate_districts(merged_train)
enumerate_districts(merged_val)
enumerate_districts(merged_test)

cleaned_train_senate = clean_data(senate_train_data, 'train', 'senate')
cleaned_val_senate = clean_data(senate_val_data, 'val', 'senate')
cleaned_test_senate = clean_data(senate_test_data, 'test', 'senate')

cleaned_train_house = clean_data(house_train_data, 'train', 'house')
cleaned_val_house = clean_data(house_val_data, 'val', 'house')
cleaned_test_house = clean_data(house_test_data, 'test', 'house')

cleaned_train_merged = clean_data(merged_train, 'train')
cleaned_val_merged = clean_data(merged_val, 'test')
cleaned_test_merged = clean_data(merged_test, 'val')

In [180]:
def classify_lr(train_df, val_df, indicator, test_df=None):
    
    if indicator == 'female':
        y_train = train_df['sex']
        X_train = train_df.drop(['sex', 'female_dem'], axis=1)

        y_val = val_df['sex']
        X_val = val_df.drop(['sex', 'female_dem'], axis=1)
    
    elif indicator == 'female_dem':
        y_train = train_df['female_dem']
        X_train = train_df.drop(['sex', 'female_dem', 'party'], axis=1)

        y_val = val_df['female_dem']
        X_val = val_df.drop(['sex', 'female_dem', 'party'], axis=1)
        
#   scale data    
#     X_train = scale(X_train)
#     X_val = scale(X_val)
    
    std_scaler = StandardScaler()
    X_train = std_scaler.fit_transform(X_train)
    X_val = std_scaler.fit_transform(X_val)
    
#     mm_scaler = MinMaxScaler()
#     X_train = mm_scaler.fit_transform(X_train)
#     X_val = mm_scaler.fit_transform(X_val)
        
    classifier = LR()
    classifier.fit(X_train, y_train)
    pred = classifier.predict_proba(X_val)
    pred = np.delete(pred, 1, 1)    
    pred_round =  classifier.predict(X_val)
                
    if test_df is not None:        
        if indicator == 'female':
            y_test = test_df['sex']
            X_test = test_df.drop(['sex', 'female_dem'], axis=1)
    
        elif indicator == 'female_dem':
            y_test = test_df['female_dem']
            X_test = test_df.drop(['sex', 'female_dem', 'party'], axis=1)
         
        #X_test = scale(X_test)
        X_test = std_scaler.fit_transform(X_test)
        
        test_pred = classifier.predict_proba(X_test)
        test_pred = np.delete(test_pred, 1, 1)    
        test_pred_round = classifier.predict(X_test)
        
        return test_pred, accuracy_score(y_test, test_pred_round)
    
    return pred, accuracy_score(y_val, pred_round)

In [181]:
def classify_svm(train_df, val_df, indicator, test_df=None):

    if indicator == 'female':
        y_train = train_df['sex']
        X_train = train_df.drop(['sex', 'female_dem'], axis=1)

        y_val = val_df['sex']
        X_val = val_df.drop(['sex', 'female_dem'], axis=1)

    elif indicator == 'female_dem':
        y_train = train_df['female_dem']
        X_train = train_df.drop(['sex', 'female_dem', 'party'], axis=1)

        y_val = val_df['female_dem']
        X_val = val_df.drop(['sex', 'female_dem', 'party'], axis=1)
        
# scale data    
#     X_train = scale(X_train)
#     X_val = scale(X_val)
    
    std_scaler = StandardScaler()
    X_train = std_scaler.fit_transform(X_train)
    X_val = std_scaler.fit_transform(X_val)
    
#     mm_scaler = MinMaxScaler()
#     X_train = mm_scaler.fit_transform(X_train)
#     X_val = mm_scaler.fit_transform(X_val)
    
    classifier = SVC(probability=True)
    classifier.fit(X_train, y_train)
      
    pred = classifier.predict_proba(X_val)
    pred = np.delete(pred, 1, 1)    
    pred_round = classifier.predict(X_val)
    
    if test_df is not None:
        
        if indicator == 'female':
            y_test = test_df['sex']
            X_test = test_df.drop(['sex', 'female_dem'], axis=1)

        elif indicator == 'female_dem':
            y_test = test_df['female_dem']
            X_test = test_df.drop(['sex', 'female_dem', 'party'], axis=1)

        #X_test = scale(X_test)
        X_test = std_scaler.fit_transform(X_test)

        test_pred = classifier.predict_proba(X_test)
        test_pred = np.delete(test_pred, 1, 1)    
        test_pred_round = classifier.predict(X_test)

        return test_pred, accuracy_score(y_test, test_pred_round)
    
    return pred, accuracy_score(y_val, pred_round)

In [182]:
import math

def classify_knn(train_df, val_df, indicator, test_df=None):

    if indicator == 'female':
        y_train = train_df['sex']
        X_train = train_df.drop(['sex', 'female_dem'], axis=1)

        y_val = val_df['sex']
        X_val = val_df.drop(['sex', 'female_dem'], axis=1)

    elif indicator == 'female_dem':
        y_train = train_df['female_dem']
        X_train = train_df.drop(['sex', 'female_dem', 'party'], axis=1)

        y_val = val_df['female_dem']
        X_val = val_df.drop(['sex', 'female_dem', 'party'], axis=1)
        
#   scale data    
#     X_train = scale(X_train)
#     X_val = scale(X_val)
    
    std_scaler = StandardScaler()
    X_train = std_scaler.fit_transform(X_train)
    X_val = std_scaler.fit_transform(X_val)
    
#     mm_scaler = MinMaxScaler()
#     X_train = mm_scaler.fit_transform(X_train)
#     X_val = mm_scaler.fit_transform(X_val)
    
    neighbors = list(range(1,16))
    accuracy = []
    preds = np.zeros((y_val.shape[0],1))
    preds_round = np.zeros((y_val.shape[0],1))
        
    for k in neighbors:
        classifier = KNN(n_neighbors=k)
        classifier.fit(X_train, y_train)
        k_pred = classifier.predict_proba(X_val)
        k_pred = np.delete(k_pred, 1, 1) 
        preds = np.concatenate((preds, k_pred), axis=1)
        k_pred_round = (classifier.predict(X_val)).reshape((y_val.shape[0],1))
        preds_round = np.concatenate((preds_round, k_pred_round), axis=1)
        accuracy.append(accuracy_score(y_val, k_pred_round))
        
    preds = np.delete(preds, 0, 1)
    preds_round = np.delete(preds_round, 0, 1)
    
    best_k = None
    best_accuracy = -math.inf
    
    for k, acc in enumerate(accuracy):
        if acc > best_accuracy:
            best_accuracy = acc
            best_k = k + 1  
    
    pred = preds[:,best_k]
    pred_round = preds_round[:,best_k]
    classifier = KNN(n_neighbors=best_k)
    classifier.fit(X_train, y_train)
        
    if test_df is not None:
        
        if indicator == 'female':
            y_test = test_df['sex']
            X_test = test_df.drop(['sex', 'female_dem'], axis=1)
    
        elif indicator == 'female_dem':
            y_test = test_df['female_dem']
            X_test = test_df.drop(['sex', 'female_dem', 'party'], axis=1)

        #X_test = scale(X_test)
        X_test = std_scaler.fit_transform(X_test)

        test_pred = classifier.predict_proba(X_test)
        test_pred = np.delete(test_pred, 1, 1)    
        test_pred_round = classifier.predict(X_test)
        
        return test_pred, accuracy_score(y_test, test_pred_round), best_k

    return pred, accuracy_score(y_val, pred_round), best_k

In [183]:
def classify_dtc(train_df, val_df, indicator, test_df=None):
    
    if indicator == 'female':
        y_train = train_df['sex']
        X_train = train_df.drop(['sex', 'female_dem'], axis=1)

        y_val = val_df['sex']
        X_val = val_df.drop(['sex', 'female_dem'], axis=1)

    elif indicator == 'female_dem':
        y_train = train_df['female_dem']
        X_train = train_df.drop(['sex', 'female_dem', 'party'], axis=1)

        y_val = val_df['female_dem']
        X_val = val_df.drop(['sex', 'female_dem', 'party'], axis=1)
        
#   scale data    
#     X_train = scale(X_train)
#     X_val = scale(X_val)
    
    std_scaler = StandardScaler()
    X_train = std_scaler.fit_transform(X_train)
    X_val = std_scaler.fit_transform(X_val)
    
#     mm_scaler = MinMaxScaler()
#     X_train = mm_scaler.fit_transform(X_train)
#     X_val = mm_scaler.fit_transform(X_val)
    
    classifier_gini = DTC(random_state=40)
    classifier_entropy = DTC(criterion='entropy', random_state=40)
    
    classifier_gini.fit(X_train, y_train)
    classifier_entropy.fit(X_train, y_train)
    
    pred_gini = classifier_gini.predict_proba(X_val)
    pred_gini = np.delete(pred_gini, 1, 1)    
    pred_entropy = classifier_entropy.predict_proba(X_val)
    pred_entropy = np.delete(pred_entropy, 1, 1)    
    pred_round_gini = classifier_gini.predict(X_val)
    pred_round_entropy = classifier_entropy.predict(X_val)
    
    accuracy_score_gini = accuracy_score(y_val, pred_round_gini)
    accuracy_score_entropy = accuracy_score(y_val, pred_round_entropy)
    
    classifer = None
    best_accuracy = None
    best_preds = None
    
    if accuracy_score_gini > accuracy_score_entropy:
        classifier = DTC(random_state=40)
        classifier.fit(X_train, y_train)

        best_accuracy = accuracy_score_gini
        best_preds = pred_round_gini
    
    else:
        classifier = DTC(criterion='entropy', random_state=40)
        classifier.fit(X_train, y_train)

        best_accuracy = accuracy_score_entropy
        best_preds = pred_round_entropy        
    
    if test_df is not None:
              
        if indicator == 'female':
            y_test = test_df['sex']
            X_test = test_df.drop(['sex', 'female_dem'], axis=1)
    
        elif indicator == 'female_dem':
            y_test = test_df['female_dem']
            X_test = test_df.drop(['sex', 'female_dem', 'party'], axis=1)

        #X_test = scale(X_test)
        X_test = std_scaler.fit_transform(X_test)

        test_pred = classifier.predict_proba(X_test)
        test_pred = np.delete(test_pred, 1, 1)    
        test_pred_round = classifier.predict(X_test)
        
        return test_pred, accuracy_score(y_test, test_pred_round)
        
    return best_preds, best_accuracy

In [143]:
def senate_results():
    tests = [None, cleaned_test_senate]
    indicators = ['female', 'female_dem']
    print_outputs = ['Senate, val, indicator=female, LR: ',
                   'Senate, val, indicator=female, SVM: ',
                   'Senate, val, indicator=female, KNN: ',
                   'Senate, val, indicator=female, DTC: ',
                    'Senate, val, indicator=femaleDem, LR: ',
                   'Senate, val, indicator=femaleDem, SVM: ',
                   'Senate, val, indicator=femaleDem, KNN: ',
                   'Senate, val, indicator=femaleDem, DTC: ',
                    'Senate, test, indicator=female, LR: ',
                   'Senate, test, indicator=female, SVM: ',
                   'Senate, test, indicator=female, KNN: ',
                   'Senate, test, indicator=female, DTC: ',
                   'Senate, test, indicator=femaleDem, LR: ',
                   'Senate, test, indicator=femaleDem, SVM: ',
                   'Senate, test, indicator=femaleDem, KNN: ',
                   'Senate, test, indicator=femaleDem, DTC: ']
    print_index = 0
    preds_keys = ['lr_val_fem_pred', 'svm_val_fem_pred', 'knn_val_fem_pred', 'dtc_val_fem_pred',
                  'lr_val_femDem_pred', 'svm_val_femDem_pred', 'knn_val_femDem_pred', 'dtc_val_femDem_pred',
                 'lr_test_fem_pred', 'svm_test_fem_pred', 'knn_test_fem_pred', 'dtc_test_fem_pred',
                 'lr_test_femDem_pred', 'svm_test_femDem_pred', 'knn_test_femDem_pred', 'dtc_test_femDem_pred']
    preds = {}
    preds_index = 0
    
    for test in tests:
            for indicator in indicators:
                lr_pred, lr_accuracy = classify_lr(cleaned_train_senate, cleaned_val_senate, indicator, test)
                svm_pred, svm_accuracy = classify_svm(cleaned_train_senate, cleaned_val_senate, indicator, test)
                knn_pred, knn_accuracy, best_k = classify_knn(cleaned_train_senate, cleaned_val_senate, indicator, test)
                dtc_pred, dtc_accuracy = classify_dtc(cleaned_train_senate, cleaned_val_senate, indicator, test)
                
                preds[preds_keys[preds_index]] = lr_pred
                preds_index +=1
                preds[preds_keys[preds_index]] = svm_pred
                preds_index +=1
                preds[preds_keys[preds_index]] = knn_pred
                preds_index +=1
                preds[preds_keys[preds_index]] = dtc_pred
                preds_index +=1
             
                print(print_outputs[print_index] + str(lr_accuracy))
                print_index += 1
                print(print_outputs[print_index] + str(svm_accuracy))
                print_index += 1
                print(print_outputs[print_index] + str(knn_accuracy))
                print_index += 1
                print(print_outputs[print_index] + str(dtc_accuracy))
                print_index += 1
                print('\n')
    return preds

In [144]:
def house_results():
    tests = [None, cleaned_test_house]
    indicators = ['female', 'female_dem']
    print_outputs = ['House, val, indicator=female, LR: ',
                   'House, val, indicator=female, SVM: ',
                   'House, val, indicator=female, KNN: ',
                   'House, val, indicator=female, DTC: ',
                    'House, val, indicator=femaleDem, LR: ',
                   'House, val, indicator=femaleDem, SVM: ',
                   'House, val, indicator=femaleDem, KNN: ',
                   'House, val, indicator=femaleDem, DTC: ',
                    'House, test, indicator=female, LR: ',
                   'House, test, indicator=female, SVM: ',
                   'House, test, indicator=female, KNN: ',
                   'House, test, indicator=female, DTC: ',
                   'House, test, indicator=femaleDem, LR: ',
                   'House, test, indicator=femaleDem, SVM: ',
                   'House, test, indicator=femaleDem, KNN: ',
                   'House, test, indicator=femaleDem, DTC: ']
    print_index = 0
    preds_keys = ['lr_val_fem_pred', 'svm_val_fem_pred', 'knn_val_fem_pred', 'dtc_val_fem_pred',
                  'lr_val_femDem_pred', 'svm_val_femDem_pred', 'knn_val_femDem_pred', 'dtc_val_femDem_pred',
                 'lr_test_fem_pred', 'svm_test_fem_pred', 'knn_test_fem_pred', 'dtc_test_fem_pred',
                 'lr_test_femDem_pred', 'svm_test_femDem_pred', 'knn_test_femDem_pred', 'dtc_test_femDem_pred']
    preds = {}
    preds_index = 0
    
    for test in tests:
            for indicator in indicators:
                lr_pred, lr_accuracy = classify_lr(cleaned_train_house, cleaned_val_house, indicator, test )
                svm_pred, svm_accuracy = classify_svm(cleaned_train_house, cleaned_val_house, indicator, test)
                knn_pred, knn_accuracy, best_k = classify_knn(cleaned_train_house, cleaned_val_house, indicator, test)
                dtc_pred, dtc_accuracy = classify_dtc(cleaned_train_house, cleaned_val_house, indicator, test)
                
                preds[preds_keys[preds_index]] = lr_pred
                preds_index +=1
                preds[preds_keys[preds_index]] = svm_pred
                preds_index +=1
                preds[preds_keys[preds_index]] = knn_pred
                preds_index +=1
                preds[preds_keys[preds_index]] = dtc_pred
                preds_index +=1
             
                print(print_outputs[print_index] + str(lr_accuracy))
                print_index += 1
                print(print_outputs[print_index] + str(svm_accuracy))
                print_index += 1
                print(print_outputs[print_index] + str(knn_accuracy))
                print_index += 1
                print(print_outputs[print_index] + str(dtc_accuracy))
                print_index += 1
                print('\n')
    return preds

In [145]:
def merged_results():
    tests = [None, cleaned_test_merged, cleaned_test_senate, cleaned_test_house]
    indicators = ['female', 'female_dem']
    print_outputs = ['Merged, val, indicator=female, LR: ',
                   'Merged, val, indicator=female, SVM: ',
                   'Merged, val, indicator=female, KNN: ',
                   'Merged, val, indicator=female, DTC: ',
                    'Merged, val, indicator=femaleDem, LR: ',
                   'Merged, val, indicator=femaleDem, SVM: ',
                   'Merged, val, indicator=femaleDem, KNN: ',
                   'Merged, val, indicator=femaleDem, DTC: ',
                    'Merged, test, indicator=female, LR: ',
                   'Merged, test, indicator=female, SVM: ',
                   'Merged, test, indicator=female, KNN: ',
                   'Merged, test, indicator=female, DTC: ',
                   'Merged, test, indicator=femaleDem, LR: ',
                   'Merged, test, indicator=femaleDem, SVM: ',
                   'Merged, test, indicator=femaleDem, KNN: ',
                   'Merged, test, indicator=femaleDem, DTC: ',
                    'Merged, test on senate, indicator=female, LR: ',
                   'Merged, test on senate, indicator=female, SVM: ',
                   'Merged, test on senate, indicator=female, KNN: ',
                   'Merged, test on senate, indicator=female, DTC: ',
                   'Merged, test on senate, indicator=femaleDem, LR: ',
                   'Merged, test on senate, indicator=femaleDem, SVM: ',
                   'Merged, test on senate, indicator=femaleDem, KNN: ',
                   'Merged, test on senate, indicator=femaleDem, DTC: ',
                    'Merged, test on house, indicator=female, LR: ',
                   'Merged, test on house, indicator=female, SVM: ',
                   'Merged, test on house, indicator=female, KNN: ',
                   'Merged, test on house, indicator=female, DTC: ',
                   'Merged, test on house, indicator=femaleDem, LR: ',
                   'Merged, test on house, indicator=femaleDem, SVM: ',
                   'Merged, test on house, indicator=femaleDem, KNN: ',
                   'Merged, test on house, indicator=femaleDem, DTC: ']
    print_index = 0
    
    print_index = 0
    preds_keys =['lr_val_fem_pred', 'svm_val_fem_pred', 'knn_val_fem_pred', 'dtc_val_fem_pred',
                  'lr_val_femDem_pred', 'svm_val_femDem_pred', 'knn_val_femDem_pred', 'dtc_val_femDem_pred',
                 'lr_test_fem_pred', 'svm_test_fem_pred', 'knn_test_fem_pred', 'dtc_test_fem_pred',
                 'lr_test_femDem_pred', 'svm_test_femDem_pred', 'knn_test_femDem_pred', 'dtc_test_femDem_pred',
                 'lr_testSenate_fem_pred', 'svm_testSenate_fem_pred', 'knn_testSenate_fem_pred', 'dtc_testSenate_fem_pred',
                 'lr_testSenate_femDem_pred', 'svm_testSenate_femDem_pred', 'knn_testSenate_femDem_pred', 'dtc_testSenate_femDem_pred',
                 'lr_testHouse_fem_pred', 'svm_testHouse_fem_pred', 'knn_testHouse_fem_pred', 'dtc_testHouse_fem_pred',
                 'lr_testHouse_femDem_pred', 'svm_testHouse_femDem_pred', 'knn_testHouse_femDem_pred', 'dtc_testHouse_femDem_pred']      
    preds = {}
    preds_index = 0
    
    for test in tests:
            for indicator in indicators:
                lr_pred, lr_accuracy = classify_lr(cleaned_train_merged, cleaned_val_merged, indicator, test)
                svm_pred, svm_accuracy = classify_svm(cleaned_train_merged, cleaned_val_merged, indicator, test)
                knn_pred, knn_accuracy, best_k = classify_knn(cleaned_train_merged, cleaned_val_merged, indicator, test)
                dtc_pred, dtc_accuracy = classify_dtc(cleaned_train_merged, cleaned_val_merged, indicator, test)
                
                preds[preds_keys[preds_index]] = lr_pred
                preds_index +=1
                preds[preds_keys[preds_index]] = svm_pred
                preds_index +=1
                preds[preds_keys[preds_index]] = knn_pred
                preds_index +=1
                preds[preds_keys[preds_index]] = dtc_pred
                preds_index +=1
             
                print(print_outputs[print_index] + str(lr_accuracy))
                print_index += 1
                print(print_outputs[print_index] + str(svm_accuracy))
                print_index += 1
                print(print_outputs[print_index] + str(knn_accuracy))
                print_index += 1
                print(print_outputs[print_index] + str(dtc_accuracy))
                print_index += 1
                print('\n')
    return preds

In [184]:
senate_predictions = senate_results()

Senate, val, indicator=female, LR: 0.682926829268
Senate, val, indicator=female, SVM: 0.756097560976
Senate, val, indicator=female, KNN: 0.756097560976
Senate, val, indicator=female, DTC: 0.768292682927


Senate, val, indicator=femaleDem, LR: 0.634146341463
Senate, val, indicator=femaleDem, SVM: 0.743902439024
Senate, val, indicator=femaleDem, KNN: 0.817073170732
Senate, val, indicator=femaleDem, DTC: 0.670731707317


Senate, test, indicator=female, LR: 0.651162790698
Senate, test, indicator=female, SVM: 0.744186046512
Senate, test, indicator=female, KNN: 0.883720930233
Senate, test, indicator=female, DTC: 0.720930232558


Senate, test, indicator=femaleDem, LR: 0.697674418605
Senate, test, indicator=femaleDem, SVM: 0.767441860465
Senate, test, indicator=femaleDem, KNN: 0.860465116279
Senate, test, indicator=femaleDem, DTC: 0.604651162791




In [185]:
house_predictions = house_results()

House, val, indicator=female, LR: 0.674418604651
House, val, indicator=female, SVM: 0.738372093023
House, val, indicator=female, KNN: 0.866279069767
House, val, indicator=female, DTC: 0.755813953488


House, val, indicator=femaleDem, LR: 0.813953488372
House, val, indicator=femaleDem, SVM: 0.831395348837
House, val, indicator=femaleDem, KNN: 0.877906976744
House, val, indicator=femaleDem, DTC: 0.813953488372


House, test, indicator=female, LR: 0.632530120482
House, test, indicator=female, SVM: 0.704819277108
House, test, indicator=female, KNN: 0.813253012048
House, test, indicator=female, DTC: 0.686746987952


House, test, indicator=femaleDem, LR: 0.777108433735
House, test, indicator=femaleDem, SVM: 0.825301204819
House, test, indicator=femaleDem, KNN: 0.855421686747
House, test, indicator=femaleDem, DTC: 0.753012048193




In [186]:
merged_predictions = merged_results()

Merged, val, indicator=female, LR: 0.657480314961
Merged, val, indicator=female, SVM: 0.759842519685
Merged, val, indicator=female, KNN: 0.854330708661
Merged, val, indicator=female, DTC: 0.665354330709


Merged, val, indicator=femaleDem, LR: 0.653543307087
Merged, val, indicator=femaleDem, SVM: 0.688976377953
Merged, val, indicator=femaleDem, KNN: 0.763779527559
Merged, val, indicator=femaleDem, DTC: 0.59842519685


Merged, test, indicator=female, LR: 0.622009569378
Merged, test, indicator=female, SVM: 0.741626794258
Merged, test, indicator=female, KNN: 0.799043062201
Merged, test, indicator=female, DTC: 0.746411483254


Merged, test, indicator=femaleDem, LR: 0.698564593301
Merged, test, indicator=femaleDem, SVM: 0.741626794258
Merged, test, indicator=femaleDem, KNN: 0.755980861244
Merged, test, indicator=femaleDem, DTC: 0.674641148325


Merged, test on senate, indicator=female, LR: 0.581395348837
Merged, test on senate, indicator=female, SVM: 0.697674418605
Merged, test on senate, in

In [194]:
senate_fem_pred = merged_predictions['svm_testSenate_fem_pred']
senate_femDem_pred = merged_predictions['svm_testSenate_femDem_pred']
house_fem_pred = merged_predictions['svm_testHouse_fem_pred']
house_femDem_pred = merged_predictions['svm_testHouse_femDem_pred']

In [195]:
senate_district_num_dict = {}
house_district_num_dict = {}
district_num_dict = {}

for key, value in senate_name_dict.items():
    senate_district_num_dict[value] = key

for key, value in house_name_dict.items():
    house_district_num_dict[value] = key

for key, value in name_dict.items():
    district_num_dict[value] = key
    
def replace_district(x, type=None):
    if type == "senate":
        return senate_district_num_dict[x]
    elif type == "house":
        return house_district_num_dict[x]
    else:
        return district_num_dict[x]


sen_pred_df = cleaned_test_senate.copy()
sen_pred_df['Probability_Female'] = 0.0
sen_pred_df['Probability_Female_Dem'] = 0.0

sen_pred_df = sen_pred_df[['District','Probability_Female', 'Probability_Female_Dem']]
for i, row in sen_pred_df.iterrows():
    sen_pred_df.set_value(i, 'Probability_Female', senate_fem_pred[i])
    sen_pred_df.set_value(i, 'Probability_Female_Dem', senate_femDem_pred[i])


sen_pred_df.District = sen_pred_df.District.apply(lambda x: replace_district(x, 'senate'))

sen_pred_df = sen_pred_df.groupby('District').mean().reset_index()


house_pred_df = cleaned_test_house.copy()
house_pred_df['Probability_Female'] = 0.0
house_pred_df['Probability_Female_Dem'] = 0.0

house_pred_df = house_pred_df[['District','Probability_Female', 'Probability_Female_Dem']]
for i, row in house_pred_df.iterrows():
    house_pred_df.set_value(i, 'Probability_Female', house_fem_pred[i])
    house_pred_df.set_value(i, 'Probability_Female_Dem', house_femDem_pred[i])

house_pred_df.District = house_pred_df.District.apply(lambda x: replace_district(x, 'house'))

house_pred_df = house_pred_df.groupby('District').mean().reset_index()

In [196]:
sen_pred_df.to_csv('predictions/senate_pred.csv', index=False)
house_pred_df.to_csv('predictions/house_pred.csv', index=False)