In [594]:
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNN
import numpy as np
from sklearn.tree import DecisionTreeClassifier as DTC
import statsmodels.api as sm
from pandas.core import datetools
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [595]:
def read_train_val_test(train_path, val_path, test_path):
    train_data = pd.read_csv(train_path)
    val_data = pd.read_csv(val_path)
    test_data = pd.read_csv(test_path)
    return train_data, val_data, test_data

In [596]:
senate_name_dict = {}
house_name_dict = {}
name_dict = {}

def enumerate_districts(df, type=None):
    names = df['District']
    count = 0
    for name in names:
        if type == 'senate':
            if name not in senate_name_dict:
                senate_name_dict[name] = count
                count += 1
        elif type == 'house':
            if name not in house_name_dict:
                house_name_dict[name] = count
                count += 1
        else:
            if name not in name_dict:
                name_dict[name] = count
                count += 1

def replace_district(x, type=None):
    if type == "senate":
        return senate_name_dict[x]
    elif type == "house":
        return house_name_dict[x]
    else:
        return name_dict[x]

In [597]:
def clean_data(df, phase, type=None):
    
    # replace district with number 
    df['District'] = df["District"].apply(lambda x: replace_district(x, type))
    
    # code gender and party
    df = df.drop("name", axis=1)
    df['sex'] = df['sex'].fillna(1) #(int(round((df['sex'].mean()))))
    df['sex'].replace('f', 1, inplace=True)
    df['sex'].replace('m', 0, inplace=True)
    df['sex'] = df['sex'].astype(int)
    df['party'].replace('Democratic', 1, inplace=True)
    df['party'].replace('Republican', 0, inplace=True)

    # fill NaN's with mean from column
    df['party'] = df['party'].fillna(df['party'].mean())
    df['Amount'] = df['Amount'].fillna(df['Amount'].mean())   
    df['vote_count'] = df['vote_count'].apply(lambda x: str(x).replace(",", "").replace('nan', 'NaN')).astype(float)
    df['vote_count'] = df['vote_count'].fillna(df['vote_count'].mean())
    df['vote_percent'] = df['vote_percent'].fillna(df['vote_percent'].mean())
        
    # add indicator for female democrat
    df['female_dem'] = 0
    for index, row in df.iterrows():
        if row.sex == 1 and row.party == 1:
            df.set_value(index, 'female_dem', 1)
    
    # remove "(percent) margin of error" columns
    df = df.iloc[:, [index for index, x in enumerate(df.columns) if 'Margin' not in x]]
    
    # remove columns with low percent contributions
    percent_cols = [col for index, col in enumerate(df.columns) if 'Percent' in col and df[col].mean() < 0.05]
    for col in percent_cols:
        df = df.drop(col, axis=1)
        df = df.drop(col.replace("Percent", "Estimate"), axis=1)
        
    if type == None:
        df.to_csv('cleaned_data/cleaned_data_merged_' + phase + '.csv', index = False)
    
    else:
        df.to_csv('cleaned_data/cleaned_data_' + str(type) + '_' + phase + '.csv', index=False)
                  
    return df

In [598]:
senate_train_data, senate_val_data, senate_test_data = read_train_val_test("train_data/merged_senate_districts_2009_2012.csv", "valid_data/merged_senate_districts_2013_2014.csv", "test_data/merged_senate_districts_2015_2016.csv")
enumerate_districts(senate_train_data, 'senate')
enumerate_districts(senate_val_data, 'senate')
enumerate_districts(senate_test_data, 'senate')

house_train_data, house_val_data, house_test_data = read_train_val_test("train_data/merged_house_districts_2009_2012.csv", "valid_data/merged_house_districts_2013_2014.csv", "test_data/merged_house_districts_2015_2016.csv")
enumerate_districts(house_train_data, 'house')
enumerate_districts(house_val_data, 'house')
enumerate_districts(house_test_data, 'house')

merged_train = pd.concat([senate_train_data, house_train_data], axis=0)
merged_val = pd.concat([senate_val_data, house_val_data], axis=0)
merged_test = pd.concat([senate_test_data, house_test_data], axis=0)
enumerate_districts(merged_train)
enumerate_districts(merged_val)
enumerate_districts(merged_test)

cleaned_train_senate = clean_data(senate_train_data, 'train', 'senate')
cleaned_val_senate = clean_data(senate_val_data, 'val', 'senate')
cleaned_test_senate = clean_data(senate_test_data, 'test', 'senate')

cleaned_train_house = clean_data(house_train_data, 'train', 'house')
cleaned_val_house = clean_data(house_val_data, 'val', 'house')
cleaned_test_house = clean_data(house_test_data, 'test', 'house')

cleaned_train_merged = clean_data(merged_train, 'train')
cleaned_val_merged = clean_data(merged_val, 'test')
cleaned_test_merged = clean_data(merged_test, 'val')

In [567]:
def classify_lr(train_df, val_df, indicator, test_df=None):
    
    if indicator == 'female':
        y_train = train_df['sex']
        X_train = train_df.drop(['sex', 'female_dem'], axis=1)

        y_val = val_df['sex']
        X_val = val_df.drop(['sex', 'female_dem'], axis=1)
    
    elif indicator == 'female_dem':
        y_train = train_df['female_dem']
        X_train = train_df.drop(['sex', 'female_dem'], axis=1)

        y_val = val_df['female_dem']
        X_val = val_df.drop(['sex', 'female_dem'], axis=1)
        
#   scale data    
#     X_train = scale(X_train)
#     X_val = scale(X_val)
    
    std_scaler = StandardScaler()
    X_train = std_scaler.fit_transform(X_train)
    X_val = std_scaler.fit_transform(X_val)
    
#     mm_scaler = MinMaxScaler()
#     X_train = mm_scaler.fit_transform(X_train)
#     X_val = mm_scaler.fit_transform(X_val)
        
    classifier = LR()
    classifier.fit(X_train, y_train)
    pred = classifier.predict_proba(X_val)
    pred = np.delete(pred, 1, 1)    
    pred_round =  classifier.predict(X_val)
    print(pred)
                
    if test_df is not None:
        print('here')
        print(X_train.shape)
        print(y_train.shape)
        
        if indicator == 'female':
            y_test = test_df['sex']
            X_test = test_df.drop(['sex', 'female_dem'], axis=1)
    
        elif indicator == 'female_dem':
            y_test = test_df['female_dem']
            X_test = test_df.drop(['sex', 'female_dem'], axis=1)
        
        test_pred = classifier.predict_proba(X_test)
        #test_pred = np.delete(test_pred, 1, 1)    
        test_pred_round = classifier.predict(X_test)
        
        print(test_pred)
        #print(test_pred_round)
        
        return test_pred, accuracy_score(y_test, test_pred_round)
    
    return pred, accuracy_score(y_val, pred_round)

In [568]:
merged_test_senate_lr_female_preds, merged_test_senate_lr_female_accuracy = classify_lr(cleaned_train_merged, cleaned_val_merged, 'female')

merged_test_senate_lr_femaleDem_preds, merged_test_senate_lr_femaleDem_accuracy = classify_lr(cleaned_train_merged, cleaned_val_merged, 'female_dem', cleaned_test_senate)


[[ 0.87616329]
 [ 0.89720153]
 [ 0.89720153]
 [ 0.73762693]
 [ 0.73762693]
 [ 0.56263146]
 [ 0.72627923]
 [ 0.56263146]
 [ 0.72627923]
 [ 0.77548632]
 [ 0.77548632]
 [ 0.33200142]
 [ 0.33200142]
 [ 0.73746523]
 [ 0.73746523]
 [ 0.35640211]
 [ 0.35640211]
 [ 0.21655805]
 [ 0.21655805]
 [ 0.43597393]
 [ 0.43597393]
 [ 0.83421019]
 [ 0.83421019]
 [ 0.93399075]
 [ 0.96905367]
 [ 0.96905367]
 [ 0.99942609]
 [ 0.99942609]
 [ 0.79158071]
 [ 0.79158071]
 [ 0.9270574 ]
 [ 0.9270574 ]
 [ 0.83393972]
 [ 0.83393972]
 [ 0.8615711 ]
 [ 0.8615711 ]
 [ 0.99936331]
 [ 0.99936331]
 [ 0.99925069]
 [ 0.99925069]
 [ 0.97146314]
 [ 0.97146314]
 [ 0.96833952]
 [ 0.96833952]
 [ 0.85607807]
 [ 0.85607807]
 [ 0.98155321]
 [ 0.98155321]
 [ 0.91584121]
 [ 0.91584121]
 [ 0.91758514]
 [ 0.91758514]
 [ 0.42231954]
 [ 0.42231954]
 [ 0.83612968]
 [ 0.83612968]
 [ 0.85373305]
 [ 0.85373305]
 [ 0.78578609]
 [ 0.83937595]
 [ 0.83937595]
 [ 0.66141366]
 [ 0.66141366]
 [ 0.94897817]
 [ 0.94897817]
 [ 0.64221405]
 [ 0.64221

  np.exp(prob, prob)


In [470]:
def classify_svm(train_df, val_df, indicator, test_df=None):

    if indicator == 'female':
        y_train = train_df['sex']
        X_train = train_df.drop(['sex', 'female_dem'], axis=1)

        y_val = val_df['sex']
        X_val = val_df.drop(['sex', 'female_dem'], axis=1)

    elif indicator == 'female_dem':
        y_train = train_df['female_dem']
        X_train = train_df.drop(['sex', 'female_dem'], axis=1)

        y_val = val_df['female_dem']
        X_val = val_df.drop(['sex', 'female_dem'], axis=1)
        
#   scale data    
#     X_train = scale(X_train)
#     X_val = scale(X_val)
    
    std_scaler = StandardScaler()
    X_train = std_scaler.fit_transform(X_train)
    X_val = std_scaler.fit_transform(X_val)
    
#     mm_scaler = MinMaxScaler()
#     X_train = mm_scaler.fit_transform(X_train)
#     X_val = mm_scaler.fit_transform(X_val)
    
    classifier = SVC(probability=True)
    classifier.fit(X_train, y_train)
      
    pred = classifier.predict_proba(X_val)
    pred = np.delete(pred, 1, 1)    
    pred_round = classifier.predict(X_val)
    
    if test_df is not None:
        
        if indicator == 'female':
            y_test = test_df['sex']
            X_test = test_df.drop(['sex', 'female_dem'], axis=1)

        elif indicator == 'female_dem':
            y_test = test_df['female_dem']
            X_test = test_df.drop(['sex', 'female_dem'], axis=1)

        test_pred = classifier.predict_proba(X_test)
        test_pred = np.delete(test_pred, 1, 1)    
        test_pred_round = classifier.predict(X_test)

        return test_pred, accuracy_score(y_test, test_pred_round)
    
    return pred, accuracy_score(y_val, pred_round)

In [471]:
import math

def classify_knn(train_df, val_df, indicator, test_df=None):

    if indicator == 'female':
        y_train = train_df['sex']
        X_train = train_df.drop(['sex', 'female_dem'], axis=1)

        y_val = val_df['sex']
        X_val = val_df.drop(['sex', 'female_dem'], axis=1)

    elif indicator == 'female_dem':
        y_train = train_df['female_dem']
        X_train = train_df.drop(['sex', 'female_dem'], axis=1)

        y_val = val_df['female_dem']
        X_val = val_df.drop(['sex', 'female_dem'], axis=1)
        
#   scale data    
#     X_train = scale(X_train)
#     X_val = scale(X_val)
    
    std_scaler = StandardScaler()
    X_train = std_scaler.fit_transform(X_train)
    X_val = std_scaler.fit_transform(X_val)
    
#     mm_scaler = MinMaxScaler()
#     X_train = mm_scaler.fit_transform(X_train)
#     X_val = mm_scaler.fit_transform(X_val)
    
    neighbors = list(range(1,16))
    accuracy = []
    preds = np.zeros((y_val.shape[0],1))
    preds_round = np.zeros((y_val.shape[0],1))
        
    for k in neighbors:
        classifier = KNN(n_neighbors=k)
        classifier.fit(X_train, y_train)
        k_pred = classifier.predict_proba(X_val)
        k_pred = np.delete(k_pred, 1, 1) 
        preds = np.concatenate((preds, k_pred), axis=1)
        k_pred_round = (classifier.predict(X_val)).reshape((y_val.shape[0],1))
        preds_round = np.concatenate((preds_round, k_pred_round), axis=1)
        accuracy.append(accuracy_score(y_val, k_pred_round))
        
    preds = np.delete(preds, 0, 1)
    preds_round = np.delete(preds_round, 0, 1)
    
    best_k = None
    best_accuracy = -math.inf
    
    for k, acc in enumerate(accuracy):
        if acc > best_accuracy:
            best_accuracy = acc
            best_k = k + 1  
    
    pred = preds[:,best_k]
    pred_round = preds_round[:,best_k]
    classifier = KNN(n_neighbors=best_k)
    classifier.fit(X_train, y_train)
        
    if test_df is not None:
        
        if indicator == 'female':
            y_test = test_df['sex']
            X_test = test_df.drop(['sex', 'female_dem'], axis=1)
    
        elif indicator == 'female_dem':
            y_test = test_df['female_dem']
            X_test = test_df.drop(['sex', 'female_dem'], axis=1)

        test_pred = classifier.predict_proba(X_test)
        test_pred = np.delete(test_pred, 1, 1)    
        test_pred_round = classifier.predict(X_test)
        
        return test_pred, accuracy_score(y_test, test_pred_round), best_k

    return pred, accuracy_score(y_val, pred_round), best_k

In [472]:
def classify_dtc(train_df, val_df, indicator, test_df=None):
    
    if indicator == 'female':
        y_train = train_df['sex']
        X_train = train_df.drop(['sex', 'female_dem'], axis=1)

        y_val = val_df['sex']
        X_val = val_df.drop(['sex', 'female_dem'], axis=1)

    elif indicator == 'female_dem':
        y_train = train_df['female_dem']
        X_train = train_df.drop(['sex', 'female_dem'], axis=1)

        y_val = val_df['female_dem']
        X_val = val_df.drop(['sex', 'female_dem'], axis=1)
        
#   scale data    
#     X_train = scale(X_train)
#     X_val = scale(X_val)
    
    std_scaler = StandardScaler()
    X_train = std_scaler.fit_transform(X_train)
    X_val = std_scaler.fit_transform(X_val)
    
#     mm_scaler = MinMaxScaler()
#     X_train = mm_scaler.fit_transform(X_train)
#     X_val = mm_scaler.fit_transform(X_val)
    
    classifier_gini = DTC(random_state=40)
    classifier_entropy = DTC(criterion='entropy', random_state=40)
    
    classifier_gini.fit(X_train, y_train)
    classifier_entropy.fit(X_train, y_train)
    
    pred_gini = classifier_gini.predict_proba(X_val)
    pred_gini = np.delete(pred_gini, 1, 1)    
    pred_entropy = classifier_entropy.predict_proba(X_val)
    pred_entropy = np.delete(pred_entropy, 1, 1)    
    pred_round_gini = classifier_gini.predict(X_val)
    pred_round_entropy = classifier_entropy.predict(X_val)
    
    accuracy_score_gini = accuracy_score(y_val, pred_round_gini)
    accuracy_score_entropy = accuracy_score(y_val, pred_round_entropy)
    
    classifer = None
    best_accuracy = None
    best_preds = None
    
    if accuracy_score_gini > accuracy_score_entropy:
        classifier = DTC(random_state=40)
        classifier.fit(X_train, y_train)

        best_accuracy = accuracy_score_gini
        best_preds = pred_round_gini
    
    else:
        classifier = DTC(criterion='entropy', random_state=40)
        classifier.fit(X_train, y_train)

        best_accuracy = accuracy_score_entropy
        best_preds = pred_round_entropy        
    
    if test_df is not None:
        
        if indicator == 'female':
            y_test = test_df['sex']
            X_test = test_df.drop(['sex', 'female_dem'], axis=1)
    
        elif indicator == 'female_dem':
            y_test = test_df['female_dem']
            X_test = test_df.drop(['sex', 'female_dem'], axis=1)

        test_pred = classifier.predict_proba(X_test)
        test_pred = np.delete(test_pred, 1, 1)    
        test_pred_round = classifier.predict(X_test)
        
        return test_pred, accuracy_score(y_test, test_pred_round)
        
    return best_preds, best_accuracy

In [473]:
# senate data val results, indicator = 'female'
senate_val_lr_female_preds = classify_lr(cleaned_train_senate, cleaned_val_senate, 'female')[0]
senate_val_lr_female_accuracy = classify_lr(cleaned_train_senate, cleaned_val_senate, 'female')[1]

senate_val_svm_female_preds = classify_svm(cleaned_train_senate, cleaned_val_senate, 'female')[0]
senate_val_svm_female_accuracy = classify_svm(cleaned_train_senate, cleaned_val_senate, 'female')[1]

senate_val_knn_female_preds = classify_knn(cleaned_train_senate, cleaned_val_senate, 'female')[0]
senate_val_knn_female_accuracy = classify_knn(cleaned_train_senate, cleaned_val_senate, 'female')[1]
senate_val_knn_female_bestk = classify_knn(cleaned_train_senate, cleaned_val_senate, 'female')[2]

senate_val_dtc_female_preds = classify_dtc(cleaned_train_senate, cleaned_val_senate, 'female')[0]
senate_val_dtc_female_accuracy = classify_dtc(cleaned_train_senate, cleaned_val_senate, 'female')[1]


# senate data val results, indicator = 'female_dem'
senate_val_lr_femaleDem_preds = classify_lr(cleaned_train_senate, cleaned_val_senate, 'female_dem')[0]
senate_val_lr_femaleDem_accuracy = classify_lr(cleaned_train_senate, cleaned_val_senate, 'female_dem')[1]

senate_val_svm_femaleDem_preds = classify_svm(cleaned_train_senate, cleaned_val_senate, 'female_dem')[0]
senate_val_svm_femaleDem_accuracy = classify_svm(cleaned_train_senate, cleaned_val_senate, 'female_dem')[1]

senate_val_knn_femaleDem_preds = classify_knn(cleaned_train_senate, cleaned_val_senate, 'female_dem')[0]
senate_val_knn_femaleDem_accuracy = classify_knn(cleaned_train_senate, cleaned_val_senate, 'female_dem')[1]
senate_val_knn_femaleDem_bestk = classify_knn(cleaned_train_senate, cleaned_val_senate, 'female_dem')[2]

senate_val_dtc_femaleDem_preds = classify_dtc(cleaned_train_senate, cleaned_val_senate, 'female_dem')[0]
senate_val_dtc_femaleDem_accuracy = classify_dtc(cleaned_train_senate, cleaned_val_senate, 'female_dem')[1]

# senate data test results, indicator = 'female'
senate_test_lr_female_preds = classify_lr(cleaned_train_senate, cleaned_val_senate, 'female', cleaned_test_senate)[0]
senate_test_lr_female_accuracy = classify_lr(cleaned_train_senate, cleaned_val_senate, 'female', cleaned_test_senate)[1]

senate_test_svm_female_preds = classify_svm(cleaned_train_senate, cleaned_val_senate, 'female', cleaned_test_senate)[0]
senate_test_svm_female_accuracy = classify_svm(cleaned_train_senate, cleaned_val_senate, 'female', cleaned_test_senate)[1]

senate_test_knn_female_preds = classify_knn(cleaned_train_senate, cleaned_val_senate, 'female', cleaned_test_senate)[0]
senate_test_knn_female_accuracy = classify_knn(cleaned_train_senate, cleaned_val_senate, 'female', cleaned_test_senate)[1]
senate_test_knn_female_bestk = classify_knn(cleaned_train_senate, cleaned_val_senate, 'female', cleaned_test_senate)[2]

senate_test_dtc_female_preds = classify_dtc(cleaned_train_senate, cleaned_val_senate, 'female', cleaned_test_senate)[0]
senate_test_dtc_female_accuracy = classify_dtc(cleaned_train_senate, cleaned_val_senate, 'female', cleaned_test_senate)[1]


# senate data test results, indicator = 'female_dem'
senate_test_lr_femaleDem_preds = classify_lr(cleaned_train_senate, cleaned_val_senate, 'female_dem', cleaned_test_senate)[0]
senate_test_lr_femaleDem_accuracy = classify_lr(cleaned_train_senate, cleaned_val_senate, 'female_dem', cleaned_test_senate)[1]

senate_test_svm_femaleDem_preds = classify_svm(cleaned_train_senate, cleaned_val_senate, 'female_dem', cleaned_test_senate)[0]
senate_test_svm_femaleDem_accuracy = classify_svm(cleaned_train_senate, cleaned_val_senate, 'female_dem', cleaned_test_senate)[1]

senate_test_knn_femaleDem_preds = classify_knn(cleaned_train_senate, cleaned_val_senate, 'female_dem', cleaned_test_senate)[0]
senate_test_knn_femaleDem_accuracy = classify_knn(cleaned_train_senate, cleaned_val_senate, 'female_dem', cleaned_test_senate)[1]
senate_test_knn_femaleDem_bestk = classify_knn(cleaned_train_senate, cleaned_val_senate, 'female_dem', cleaned_test_senate)[2]

senate_test_dtc_femaleDem_preds = classify_dtc(cleaned_train_senate, cleaned_val_senate, 'female_dem', cleaned_test_senate)[0]
senate_test_dtc_femaleDem_accuracy = classify_dtc(cleaned_train_senate, cleaned_val_senate, 'female_dem', cleaned_test_senate)[1]


  np.exp(prob, prob)


In [474]:
# house data val results, indicator = 'female'
house_val_lr_female_preds = classify_lr(cleaned_train_house, cleaned_val_house, 'female')[0]
house_val_lr_female_accuracy = classify_lr(cleaned_train_house, cleaned_val_house, 'female')[1]

house_val_svm_female_preds = classify_svm(cleaned_train_house, cleaned_val_house, 'female')[0]
house_val_svm_female_accuracy = classify_svm(cleaned_train_house, cleaned_val_house, 'female')[1]

house_val_knn_female_preds = classify_knn(cleaned_train_house, cleaned_val_house, 'female')[0]
house_val_knn_female_accuracy = classify_knn(cleaned_train_house, cleaned_val_house, 'female')[1]
house_val_knn_female_bestk = classify_knn(cleaned_train_house, cleaned_val_house, 'female')[2]

house_val_dtc_female_preds = classify_dtc(cleaned_train_house, cleaned_val_house, 'female')[0]
house_val_dtc_female_accuracy = classify_dtc(cleaned_train_house, cleaned_val_house, 'female')[1]


# house data val results, indicator = 'female_dem'
house_val_lr_femaleDem_preds = classify_lr(cleaned_train_house, cleaned_val_house, 'female_dem')[0]
house_val_lr_femaleDem_accuracy = classify_lr(cleaned_train_house, cleaned_val_house, 'female_dem')[1]

house_val_svm_femaleDem_preds = classify_svm(cleaned_train_house, cleaned_val_house, 'female_dem')[0]
house_val_svm_femaleDem_accuracy = classify_svm(cleaned_train_house, cleaned_val_house, 'female_dem')[1]

house_val_knn_femaleDem_preds = classify_knn(cleaned_train_house, cleaned_val_house, 'female_dem')[0]
house_val_knn_femaleDem_accuracy = classify_knn(cleaned_train_house, cleaned_val_house, 'female_dem')[1]
house_val_knn_femaleDem_bestk = classify_knn(cleaned_train_house, cleaned_val_house, 'female_dem')[2]

house_val_dtc_femaleDem_preds = classify_dtc(cleaned_train_house, cleaned_val_house, 'female_dem')[0]
house_val_dtc_femaleDem_accuracy = classify_dtc(cleaned_train_house, cleaned_val_house, 'female_dem')[1]

# house data test results, indicator = 'female'
house_test_lr_female_preds = classify_lr(cleaned_train_house, cleaned_val_house, 'female', cleaned_test_house)[0]
house_test_lr_female_accuracy = classify_lr(cleaned_train_house, cleaned_val_house, 'female', cleaned_test_house)[1]

house_test_svm_female_preds = classify_svm(cleaned_train_house, cleaned_val_house, 'female', cleaned_test_house)[0]
house_test_svm_female_accuracy = classify_svm(cleaned_train_house, cleaned_val_house, 'female', cleaned_test_house)[1]

house_test_knn_female_preds = classify_knn(cleaned_train_house, cleaned_val_house, 'female', cleaned_test_house)[0]
house_test_knn_female_accuracy = classify_knn(cleaned_train_house, cleaned_val_house, 'female', cleaned_test_house)[1]
house_test_knn_female_bestk = classify_knn(cleaned_train_house, cleaned_val_house, 'female', cleaned_test_house)[2]

house_test_dtc_female_preds = classify_dtc(cleaned_train_house, cleaned_val_house, 'female', cleaned_test_house)[0]
house_test_dtc_female_accuracy = classify_dtc(cleaned_train_house, cleaned_val_house, 'female', cleaned_test_house)[1]


# house data test results, indicator = 'female_dem'
house_test_lr_femaleDem_preds = classify_lr(cleaned_train_house, cleaned_val_house, 'female_dem', cleaned_test_house)[0]
house_test_lr_femaleDem_accuracy = classify_lr(cleaned_train_house, cleaned_val_house, 'female_dem', cleaned_test_house)[1]

house_test_svm_femaleDem_preds = classify_svm(cleaned_train_house, cleaned_val_house, 'female_dem', cleaned_test_house)[0]
house_test_svm_femaleDem_accuracy = classify_svm(cleaned_train_house, cleaned_val_house, 'female_dem', cleaned_test_house)[1]

house_test_knn_femaleDem_preds = classify_knn(cleaned_train_house, cleaned_val_house, 'female_dem', cleaned_test_house)[0]
house_test_knn_femaleDem_accuracy = classify_knn(cleaned_train_house, cleaned_val_house, 'female_dem', cleaned_test_house)[1]
house_test_knn_femaleDem_bestk = classify_knn(cleaned_train_house, cleaned_val_house, 'female_dem', cleaned_test_house)[2]

house_test_dtc_femaleDem_preds = classify_dtc(cleaned_train_house, cleaned_val_house, 'female_dem', cleaned_test_house)[0]
house_test_dtc_femaleDem_accuracy = classify_dtc(cleaned_train_house, cleaned_val_house, 'female_dem', cleaned_test_house)[1]

  np.exp(prob, prob)


In [475]:
# merged data val results, indicator = 'female'
merged_val_lr_female_preds = classify_lr(cleaned_train_merged, cleaned_val_merged, 'female')[0]
merged_val_lr_female_accuracy = classify_lr(cleaned_train_merged, cleaned_val_merged, 'female')[1]

merged_val_svm_female_preds = classify_svm(cleaned_train_merged, cleaned_val_merged, 'female')[0]
merged_val_svm_female_accuracy = classify_svm(cleaned_train_merged, cleaned_val_merged, 'female')[1]

merged_val_knn_female_preds = classify_knn(cleaned_train_merged, cleaned_val_merged, 'female')[0]
merged_val_knn_female_accuracy = classify_knn(cleaned_train_merged, cleaned_val_merged, 'female')[1]
merged_val_knn_female_bestk = classify_knn(cleaned_train_merged, cleaned_val_merged, 'female')[2]

merged_val_dtc_female_preds = classify_dtc(cleaned_train_merged, cleaned_val_merged, 'female')[0]
merged_val_dtc_female_accuracy = classify_dtc(cleaned_train_merged, cleaned_val_merged, 'female')[1]


# merged data val results, indicator = 'female_dem'
merged_val_lr_femaleDem_preds = classify_lr(cleaned_train_merged, cleaned_val_merged, 'female_dem')[0]
merged_val_lr_femaleDem_accuracy = classify_lr(cleaned_train_merged, cleaned_val_merged, 'female_dem')[1]

merged_val_svm_femaleDem_preds = classify_svm(cleaned_train_merged, cleaned_val_merged, 'female_dem')[0]
merged_val_svm_femaleDem_accuracy = classify_svm(cleaned_train_merged, cleaned_val_merged, 'female_dem')[1]

merged_val_knn_femaleDem_preds = classify_knn(cleaned_train_merged, cleaned_val_merged, 'female_dem')[0]
merged_val_knn_femaleDem_accuracy = classify_knn(cleaned_train_merged, cleaned_val_merged, 'female_dem')[1]
merged_val_knn_femaleDem_bestk = classify_knn(cleaned_train_merged, cleaned_val_merged, 'female_dem')[2]

merged_val_dtc_femaleDem_preds = classify_dtc(cleaned_train_merged, cleaned_val_merged, 'female_dem')[0]
merged_val_dtc_femaleDem_accuracy = classify_dtc(cleaned_train_merged, cleaned_val_merged, 'female_dem')[1]

# merged data test results, indicator = 'female'
merged_test_lr_female_preds = classify_lr(cleaned_train_merged, cleaned_val_merged, 'female', cleaned_test_merged)[0]
merged_test_lr_female_accuracy = classify_lr(cleaned_train_merged, cleaned_val_merged, 'female', cleaned_test_merged)[1]

merged_test_svm_female_preds = classify_svm(cleaned_train_merged, cleaned_val_merged, 'female', cleaned_test_merged)[0]
merged_test_svm_female_accuracy = classify_svm(cleaned_train_merged, cleaned_val_merged, 'female', cleaned_test_merged)[1]

merged_test_knn_female_preds = classify_knn(cleaned_train_merged, cleaned_val_merged, 'female', cleaned_test_merged)[0]
merged_test_knn_female_accuracy = classify_knn(cleaned_train_merged, cleaned_val_merged, 'female', cleaned_test_merged)[1]
merged_test_knn_female_bestk = classify_knn(cleaned_train_merged, cleaned_val_merged, 'female', cleaned_test_merged)[2]

merged_test_dtc_female_preds = classify_dtc(cleaned_train_merged, cleaned_val_merged, 'female', cleaned_test_merged)[0]
merged_test_dtc_female_accuracy = classify_dtc(cleaned_train_merged, cleaned_val_merged, 'female', cleaned_test_merged)[1]


# merged data test results, indicator = 'female_dem'
merged_test_lr_femaleDem_preds = classify_lr(cleaned_train_merged, cleaned_val_merged, 'female_dem', cleaned_test_merged)[0]
merged_test_lr_femaleDem_accuracy = classify_lr(cleaned_train_merged, cleaned_val_merged, 'female_dem', cleaned_test_merged)[1]

merged_test_svm_femaleDem_preds = classify_svm(cleaned_train_merged, cleaned_val_merged, 'female_dem', cleaned_test_merged)[0]
merged_test_svm_femaleDem_accuracy = classify_svm(cleaned_train_merged, cleaned_val_merged, 'female_dem', cleaned_test_merged)[1]

merged_test_knn_femaleDem_preds = classify_knn(cleaned_train_merged, cleaned_val_merged, 'female_dem', cleaned_test_merged)[0]
merged_test_knn_femaleDem_accuracy = classify_knn(cleaned_train_merged, cleaned_val_merged, 'female_dem', cleaned_test_merged)[1]
merged_test_knn_femaleDem_bestk = classify_knn(cleaned_train_merged, cleaned_val_merged, 'female_dem', cleaned_test_merged)[2]

merged_test_dtc_femaleDem_preds = classify_dtc(cleaned_train_merged, cleaned_val_merged, 'female_dem', cleaned_test_merged)[0]
merged_test_dtc_femaleDem_accuracy = classify_dtc(cleaned_train_merged, cleaned_val_merged, 'female_dem', cleaned_test_merged)[1]

  np.exp(prob, prob)


In [497]:
# merged data, test on senate results, indicator = 'female'
merged_test_senate_lr_female_preds, merged_test_senate_lr_female_accuracy = classify_lr(cleaned_train_merged, cleaned_val_merged, 'female', cleaned_test_senate)

merged_test_senate_svm_female_preds, merged_test_senate_svm_female_accuracy= classify_svm(cleaned_train_merged, cleaned_val_merged, 'female', cleaned_test_senate)

merged_test_senate_knn_female_preds, merged_test_senate_knn_female_accuracy, merged_test_senate_knn_female_bestk = classify_knn(cleaned_train_merged, cleaned_val_merged, 'female', cleaned_test_senate)

merged_test_senate_dtc_female_preds, merged_test_senate_dtc_female_accuracy = classify_dtc(cleaned_train_merged, cleaned_val_merged, 'female', cleaned_test_senate)


# merged data test on senate results, indicator = 'female_dem'
merged_test_senate_lr_femaleDem_preds, merged_test_senate_lr_femaleDem_accuracy = classify_lr(cleaned_train_merged, cleaned_val_merged, 'female_dem', cleaned_test_senate)

merged_test_senate_svm_femaleDem_preds, merged_test_senate_svm_femaleDem_accuracy  = classify_svm(cleaned_train_merged, cleaned_val_merged, 'female_dem', cleaned_test_senate)

merged_test_senate_knn_femaleDem_preds, merged_test_senate_knn_femaleDem_accuracy, merged_test_senate_knn_femaleDem_bestk = classify_knn(cleaned_train_merged, cleaned_val_merged, 'female_dem', cleaned_test_senate)

merged_test_senate_dtc_femaleDem_preds, merged_test_senate_dtc_femaleDem_accuracy = classify_dtc(cleaned_train_merged, cleaned_val_merged, 'female_dem', cleaned_test_senate)


# merged data test on house results, indicator = 'female'
merged_test_house_lr_female_preds, merged_test_house_lr_female_accuracy = classify_lr(cleaned_train_merged, cleaned_val_merged, 'female', cleaned_test_house)

merged_test_house_svm_female_preds, merged_test_house_svm_female_accuracy = classify_svm(cleaned_train_merged, cleaned_val_merged, 'female', cleaned_test_house)

merged_test_house_knn_female_preds, merged_test_house_knn_female_accuracy, merged_test_house_knn_female_bestk = classify_knn(cleaned_train_merged, cleaned_val_merged, 'female', cleaned_test_house)

merged_test_house_dtc_female_preds, merged_test_house_dtc_female_accuracy = classify_dtc(cleaned_train_merged, cleaned_val_merged, 'female', cleaned_test_house)


# merged data test on house results, indicator = 'female_dem'
merged_test_house_lr_femaleDem_preds, merged_test_house_lr_femaleDem_accuracy = classify_lr(cleaned_train_merged, cleaned_val_merged, 'female_dem', cleaned_test_house)

merged_test_house_svm_femaleDem_preds, merged_test_house_svm_femaleDem_accuracy = classify_svm(cleaned_train_merged, cleaned_val_merged, 'female_dem', cleaned_test_house)

merged_test_house_knn_femaleDem_preds, merged_test_house_knn_femaleDem_accuracy, merged_test_house_knn_femaleDem_bestk = classify_knn(cleaned_train_merged, cleaned_val_merged, 'female_dem', cleaned_test_house)

merged_test_house_dtc_femaleDem_preds, merged_test_house_dtc_femaleDem_accuracy = classify_dtc(cleaned_train_merged, cleaned_val_merged, 'female_dem', cleaned_test_house)

  np.exp(prob, prob)


[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  

In [493]:
senate_district_num_dict = {}
house_district_num_dict = {}
district_num_dict = {}

for key, value in senate_name_dict.items():
    senate_district_num_dict[value] = key

for key, value in senate_name_dict.items():
    senate_district_num_dict[value] = key

for key, value in name_dict.items():
    district_num_dict[value] = key
    

sen_pred_df = cleaned_test_senate.copy()
print(len(sen_pred_df))
print(len(merged_test_senate_lr_female_preds))

sen_pred_df['Probability_Female'] = 0 
sen_pred_df['Probability_Female_Dem'] = 0 
sen_pred_df = sen_pred_df[['District','Probability_Female', 'Probability_Female_Dem']]
for i, row in sen_pred_df.iterrows():
    sen_pred_df.set_value(i, 'Probability_Female', merged_test_senate_lr_female_preds[i])
    sen_pred_df.set_value(i, 'Probability_Female_Dem', merged_test_senate_lr_femaleDem_preds[i])

print(sen_pred_df)

85
85
    District  Probability_Female  Probability_Female_Dem
0         40                   1                       1
1          1                   1                       1
2          1                   1                       1
3          2                   1                       1
4          2                   1                       1
5         41                   1                       1
6         41                   1                       1
7          3                   1                       1
8          3                   1                       1
9          4                   1                       1
10         4                   1                       1
11         5                   1                       1
12         5                   1                       1
13         6                   1                       1
14         6                   1                       1
15         7                   1                       1
16         7             

In [478]:
# Senate accuracies

print('Senate, val, indicator=female, LR: ' + str(senate_val_lr_female_accuracy))
print('Senate, val, indicator=female, SVM: ' + str(senate_val_svm_female_accuracy))
print('Senate, val, indicator=female, KNN: ' + str(senate_val_knn_female_accuracy))
print('Senate, val, indicator=female, DTC: ' + str(senate_val_dtc_female_accuracy))
print('\n')
print('Senate, val, indicator=female_dem, LR: ' + str(senate_val_lr_femaleDem_accuracy))
print('Senate, val, indicator=female_dem, SVM: ' + str(senate_val_svm_femaleDem_accuracy))
print('Senate, val, indicator=female_dem, KNN: ' + str(senate_val_knn_femaleDem_accuracy))
print('Senate, val, indicator=female_dem, DTC: ' + str(senate_val_dtc_femaleDem_accuracy))

print('\n')

print('Senate, test, indicator=female, LR: ' + str(senate_test_lr_female_accuracy))
print('Senate, test, indicator=female, SVM: ' + str(senate_test_svm_female_accuracy))
print('Senate, test, indicator=female, KNN: ' + str(senate_test_knn_female_accuracy))
print('Senate, test, indicator=female, DTC: ' + str(senate_test_dtc_female_accuracy))
print('\n')
print('Senate, test, indicator=female_dem, LR: ' + str(senate_test_lr_femaleDem_accuracy))
print('Senate, test, indicator=female_dem, SVM: ' + str(senate_test_svm_femaleDem_accuracy))
print('Senate, test, indicator=female_dem, KNN: ' + str(senate_test_knn_femaleDem_accuracy))
print('Senate, test, indicator=female_dem, DTC: ' + str(senate_test_dtc_femaleDem_accuracy))

Senate, val, indicator=female, LR: 0.719512195122
Senate, val, indicator=female, SVM: 0.743902439024
Senate, val, indicator=female, KNN: 0.817073170732
Senate, val, indicator=female, DTC: 0.719512195122


Senate, val, indicator=female_dem, LR: 0.719512195122
Senate, val, indicator=female_dem, SVM: 0.743902439024
Senate, val, indicator=female_dem, KNN: 0.817073170732
Senate, val, indicator=female_dem, DTC: 0.719512195122


Senate, test, indicator=female, LR: 0.705882352941
Senate, test, indicator=female, SVM: 0.717647058824
Senate, test, indicator=female, KNN: 0.305882352941
Senate, test, indicator=female, DTC: 0.282352941176


Senate, test, indicator=female_dem, LR: 0.705882352941
Senate, test, indicator=female_dem, SVM: 0.717647058824
Senate, test, indicator=female_dem, KNN: 0.305882352941
Senate, test, indicator=female_dem, DTC: 0.282352941176


In [479]:
# House accuracies

print('House, val, indicator=female, LR: ' + str(house_val_lr_female_accuracy))
print('House, val, indicator=female, SVM: ' + str(house_val_svm_female_accuracy))
print('House, val, indicator=female, KNN: ' + str(house_val_knn_female_accuracy))
print('House, val, indicator=female, DTC: ' + str(house_val_dtc_female_accuracy))
print('\n')
print('House, val, indicator=female_dem, LR: ' + str(house_val_lr_femaleDem_accuracy))
print('House, val, indicator=female_dem, SVM: ' + str(house_val_svm_femaleDem_accuracy))
print('House, val, indicator=female_dem, KNN: ' + str(house_val_knn_femaleDem_accuracy))
print('House, val, indicator=female_dem, DTC: ' + str(house_val_dtc_femaleDem_accuracy))

print('\n')

print('House, test, indicator=female, LR: ' + str(house_test_lr_female_accuracy))
print('House, test, indicator=female, SVM: ' + str(house_test_svm_female_accuracy))
print('House, test, indicator=female, KNN: ' + str(house_test_knn_female_accuracy))
print('House, test, indicator=female, DTC: ' + str(house_test_dtc_female_accuracy))
print('\n')
print('House, test, indicator=female_dem, LR: ' + str(house_test_lr_femaleDem_accuracy))
print('House, test, indicator=female_dem, SVM: ' + str(house_test_svm_femaleDem_accuracy))
print('House, test, indicator=female_dem, KNN: ' + str(house_test_knn_femaleDem_accuracy))
print('House, test, indicator=female_dem, DTC: ' + str(house_test_dtc_femaleDem_accuracy))

House, val, indicator=female, LR: 0.78488372093
House, val, indicator=female, SVM: 0.78488372093
House, val, indicator=female, KNN: 0.854651162791
House, val, indicator=female, DTC: 0.755813953488


House, val, indicator=female_dem, LR: 0.843023255814
House, val, indicator=female_dem, SVM: 0.837209302326
House, val, indicator=female_dem, KNN: 0.895348837209
House, val, indicator=female_dem, DTC: 0.813953488372


House, test, indicator=female, LR: 0.734939759036
House, test, indicator=female, SVM: 0.765060240964
House, test, indicator=female, KNN: 0.765060240964
House, test, indicator=female, DTC: 0.234939759036


House, test, indicator=female_dem, LR: 0.78313253012
House, test, indicator=female_dem, SVM: 0.825301204819
House, test, indicator=female_dem, KNN: 0.825301204819
House, test, indicator=female_dem, DTC: 0.475903614458


In [480]:
# Merged accuracies

print('Merged, val, indicator=female, LR: ' + str(merged_val_lr_female_accuracy))
print('Merged, val, indicator=female, SVM: ' + str(merged_val_svm_female_accuracy))
print('Merged, val, indicator=female, KNN: ' + str(merged_val_knn_female_accuracy))
print('Merged, val, indicator=female, DTC: ' + str(merged_val_dtc_female_accuracy))
print('\n')
print('Merged, val, indicator=female_dem, LR: ' + str(merged_val_lr_femaleDem_accuracy))
print('Merged, val, indicator=female_dem, SVM: ' + str(merged_val_svm_femaleDem_accuracy))
print('Merged, val, indicator=female_dem, KNN: ' + str(merged_val_knn_femaleDem_accuracy))
print('Merged, val, indicator=female_dem, DTC: ' + str(merged_val_dtc_femaleDem_accuracy))

print('\n')

print('Merged, test, indicator=female, LR: ' + str(merged_test_lr_female_accuracy))
print('Merged, test, indicator=female, SVM: ' + str(merged_test_svm_female_accuracy))
print('Merged, test, indicator=female, KNN: ' + str(merged_test_knn_female_accuracy))
print('Merged, test, indicator=female, DTC: ' + str(merged_test_dtc_female_accuracy))
print('\n')
print('Merged, test, indicator=female_dem, LR: ' + str(merged_test_lr_femaleDem_accuracy))
print('Merged, test, indicator=female_dem, SVM: ' + str(merged_test_svm_femaleDem_accuracy))
print('Merged, test, indicator=female_dem, KNN: ' + str(merged_test_knn_femaleDem_accuracy))
print('Merged, test, indicator=female_dem, DTC: ' + str(merged_test_dtc_femaleDem_accuracy))

Merged, val, indicator=female, LR: 0.759842519685
Merged, val, indicator=female, SVM: 0.763779527559
Merged, val, indicator=female, KNN: 0.854330708661
Merged, val, indicator=female, DTC: 0.692913385827


Merged, val, indicator=female_dem, LR: 0.665354330709
Merged, val, indicator=female_dem, SVM: 0.688976377953
Merged, val, indicator=female_dem, KNN: 0.775590551181
Merged, val, indicator=female_dem, DTC: 0.629921259843


Merged, test, indicator=female, LR: 0.749003984064
Merged, test, indicator=female, SVM: 0.749003984064
Merged, test, indicator=female, KNN: 0.573705179283
Merged, test, indicator=female, DTC: 0.745019920319


Merged, test, indicator=female_dem, LR: 0.649402390438
Merged, test, indicator=female_dem, SVM: 0.649402390438
Merged, test, indicator=female_dem, KNN: 0.414342629482
Merged, test, indicator=female_dem, DTC: 0.350597609562


In [579]:
print('Merged, test on senate, indicator=female, LR: ' + str(merged_test_senate_lr_female_accuracy))
print('Merged, test on senate, indicator=female, SVM: ' + str(merged_test_senate_svm_female_accuracy))
print('Merged, test on senate, indicator=female, KNN: ' + str(merged_test_senate_knn_female_accuracy))
print('Merged, test on senate, indicator=female, DTC: ' + str(merged_test_senate_dtc_female_accuracy))
print('\n')
print('Merged, test on senate, indicator=female_dem, LR: ' + str(merged_test_senate_lr_femaleDem_accuracy))
print('Merged, test on senate, indicator=female_dem, SVM: ' + str(merged_test_senate_svm_femaleDem_accuracy))
print('Merged, test on senate, indicator=female_dem, KNN: ' + str(merged_test_senate_knn_femaleDem_accuracy))
print('Merged, test on senate, indicator=female_dem, DTC: ' + str(merged_test_senate_dtc_femaleDem_accuracy))

Merged, test on senate, indicator=female, LR: 0.759842519685
Merged, test on senate, indicator=female, SVM: 0.717647058824
Merged, test on senate, indicator=female, KNN: 0.623529411765
Merged, test on senate, indicator=female, DTC: 0.717647058824


Merged, test on senate, indicator=female_dem, LR: 0.717647058824
Merged, test on senate, indicator=female_dem, SVM: 0.717647058824
Merged, test on senate, indicator=female_dem, KNN: 0.388235294118
Merged, test on senate, indicator=female_dem, DTC: 0.282352941176


In [482]:
print('Merged, test on house, indicator=female, LR: ' + str(merged_test_house_lr_female_accuracy))
print('Merged, test on house, indicator=female, SVM: ' + str(merged_test_house_svm_female_accuracy))
print('Merged, test on house, indicator=female, KNN: ' + str(merged_test_house_knn_female_accuracy))
print('Merged, test on house, indicator=female, DTC: ' + str(merged_test_house_dtc_female_accuracy))
print('\n')
print('Merged, test on house, indicator=female_dem, LR: ' + str(merged_test_house_lr_femaleDem_accuracy))
print('Merged, test on house, indicator=female_dem, SVM: ' + str(merged_test_house_svm_femaleDem_accuracy))
print('Merged, test on house, indicator=female_dem, KNN: ' + str(merged_test_house_knn_femaleDem_accuracy))
print('Merged, test on house, indicator=female_dem, DTC: ' + str(merged_test_house_dtc_femaleDem_accuracy))

Merged, test on house, indicator=female, LR: 0.765060240964
Merged, test on house, indicator=female, SVM: 0.765060240964
Merged, test on house, indicator=female, KNN: 0.55421686747
Merged, test on house, indicator=female, DTC: 0.759036144578


Merged, test on house, indicator=female_dem, LR: 0.825301204819
Merged, test on house, indicator=female_dem, SVM: 0.825301204819
Merged, test on house, indicator=female_dem, KNN: 0.39156626506
Merged, test on house, indicator=female_dem, DTC: 0.174698795181


In [592]:
senate_district_num_dict = {}
house_district_num_dict = {}
district_num_dict = {}

for key, value in senate_name_dict.items():
    senate_district_num_dict[value] = key

for key, value in house_name_dict.items():
    house_district_num_dict[value] = key

for key, value in name_dict.items():
    district_num_dict[value] = key
    
def replace_district(x, type=None):
    if type == "senate":
        return senate_district_num_dict[x]
    elif type == "house":
        return house_district_num_dict[x]
    else:
        return district_num_dict[x]


sen_pred_df = cleaned_val_senate.copy()
sen_pred_df['Probability_Female'] = 0.0
sen_pred_df['Probability_Female_Dem'] = 0.0

sen_pred_df = sen_pred_df[['District','Probability_Female', 'Probability_Female_Dem']]
for i, row in sen_pred_df.iterrows():
    sen_pred_df.set_value(i, 'Probability_Female', senate_val_lr_female_preds[i])
    sen_pred_df.set_value(i, 'Probability_Female_Dem', senate_val_lr_femaleDem_preds[i])


sen_pred_df.District = sen_pred_df.District.apply(lambda x: replace_district(x, 'senate'))

sen_pred_df = sen_pred_df.groupby('District').mean().reset_index()


house_pred_df = cleaned_val_house.copy()
house_pred_df['Probability_Female'] = 0.0
house_pred_df['Probability_Female_Dem'] = 0.0

house_pred_df = house_pred_df[['District','Probability_Female', 'Probability_Female_Dem']]
for i, row in house_pred_df.iterrows():
    house_pred_df.set_value(i, 'Probability_Female', house_val_lr_female_preds[i])
    house_pred_df.set_value(i, 'Probability_Female_Dem', house_val_lr_femaleDem_preds[i])

house_pred_df.District = house_pred_df.District.apply(lambda x: replace_district(x, 'house'))

house_pred_df = house_pred_df.groupby('District').mean().reset_index()

In [593]:
sen_pred_df.to_csv('predictions/senate_pred.csv', index=False)
house_pred_df.to_csv('predictions/house_pred.csv', index=False)