In [91]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from pandas.core import datetools
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNN
from scipy import stats
from sklearn.model_selection import train_test_split as tt_split
from sklearn.feature_selection import RFE
from sklearn.feature_selection import chi2
import csv

In [4]:
def read_dataset(path):
    data = pd.read_csv(path)
    return data

In [5]:
senate_name_dict = {}
house_name_dict = {}
name_dict = {}
def enumerate_districts(df, type=None):
    names = df['District']
    count = 0
    for name in names:
        if type == "Senate":
            if name not in senate_name_dict:
                senate_name_dict[name] = count
                count += 1
        elif type == "House":
            if name not in house_name_dict:
                house_name_dict[name] = count
                count += 1
        else:
            if name not in name_dict:
                name_dict[name] = count
                count += 1

In [6]:
def replace_district(x, type=None):
    if type == "Senate":
        return senate_name_dict[x]
    elif type == "House":
        return house_name_dict[x]
    else:
        return name_dict[x]

In [80]:
# this will be different than historical_model
def clean_data(df, type=None):
    # make all text binary
    df = df.drop("name", axis=1)
    df = df.drop("Unnamed: 0", axis=1)
    df['sex'].replace('f', 1, inplace=True)
    df['sex'].replace('m', 0, inplace=True)
    df['party'].replace('Democratic', 1, inplace=True)
    df['party'].replace('Republican', 0, inplace=True)
    
    # fill NaN's with mean from column
    df['sex'] = df['sex'].fillna(round(df['sex'].mean()))
    df['party'] = df['party'].fillna(df['party'].mean())
    df['Amount'] = df['Amount'].fillna(df['Amount'].mean())
    df['District'] = df["District"].apply(lambda x: replace_district(x, type))
    
    for col in df.columns:
        if col == "vote_count":
            continue
        if df[col].dtype != float and df[col].dtype != int:
            length = len(list(df[col]))
            temp_list = list(df[col])
            for i in range(length):
                
                if "," in str(temp_list[i]):
                    df[col] = df[col].apply(lambda x: str(x).replace(",", "").replace('nan', 'NaN')).astype(float)
    
    df['vote_count'] = df['vote_count'].apply(lambda x: str(x).replace(",", "").replace('nan', 'NaN')).astype(float)
    df['vote_count'] = df['vote_count'].fillna(df['vote_count'].mean())
    df['vote_percent'] = df['vote_percent'].fillna(df['vote_percent'].mean())
    #df['District'] = df["District"].apply(lambda x: replace_district(x, type))
    
    # add indicator for female democrat
    df['female_dem'] = 0
    for index, row in df.iterrows():
        if row.sex == 1 and row.party == 1:
            df.set_value(index, 'female_dem', 1)
            
    # remove "(percent) margin of error" columns
    df = df.iloc[:, [index for index, x in enumerate(df.columns) if 'Margin' not in x]]
    
    # remove columns with low percent contributions
    percent_cols = [col for index, col in enumerate(df.columns) if 'Percent' in col and df[col].mean() < 0.05]
    for col in percent_cols:
        df = df.drop(col, axis=1)
        df = df.drop(col.replace("Percent", "Estimate"), axis=1)
        
    return df

In [8]:
def independent_columns(A, tol =0): #= 1e-05):
    Q, R = np.linalg.qr(A)
    independent = np.where(np.abs(R.diagonal()) > tol)[0]
    
    return independent

In [81]:
def logistic_regression(train_df):
    
    
    loo = LeaveOneOut()
    y_train = train_df['female_dem']
    X_train = train_df.drop(['sex', 'female_dem', 'party'], axis=1)
    
    X_array = X_train.values
    y_array = y_train.values
    y_true = []
    y_pred_acc = []
    all_coefs = []
    
    logreg = LR()
    rfe = RFE(logreg, 30)
    rfe = rfe.fit(X_array, y_array)
    cool = rfe.support_
    newX = []
    newX_test = []
    features = []
    count = 0
    for col in X_train.columns:
        if cool[count]:
            features += [col]
        count+= 1
    for i in range(len(X_array)):
        temp = []
        for j in range(len(cool)):
            if cool[j]:
                temp += [X_array[i][j]]
        if len(temp)>0:
            newX += [temp]
    newX = np.array(newX)
    all_good_features = []
    sig_features_dict = {}
    for i in features:
        sig_features_dict[i] = 0
    for train_index, test_index in loo.split(newX):
        X_tr, X_te = newX[train_index], newX[test_index]
        y_tr, y_test = y_array[train_index], y_array[test_index]
        
        classifier = LR()
        
        
        chi, pval = chi2(X_tr, y_tr)
        count = 0
        good_features = []

        for i in range(len(pval)):
            if pval[i] < .05:
                count+=1
                sig_features_dict[features[i]] += pval[i]
        classifier.fit(X_tr, y_tr)
        pred = classifier.predict(X_te)
        
        y_true += [y_test[0]]
        y_pred_acc += [pred]

    for i in sig_features_dict:
        sig_features_dict[i] = sig_features_dict[i] / 27
    
#     significant_features = pd.DataFrame.from_dict(sig_features_dict)
    print("Test accuracy: {}".format(accuracy_score(y_true, y_pred_acc)))
    return sig_features_dict, accuracy_score(y_true, y_pred_acc)

In [105]:
def classify_glm(train_df):
    
    
    loo = LeaveOneOut()
    y_train = train_df['sex']
    X_train = train_df.drop('sex', axis=1)
    

    
    independent = independent_columns(X_train)
    X = X_train.iloc[:, independent]
    print("Rank is {}".format(X_train.shape[1]))
    
    X_array = X.values
    y_array = y_train.values
    y_true = []
    y_pred_acc = []
    

    for train_index, test_index in loo.split(X_array):
        X_tr, X_te = X_array[train_index], X_array[test_index]
        y_tr, y_test = y_array[train_index], y_array[test_index]

        classifier = sm.GLM(y_tr, X_tr)
        results = classifier.fit()
        print(results.summary())
        pred = round(results.predict(X_te)[0])
        y_true += [y_test[0]]
        y_pred_acc += [pred]
    return accuracy_score(y_true, y_pred_acc)

In [99]:
senate_data = read_dataset("VoteBuilder_and_all_data_Senate.csv")
enumerate_districts(senate_data, "Senate")
    
cleaned_senate = clean_data(senate_data,"Senate")

house_data = read_dataset("VoteBuilder_and_all_data_House.csv")
enumerate_districts(house_data, "House")
    
cleaned_house = clean_data(house_data,"House")


print("Senate Logistic Regression: ")
senate_features, accuracy = logistic_regression(cleaned_senate)
print(senate_features)
# print("OLS Generalized Linear Model: ")
# print(classify_glm(cleaned_senate))

print("House Logistic Regression: ")
house_features, house_accuracy = logistic_regression(cleaned_house)




Senate Logistic Regression: 
Test accuracy: 0.7777777777777778
{'UnEn_Caucasian_high': 0.0, 'Dem_Female': 0.0, 'Total_Voters': 0.0, 'Estimate; SEX AND AGE - Total population': 3.8917654425643231e-16, 'Percent; SEX AND AGE - Total population': 3.8917654425643231e-16, 'Estimate; SEX AND AGE - Total population - Female': 0.00028498324606574335, 'Estimate; SEX AND AGE - 5 to 9 years': 0.0022745215111249512, 'Estimate; SEX AND AGE - 20 to 24 years': 0.0, 'Estimate; SEX AND AGE - 35 to 44 years': 0.0022281353616797756, 'Estimate; SEX AND AGE - 55 to 59 years': 0.0066829445410983412, 'Estimate; SEX AND AGE - 18 years and over': 5.1780965900098242e-14, 'Estimate; SEX AND AGE - 62 years and over': 3.4825161695678315e-163, 'Estimate; SEX AND AGE - 65 years and over': 1.4941721723457067e-165, 'Estimate; SEX AND AGE - 18 years and over.1': 5.1780965900098242e-14, 'Percent; SEX AND AGE - 18 years and over.1': 5.1780965900098242e-14, 'Estimate; SEX AND AGE - 18 years and over - Male': 2.564148709555

In [106]:
# print("OLS Generalized Linear Model: ")
# print(classify_glm(cleaned_senate))

OLS Generalized Linear Model: 
Rank is 346
                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:                   26
Model:                            GLM   Df Residuals:                        3
Model Family:                Gaussian   Df Model:                           22
Link Function:               identity   Scale:                 0.0509766701497
Method:                          IRLS   Log-Likelihood:                 29.874
Date:                Sun, 10 Dec 2017   Deviance:                      0.15293
Time:                        22:00:48   Pearson chi2:                    0.153
No. Iterations:                     4                                         
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1             0.0313      0.032      0.965      0.334        -0.032     0.095
x2       

In [96]:
with open('senate_features.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in senate_features.items():
        writer.writerow([key, value])

In [97]:
with open('house_features.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in house_features.items():
        writer.writerow([key, value])