In [1]:
import os
import zipfile
import requests
import pandas as pd

from sqlalchemy import create_engine
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [2]:
# import data from PGSQL
engine=create_engine('postgresql://all_user:password@database.cvvbordirasf.us-east-1.rds.amazonaws.com:5432/capstone')

table_names=engine.table_names()
print (table_names)


['team_member', 'candidate_summary_join', 'census_bus_employee', 'census_social_0617', 'committee_linkage_join', 'master_join', 'master_join1', 'fec_summary', 'fec_operating_expenditure', 'cq_race', 'cq_candidates_race', 'census_soc', 'individual_contribution_join_abbreviated', 'master_join2', 'candidate_join_abbreviated', 'fec_committee', 'fec_candidate', 'test_join', 'investigate', 'cross_walk', 'fec_candidate_committee_linkage', 'fec_individual_contribution', 'cq_race_incumbent', 'fec_committee_to_candidate_contribution', 'fec_committee_to_committee_contribution', 'fec_committee_2012', 'fec_table_join', 'candidate_join', 'fec_join', 'individual_contribution_join', 'test_join_win']


In [3]:
con=engine.connect()
rs1=con.execute('SELECT * FROM master_join2')

master_join2 = pd.DataFrame(rs1.fetchall())
df = pd.DataFrame(master_join2)
master_join2.columns = rs1.keys()

con.close()

In [4]:
# Transform data to numerical data 

import numpy as np

def handle_non_numerical_data(df):
    columns = df.columns.values
    
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]
        
        if df[column].dtype !=np.int64 and df[column].dtype !=np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x=0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x+=1
                    
            df[column] = list(map(convert_to_int,df[column]))
            
    return df

df = handle_non_numerical_data(df)
print(df.head())

   race_uid  state  year  districtnumber  redistricteddate  totalvote  \
0       482     35     0               0                 1        486   
1       220     35     1               0                 1        420   
2       220     35     1               0                 1        420   
3       482     35     0               0                 1        486   
4       220     35     1               0                 1        420   

   winingplurality  winningparty  incumbentwin  race_uid2      ...       \
0              558             0             1        482      ...        
1              849             0             1        220      ...        
2              849             0             1        220      ...        
3              558             0             1        482      ...        
4              849             0             1        220      ...        

   other_loans  cand_loan_repay  other_loan_repay  debts_owed_by  \
0            4              185           

In [5]:
# define range for DataFrame
dataset=df[['incumbentstatus', 'candidateparty', 'medianage','white','black','asian','hispanicorlatino','unemploymentrate',
            'agriculture','manufacturing','finance','information','house_median','medianhouseholdincome','meanhouseholdincome','highschoolorhigher',
           'bachelororhigher','result']]

In [6]:
dataset=dataset.dropna()

In [7]:
dataset.head()

Unnamed: 0,incumbentstatus,candidateparty,medianage,white,black,asian,hispanicorlatino,unemploymentrate,agriculture,manufacturing,finance,information,house_median,medianhouseholdincome,meanhouseholdincome,highschoolorhigher,bachelororhigher,result
0,0,4,35,152,340,408,304,40,163,257,301,304,294,353,166,94,221,0
1,0,32,35,152,340,408,304,40,163,257,301,304,294,353,166,94,221,0
2,0,4,35,152,340,408,304,40,163,257,301,304,294,353,166,94,221,0
3,0,32,35,152,340,408,304,40,163,257,301,304,294,353,166,94,221,0
4,1,5,35,152,340,408,304,40,163,257,301,304,294,353,166,94,221,1


In [13]:
#Separate dataframe into features and targets
features = dataset[['incumbentstatus', 'candidateparty', 'medianage','white','black','asian','hispanicorlatino','unemploymentrate',
            'agriculture','manufacturing','finance','information','house_median','medianhouseholdincome','meanhouseholdincome','highschoolorhigher',
           'bachelororhigher']]
target   = dataset['result']

In [14]:
list(features)

['incumbentstatus',
 'candidateparty',
 'medianage',
 'white',
 'black',
 'asian',
 'hispanicorlatino',
 'unemploymentrate',
 'agriculture',
 'manufacturing',
 'finance',
 'information',
 'house_median',
 'medianhouseholdincome',
 'meanhouseholdincome',
 'highschoolorhigher',
 'bachelororhigher']

In [15]:
model = Lasso()
model.fit(features, target)
print(list(zip(features, model.coef_.tolist())))

[('incumbentstatus', 0.0), ('candidateparty', -0.0), ('medianage', -0.0), ('white', 0.0), ('black', 0.0), ('asian', -0.0), ('hispanicorlatino', 0.0), ('unemploymentrate', -0.0), ('agriculture', 0.0), ('manufacturing', -0.0), ('finance', -0.0), ('information', -0.0), ('house_median', -0.0), ('medianhouseholdincome', 0.0), ('meanhouseholdincome', 0.0), ('highschoolorhigher', -0.0), ('bachelororhigher', -0.0)]


In [16]:
model = Ridge()
model.fit(features, target)
print(list(zip(features, model.coef_.tolist())))

[('incumbentstatus', 0.8317846955697255), ('candidateparty', -0.0002772060417818232), ('medianage', -0.00015772424255546186), ('white', -4.821395453780255e-05), ('black', -9.62655399534681e-07), ('asian', 3.176256567310534e-05), ('hispanicorlatino', -1.0303239867055586e-05), ('unemploymentrate', -7.044010884138854e-05), ('agriculture', 2.400017536153644e-05), ('manufacturing', -3.2083849142785944e-06), ('finance', -2.2705894848254263e-05), ('information', -7.641972781548208e-05), ('house_median', 1.0077121186506848e-06), ('medianhouseholdincome', -2.487430173371346e-05), ('meanhouseholdincome', -2.917213019603983e-05), ('highschoolorhigher', -7.868159477628674e-05), ('bachelororhigher', -3.2878889960651736e-05)]


In [17]:
model = ElasticNet(l1_ratio=0.10)
model.fit(features, target)
print(list(zip(features, model.coef_.tolist())))

[('incumbentstatus', 0.08079335045543239), ('candidateparty', -0.0010560001119165323), ('medianage', -0.0), ('white', 7.724698338328954e-06), ('black', 5.2753600814912505e-05), ('asian', -3.3473648605532885e-05), ('hispanicorlatino', 1.0314389788809038e-05), ('unemploymentrate', -0.0), ('agriculture', 2.583461791826243e-05), ('manufacturing', -0.0), ('finance', -0.0), ('information', -3.704420298369707e-05), ('house_median', -7.617494173961689e-06), ('medianhouseholdincome', 4.1799555362359715e-05), ('meanhouseholdincome', 9.25507444771815e-06), ('highschoolorhigher', -0.0001266805990414452), ('bachelororhigher', -3.916980884598279e-05)]


In [21]:
model = Lasso()
sfm = SelectFromModel(model)
sfm.fit(features, target)
print(list(features[sfm.get_support(indices=True)]))

[]


In [22]:
model = Ridge()
sfm = SelectFromModel(model)
sfm.fit(features, target)
print(list(features[sfm.get_support(indices=True)]))

['incumbentstatus']


In [23]:
model = ElasticNet()
sfm = SelectFromModel(model)
sfm.fit(features, target)
print(list(features[sfm.get_support(indices=True)]))

['black', 'asian', 'information', 'medianhouseholdincome']


In [24]:
pca = PCA(n_components=2)
new_features = pca.fit(features).transform(features)
print(new_features)

[[-179.55100979  102.80043144]
 [-179.54437504  102.78777561]
 [-179.55100979  102.80043144]
 ..., 
 [-158.35599898  202.94382338]
 [-158.36263373  202.95647921]
 [-158.36242635  202.95597344]]


In [25]:
lda = LDA(n_components=2)
new_features = lda.fit(features, target).transform(features)
print(new_features)

[[-1.26915504]
 [-1.30183632]
 [-1.26915504]
 ..., 
 [-1.31058444]
 [-1.27790317]
 [ 2.26997834]]
