In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler

import statsmodels.api as sm
from sklearn.feature_selection import f_classif

In [2]:
df = pd.read_csv("./Data/final_dummies_add.csv")
df

Unnamed: 0,bill_id,title,sponsor_title,sponsor_name,sponsor_state,introduced_date,cosponsors,cosponsors_by_party,committees,summary,...,clean_committees_Rules and Administration,clean_committees_Science,clean_committees_Science and Technology,"clean_committees_Science, Space, and Technology",clean_committees_Small Business,clean_committees_Small Business and Entrepreneurship,clean_committees_Standards of Official Conduct,clean_committees_Transportation and Infrastructure,clean_committees_Ways and Means,law_Law
0,s1-115,An original bill to provide for reconciliation...,Sen.,Michael B. Enzi,WY,2017-11-28,0,{},['Budget'],(This measure has not been amended since it wa...,...,0,0,0,0,0,0,0,0,0,0
1,sconres3-115,A concurrent resolution setting forth the cong...,Sen.,Michael B. Enzi,WY,2017-01-03,0,{},['Budget'],(This measure has not been amended since it wa...,...,0,0,0,0,0,0,0,0,0,0
2,s512-115,A bill to modernize the regulation of nuclear ...,Sen.,John Barrasso,WY,2017-03-02,18,"{'D': 8, 'R': 10}",['Environment and Public Works'],Nuclear Energy Innovation and Modernization Ac...,...,0,0,0,0,0,0,0,0,0,1
3,hres694-115,Providing for consideration of the bill (H.R. ...,Rep.,Liz Cheney,WY,2018-01-16,0,{},['Rules'],Sets forth the rule for consideration of the b...,...,0,0,0,0,0,0,0,0,0,0
4,s1250-115,A bill to amend the Indian Health Care Improve...,Sen.,John Barrasso,WY,2017-05-25,3,{'R': 3},['Indian Affairs'],Restoring Accountability in the Indian Health ...,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112495,hr1702-106,"To amend title 18, United States Code, to ban ...",Rep.,Patsy T. Mink,HI,1999-05-05,13,{'D': 13},['Judiciary'],Amends the Federal criminal code to make it un...,...,0,0,0,0,0,0,0,0,0,0
112496,hr1748-106,"To amend title 5, United States Code, to incre...",Rep.,Patsy T. Mink,HI,1999-05-11,2,{'D': 2},['Government Reform'],Amends Federal retirement provisions to increa...,...,0,0,0,0,0,0,0,0,0,0
112497,sres32-106,A resolution to express the sense of the Senat...,,Daniel K. Inouye,HI,1999-02-04,0,{},"['Commerce, Science, and Transportation']",,...,0,0,0,0,0,0,0,0,0,0
112498,hr3451-106,To amend the Internal Revenue Code of 1986 to ...,,Neil Abercrombie,HI,1999-11-18,0,{},['Ways and Means'],,...,0,0,0,0,0,0,0,0,1,0


In [3]:
df.final_status_coms = df.final_status_coms.map(lambda x: 1 if x == "Action taken on bill" else 0)

In [4]:
corr = df.corr()

In [5]:
# Correlation with Law

In [6]:
corr['law_Law'].sort_values(ascending = False)

law_Law                                                    1.000000
final_status_coms                                          0.411012
clean_committees_Multiple                                  0.177496
summary_length                                             0.118123
bipartisan                                                 0.082906
                                                             ...   
primary_subject_Foreign trade and international finance   -0.041930
primary_subject_Taxation                                  -0.046980
clean_committees_Ways and Means                           -0.050848
bill_type_hres                                            -0.055703
clean_committees_Finance                                  -0.056812
Name: law_Law, Length: 106, dtype: float64

In [7]:
# Correlation with getting out of committee

In [32]:
corr['final_status_coms'].sort_values(ascending = False).tail(10)

bill_type_sres                                            -0.056951
region_Northeast                                          -0.057055
bill_type_hr                                              -0.062368
clean_committees_Judiciary                                -0.064513
clean_committees_Energy and Commerce                      -0.066617
primary_subject_Health                                    -0.081773
primary_subject_Foreign trade and international finance   -0.107087
primary_subject_Taxation                                  -0.107896
clean_committees_Finance                                  -0.121490
clean_committees_Ways and Means                           -0.138252
Name: final_status_coms, dtype: float64

In [9]:
top_features = corr[abs(corr['final_status_coms']) >=.05].drop(["law_Law", "final_status_coms"], axis = 1).index

In [10]:
top_features

Index(['cosponsors', 'final_status_coms', 'sponsor_party_rank', 'coms_match',
       'bipartisan', 'summary_length', 'sponsor_majority',
       'sponsor_pres_party_match', 'bill_type_hr', 'bill_type_sres',
       'sponsor_party_R', 'region_Northeast', 'region_West',
       'primary_subject_Commemorations', 'primary_subject_Congress',
       'primary_subject_Education',
       'primary_subject_Foreign trade and international finance',
       'primary_subject_Health',
       'primary_subject_Public Lands and Natural Resources',
       'primary_subject_Public lands and natural resources',
       'primary_subject_Taxation', 'clean_committees_Appropriations',
       'clean_committees_Energy and Commerce',
       'clean_committees_Energy and Natural Resources',
       'clean_committees_Finance',
       'clean_committees_Homeland Security and Governmental Affairs',
       'clean_committees_Indian Affairs', 'clean_committees_Judiciary',
       'clean_committees_Multiple', 'clean_committees_Rul

In [11]:
#features with top correlation

In [12]:
top_features = corr[abs(corr['final_status_coms']) >=.05].drop(["law_Law", "final_status_coms"]).T.columns

In [13]:
top_features

Index(['cosponsors', 'sponsor_party_rank', 'coms_match', 'bipartisan',
       'summary_length', 'sponsor_majority', 'sponsor_pres_party_match',
       'bill_type_hr', 'bill_type_sres', 'sponsor_party_R', 'region_Northeast',
       'region_West', 'primary_subject_Commemorations',
       'primary_subject_Congress', 'primary_subject_Education',
       'primary_subject_Foreign trade and international finance',
       'primary_subject_Health',
       'primary_subject_Public Lands and Natural Resources',
       'primary_subject_Public lands and natural resources',
       'primary_subject_Taxation', 'clean_committees_Appropriations',
       'clean_committees_Energy and Commerce',
       'clean_committees_Energy and Natural Resources',
       'clean_committees_Finance',
       'clean_committees_Homeland Security and Governmental Affairs',
       'clean_committees_Indian Affairs', 'clean_committees_Judiciary',
       'clean_committees_Multiple', 'clean_committees_Rules',
       'clean_committee

# Different distributions?

In [14]:
from scipy.stats import ks_2samp
#is the distribution of bipartisan laws different from the distribution of non bipartisan

# Using f_classif to see p-values

In [15]:
x = df.select_dtypes(int, float).drop(columns = ["law_Law", "final_status_coms"], axis = 1)
y = df.law_Law

In [16]:
import pprint as pp

In [17]:
values = (f_classif(x, y))

In [18]:
values

(array([2.26169909e+02, 1.35867886e+02, 5.39005183e+01, 8.90216622e+01,
        5.46524357e+01, 1.59191446e+03, 6.69817644e+02, 5.40980659e+01,
        2.34717447e+01, 2.64540498e+02, 2.50867138e+02, 3.50154058e+02,
        6.17371788e+00, 3.04203531e+01, 7.51149204e+01, 1.08261890e+02,
        7.96374574e+00, 2.55315542e+00, 4.75784292e+02, 7.56486528e+01,
        1.08736468e+01, 1.23291298e+01, 2.24894840e-01, 5.77524110e+00,
        7.16324584e+02, 3.30371485e-04, 8.84161472e+01, 6.91988977e-02,
        2.93219308e-01, 7.53870395e+01, 1.40647358e+01, 1.36829042e+01,
        1.03343573e+01, 3.01132594e+00, 1.98132753e+02, 4.49967414e+02,
        2.56078381e+02, 1.17997226e+02, 7.04172048e+00, 3.39176231e+00,
        2.22258171e+01, 1.99621150e+01, 8.46482996e-01, 8.44941399e+01,
        2.59801326e+00, 1.80084739e+02, 2.48841715e+02, 2.48085748e+00,
        7.19568020e-02, 1.33663345e+01, 1.65828429e+01, 5.05495244e+02,
        5.73884790e+01, 6.18966249e-01, 1.12115336e+01, 7.324330

In [19]:
f_values = pd.DataFrame(values).T
f_values

Unnamed: 0,0,1
0,226.169909,4.574979e-51
1,135.867886,2.223148e-31
2,53.900518,2.123181e-13
3,89.021662,3.976038e-21
4,54.652436,1.448385e-13
...,...,...
98,5.047054,2.466978e-02
99,0.851079,3.562492e-01
100,0.683697,4.083184e-01
101,2.635952,1.044722e-01


In [20]:
f_values.rename(columns = {0:"fvalue", 1:"pvalue" }, inplace = True)

In [21]:
f_values

Unnamed: 0,fvalue,pvalue
0,226.169909,4.574979e-51
1,135.867886,2.223148e-31
2,53.900518,2.123181e-13
3,89.021662,3.976038e-21
4,54.652436,1.448385e-13
...,...,...
98,5.047054,2.466978e-02
99,0.851079,3.562492e-01
100,0.683697,4.083184e-01
101,2.635952,1.044722e-01


In [22]:
f_values[f_values["pvalue"]<=.05]

Unnamed: 0,fvalue,pvalue
0,226.169909,4.574979e-51
1,135.867886,2.223148e-31
2,53.900518,2.123181e-13
3,89.021662,3.976038e-21
4,54.652436,1.448385e-13
...,...,...
94,12.276423,4.588947e-04
96,5.512361,1.888427e-02
97,3.880202,4.886112e-02
98,5.047054,2.466978e-02


# Categorical Correlation

In [23]:
import scipy.stats as ss

In [24]:
confusion_matrix = pd.crosstab(df["sponsor_state"], df["committees"])

In [25]:
#Cramer's V

In [26]:
def cramers_stat(confusion_matrix):
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    return np.sqrt(chi2 / (n*(min(confusion_matrix.shape)-1)))

result = cramers_stat(confusion_matrix)

In [27]:
result

committees
['  Administration', '  Agriculture', '  Appropriations', '  Budget', '  Education and the Workforce', '  Energy and Commerce', '  Financial Services', '  Judiciary', '  Natural Resources', '  Rules', '  Science, Space, and Technology', '  Transportation and Infrastructure', '  Ways and Means', 'Oversight and Government Reform']       62.058897
['  Administration', '  Agriculture', '  Appropriations', '  Education and the Workforce', '  Energy and Commerce', '  Judiciary', '  Natural Resources', '  Oversight and Government Reform', '  Rules', '  Ways and Means', 'Budget']                                                                                                          62.058897
['  Administration', '  Agriculture', '  Armed Services', '  Education and the Workforce', '  Energy and Commerce', '  Financial Services', '  Foreign Affairs', '  Natural Resources', '  Oversight and Government Reform', '  Science, Space, and Technology', '  Transportation and Infrastructure',

In [28]:
df.columns[40:70]

Index(['region_West', 'primary_subject_Armed Forces and National Security',
       'primary_subject_Armed forces and national security',
       'primary_subject_Commemorations', 'primary_subject_Commerce',
       'primary_subject_Congress', 'primary_subject_Crime and Law Enforcement',
       'primary_subject_Crime and law enforcement',
       'primary_subject_Education', 'primary_subject_Energy',
       'primary_subject_Environmental protection',
       'primary_subject_Finance and Financial Sector',
       'primary_subject_Finance and financial sector',
       'primary_subject_Foreign trade and international finance',
       'primary_subject_Government Operations and Politics',
       'primary_subject_Government operations and politics',
       'primary_subject_Health', 'primary_subject_Immigration',
       'primary_subject_International Affairs',
       'primary_subject_International affairs',
       'primary_subject_Labor and employment', 'primary_subject_Law',
       'primary_subje

In [29]:
confusion_matrix = pd.crosstab( df["region_South"], df["primary_subject_Armed forces and national security"])

def cramers_stat(confusion_matrix):
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    return np.sqrt(chi2 / (n*(min(confusion_matrix.shape)-1)))

result = cramers_stat(confusion_matrix)
result

primary_subject_Armed forces and national security
0    0.005393
1    0.026553
dtype: float64

In [30]:
confusion_matrix = pd.crosstab(df["region_Northeast"], df["primary_subject_International affairs"])

def cramers_stat(confusion_matrix):
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    return np.sqrt(chi2 / (n*(min(confusion_matrix.shape)-1)))

result = cramers_stat(confusion_matrix)
result

primary_subject_International affairs
0    0.030355
1    0.150885
dtype: float64