# Modeling – Passed and Archived Bills Only Using 'Party' Feature

## Importing Python Packages

In [2]:
# For loading, manipulating dataframe.
import pandas as pd
import sqlite3, datetime, sklearn

# Visualizations
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

# Text Preprocessing
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

# Sampling
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler 

# Vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Feature Processing 
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

# Modeling
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

# Hide Warning messages
import warnings
warnings.filterwarnings('ignore')

In [3]:
con = sqlite3.connect("phBills.db")
df_bills = pd.read_sql_query("SELECT * from senateBills", con, index_col = None, parse_dates = ['date_filed','date_lastUpdate'])
con.close()

In [4]:
df_senators = pd.read_excel('Bills with Authors.xlsx',sheet_name='Sheet1')

In [5]:
df_bills['passed']=np.where(df_bills.ra!='',True, False)
df_bills['long_title']=df_bills['long_title'].str.lower()
df_bills = df_bills[df_bills['congress']!=18]

In [6]:
df_bills.drop(['link','short_title','subject','pri_committee','ra','logs'],axis=1,inplace=True)

In [7]:
df_senators=df_senators[['bill_id','Full Name Primary Author','Party','Bloc','Years of Service']]

In [8]:
df_congress = pd.merge(df_bills,df_senators,how='left',on=['bill_id'])
print(f"Congress DF Data Points: {df_congress.shape[0]:,}")

Congress DF Data Points: 15,131


In [9]:
df_congress=df_congress[df_congress.status!='Withdrawn']

In [10]:
df_congress.loc[df_congress['Full Name Primary Author']=='Mannuel Villar Jr','Years of Service'] = df_congress.date_filed.apply(
                                                        lambda x: x.year - 2001)
df_congress.loc[df_congress['Full Name Primary Author']=='Mannuel Villar Jr','Party'] = 'Independent'
df_congress.loc[df_congress['Full Name Primary Author']=='Mannuel Villar Jr','Bloc'] = 'Majority'

In [11]:
df_congress['num_authors']=((df_congress.author.str.count(',') + 1)/2).astype('int')

In [12]:
df_congress['delta_days']=df_congress.date_lastUpdate-df_congress.date_filed

In [13]:
df_congress['delta_days']=df_congress['delta_days'].astype('timedelta64[D]')

In [14]:
print(f"median: {df_congress[df_congress['passed']==True]['delta_days'].median()}")
print(f"mean: {df_congress[df_congress['passed']==True]['delta_days'].mean()}")
print(f"mode: {df_congress[df_congress['passed']==True]['delta_days'].mode()[0]}")

median: 306.5
mean: 374.64611872146116
mode: 546.0


In [15]:
bound = df_congress[df_congress['passed']==True]['delta_days'].mean()
df_congress['upper']=np.where(df_congress.delta_days>bound,True,False)

In [16]:
df_congress['mon']=df_congress.date_filed.apply(lambda x: x.month)

In [17]:
df_congress.loc[(df_congress.mon.isin([1,2,3])), 'quarter']= 1
df_congress.loc[df_congress.mon.isin([4,5,6]), 'quarter']= 2
df_congress.loc[df_congress.mon.isin([7,8,9]), 'quarter']= 3
df_congress.loc[df_congress.mon.isin([10,11,12]), 'quarter']= 4

In [18]:
df_congress['scope_national']=np.where(df_congress.scope=='National',True,False)

In [19]:
df_congress['majority_bloc']=np.where(df_congress['Bloc']=='Majority',True,False)

In [20]:
df_congress['len_desc']=df_congress.long_title.str.len()

In [21]:
df_congress=df_congress[(df_congress.status=='Sent to the Archives') | (df_congress.passed==True)]
print(f"df_congress Data Points: {df_congress.shape[0]:,}")

df_congress Data Points: 529


In [22]:
df_congress.isna().sum().any()

False

## One-Hot Encoding on 'Party' Feature

In [23]:
ohe = OneHotEncoder(sparse=False)

In [24]:
X = ohe.fit_transform(df_congress[['Party']])

In [25]:
X.shape

(529, 15)

In [26]:
df_congress.shape

(529, 22)

In [27]:
ohe.categories_

[array(['Akbayan', 'Bagumbayan-VNP', 'Independent', 'LDP', 'Lakas',
        'Lakas-CMD', 'Lakas-Kampi-CMD', 'Liberal', 'NPC', 'Nacionalista',
        'PDP-Laban', 'PDP–Laban', 'PMP', 'PRP', 'UNA'], dtype=object)]

## Prediction Using Hot-Encoded 'Party' field Only

In [28]:
log_reg = LogisticRegression(solver='lbfgs')

In [29]:
y = df_congress['passed']
y.shape

(529,)

In [30]:
cross_val_score(log_reg, X, y, cv=5, scoring='accuracy').mean()

0.9127915627545322

In [31]:
y.value_counts()

True     438
False     91
Name: passed, dtype: int64

In [32]:
y.value_counts(normalize=True)

True     0.827977
False    0.172023
Name: passed, dtype: float64

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=43)

In [34]:
print(f"Our new train set has {X_train.shape[0]} data points, {np.sum(y_train)} of which are actual positives.")

Our new train set has 396 data points, 331 of which are actual positives.


In [35]:
svc = SVC(gamma='auto', kernel='linear', random_state = 42)
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [36]:
predicted = svc.predict(X_test)

## Evaluation

In [37]:
print(classification_report(y_train, svc.predict(X_train)))

              precision    recall  f1-score   support

       False       0.75      0.58      0.66        65
        True       0.92      0.96      0.94       331

    accuracy                           0.90       396
   macro avg       0.83      0.77      0.80       396
weighted avg       0.89      0.90      0.89       396



In [38]:
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

       False       0.92      0.85      0.88        26
        True       0.96      0.98      0.97       107

    accuracy                           0.95       133
   macro avg       0.94      0.91      0.93       133
weighted avg       0.95      0.95      0.95       133

