# Modeling – Passed, Pending, Archived Bills Using 'Party' Feature Only

## Importing Python Packages

In [1]:
# For loading, manipulating dataframe.
import pandas as pd
import sqlite3, datetime, sklearn

# Visualizations
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

# Text Preprocessing
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

# Sampling
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler 

# Vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Feature Processing 
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

# Modeling
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

# Hide Warning messages
import warnings
warnings.filterwarnings('ignore')

In [2]:
con = sqlite3.connect("phBills.db")
df_bills = pd.read_sql_query("SELECT * from senateBills", con, index_col = None, parse_dates = ['date_filed','date_lastUpdate'])
con.close()

In [3]:
df_senators = pd.read_excel('Bills with Authors.xlsx',sheet_name='Sheet1')

In [4]:
df_bills['passed']=np.where(df_bills.ra!='',True, False)
df_bills['long_title']=df_bills['long_title'].str.lower()
df_bills = df_bills[df_bills['congress']!=18]

In [5]:
df_bills.drop(['link','short_title','subject','pri_committee','ra','logs'],axis=1,inplace=True)

In [6]:
df_senators=df_senators[['bill_id','Full Name Primary Author','Party','Bloc','Years of Service']]

In [7]:
df_congress = pd.merge(df_bills,df_senators,how='left',on=['bill_id'])
print(f"Congress DF Data Points: {df_congress.shape[0]:,}")

Congress DF Data Points: 15,131


In [8]:
df_congress=df_congress[df_congress.status!='Withdrawn']

In [9]:
df_congress.loc[df_congress['Full Name Primary Author']=='Mannuel Villar Jr','Years of Service'] = df_congress.date_filed.apply(
                                                        lambda x: x.year - 2001)
df_congress.loc[df_congress['Full Name Primary Author']=='Mannuel Villar Jr','Party'] = 'Independent'
df_congress.loc[df_congress['Full Name Primary Author']=='Mannuel Villar Jr','Bloc'] = 'Majority'

In [10]:
df_congress['num_authors']=((df_congress.author.str.count(',') + 1)/2).astype('int')

In [11]:
df_congress['delta_days']=df_congress.date_lastUpdate-df_congress.date_filed

In [12]:
df_congress['delta_days']=df_congress['delta_days'].astype('timedelta64[D]')

In [13]:
print(f"median: {df_congress[df_congress['passed']==True]['delta_days'].median()}")
print(f"mean: {df_congress[df_congress['passed']==True]['delta_days'].mean()}")
print(f"mode: {df_congress[df_congress['passed']==True]['delta_days'].mode()[0]}")

median: 306.5
mean: 374.64611872146116
mode: 546.0


In [14]:
bound = df_congress[df_congress['passed']==True]['delta_days'].mean()
df_congress['upper']=np.where(df_congress.delta_days>bound,True,False)

In [15]:
df_congress['mon']=df_congress.date_filed.apply(lambda x: x.month)

In [16]:
df_congress.loc[(df_congress.mon.isin([1,2,3])), 'quarter']= 1
df_congress.loc[df_congress.mon.isin([4,5,6]), 'quarter']= 2
df_congress.loc[df_congress.mon.isin([7,8,9]), 'quarter']= 3
df_congress.loc[df_congress.mon.isin([10,11,12]), 'quarter']= 4

In [17]:
df_congress['scope_national']=np.where(df_congress.scope=='National',True,False)

In [18]:
df_congress['majority_bloc']=np.where(df_congress['Bloc']=='Majority',True,False)

In [19]:
df_congress['len_desc']=df_congress.long_title.str.len()

In [20]:
print(f"df_congress Data Points: {df_congress.shape[0]:,}")

df_congress Data Points: 15,078


In [21]:
df_congress.isna().sum().any()

False

## One-Hot Encoding on 'Party' Feature

In [22]:
ohe = OneHotEncoder(sparse=False)

In [23]:
X = ohe.fit_transform(df_congress[['Party']])

In [24]:
X.shape

(15078, 15)

In [25]:
df_congress.shape

(15078, 22)

In [26]:
ohe.categories_

[array(['Akbayan', 'Bagumbayan-VNP', 'Independent', 'LDP', 'Lakas',
        'Lakas-CMD', 'Lakas-Kampi-CMD', 'Liberal', 'NPC', 'Nacionalista',
        'PDP-Laban', 'PDP–Laban', 'PMP', 'PRP', 'UNA'], dtype=object)]

## Prediction Using Hot-Encoded 'Party' field Only

In [27]:
log_reg = LogisticRegression(solver='lbfgs')

In [28]:
y = df_congress['passed']
y.shape

(15078,)

In [29]:
cross_val_score(log_reg, X, y, cv=5, scoring='accuracy').mean()

0.9709510801430513

In [30]:
y.value_counts(normalize=True)

False    0.970951
True     0.029049
Name: passed, dtype: float64

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=43)

In [32]:
rus = RandomUnderSampler(random_state=42)
nx_train, ny_train = rus.fit_resample(X_train, y_train)
print(f"Our new train set has {ny_train.shape[0]} data points, {np.sum(ny_train)} of which are actual positives.")

Our new train set has 642 data points, 321 of which are actual positives.


In [33]:
svc = SVC(gamma='auto', kernel='linear', random_state = 42)
svc.fit(nx_train, ny_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [34]:
predicted = svc.predict(X_test)

## Evaluation

In [35]:
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

       False       0.99      0.48      0.65      3653
        True       0.05      0.84      0.09       117

    accuracy                           0.49      3770
   macro avg       0.52      0.66      0.37      3770
weighted avg       0.96      0.49      0.63      3770

