# Exploratory Data Analytics on Philippine Bills from 13th Congress to 17th Congress

## Importing Python Packages

In [65]:
# For loading, manipulating dataframe.
import pandas as pd
import sqlite3, datetime, re

# For Text Preprocessing
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

# Vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Visualizations
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

# Modeling
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler 
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Hide Warning messages
import warnings
warnings.filterwarnings('ignore')

In [2]:
con = sqlite3.connect("phBills.db")
df_bills = pd.read_sql_query("SELECT * from senateBills", con, index_col = None, parse_dates = ['date_filed','date_lastUpdate'])
con.close()

In [3]:
df_senators = pd.read_excel('Bills with Authors.xlsx',sheet_name='Sheet1')

In [4]:
df_bills['passed']=np.where(df_bills.ra!='',True, False)
df_bills['long_title']=df_bills['long_title'].str.lower()
df_bills = df_bills[df_bills['congress']!=18]

In [5]:
df_bills.status.value_counts()

Pending in the Committee                            11722
Consolidated/Substituted in the Committee Report     2337
Approved by the President of the Philippines          329
Pending Second Reading, Special Order                 241
Consolidated with Approved Bill                        97
Pending in the House of Representatives                95
Sent to the Archives                                   91
Pending Second Reading, Ordinary Business              86
Withdrawn                                              53
Lapsed Into Law                                        27
Pending Conference Committee                           17
Approved on Third Reading by the Senate                 9
Conference Committee Report Approved by Senate          7
Vetoed by the President of the Philippines              7
Pending First Reading                                   4
Approved on  Second Reading, with Amendments            3
Committee Report Sent to the Archives                   2
Passed by Both

In [6]:
df_bills.drop(['link','short_title','subject','pri_committee','ra','logs'],axis=1,inplace=True)

In [7]:
df_bills.isna().any()

bill_id            False
num                False
congress           False
long_title         False
date_filed         False
scope              False
status             False
author             False
date_lastUpdate    False
passed             False
dtype: bool

In [8]:
df_senators=df_senators[['bill_id','Full Name Primary Author','Party','Bloc','Years of Service']]

In [9]:
df_senators.head(2)

Unnamed: 0,bill_id,Full Name Primary Author,Party,Bloc,Years of Service
0,17SBN-2235,Franklin Drilon,Liberal,Minority,24.0
1,17SBN-2234,Ralph Recto,Nacionalista,Majority,18.0


In [10]:
print(f"Bills Data Points: {df_bills.shape[0]:,}\nSenators Data Points: {df_senators.shape[0]:,}")

Bills Data Points: 15,131
Senators Data Points: 15,131


In [11]:
df_congress = pd.merge(df_bills,df_senators,how='left',on=['bill_id'])
print(f"Congress DF Data Points: {df_congress.shape[0]:,}")

Congress DF Data Points: 15,131


In [12]:
df_congress=df_congress[df_congress.status!='Withdrawn']

In [13]:
df_congress.isna().sum()

bill_id                        0
num                            0
congress                       0
long_title                     0
date_filed                     0
scope                          0
status                         0
author                         0
date_lastUpdate                0
passed                         0
Full Name Primary Author       0
Party                          1
Bloc                           1
Years of Service            1334
dtype: int64

In [14]:
df_congress.loc[df_congress['Full Name Primary Author']=='Mannuel Villar Jr','Years of Service'] = df_congress.date_filed.apply(
                                                        lambda x: x.year - 2001)
df_congress.loc[df_congress['Full Name Primary Author']=='Mannuel Villar Jr','Party'] = 'Independent'
df_congress.loc[df_congress['Full Name Primary Author']=='Mannuel Villar Jr','Bloc'] = 'Majority'

In [15]:
df_congress.isna().sum()

bill_id                     0
num                         0
congress                    0
long_title                  0
date_filed                  0
scope                       0
status                      0
author                      0
date_lastUpdate             0
passed                      0
Full Name Primary Author    0
Party                       0
Bloc                        0
Years of Service            0
dtype: int64

In [16]:
df_congress.Party.value_counts(dropna=False)

PRP                2998
Independent        2934
PMP                2400
Nacionalista       1541
Liberal            1188
NPC                1023
LDP                 627
PDP-Laban           503
Lakas               436
Lakas-Kampi-CMD     404
Lakas-CMD           351
UNA                 281
PDP–Laban           225
Bagumbayan-VNP      104
Akbayan              63
Name: Party, dtype: int64

In [17]:
df_congress.Bloc.value_counts(dropna=False)

Majority    10268
Minority     4810
Name: Bloc, dtype: int64

In [18]:
print(f"df_congress Data Points: {df_congress.shape[0]:,}")

df_congress Data Points: 15,078


In [19]:
df_congress=df_congress[(df_congress.status=='Sent to the Archives') | (df_congress.passed==True)]
print(f"New df_congress Data Points: {df_congress.shape[0]:,}")

New df_congress Data Points: 529


In [20]:
df_congress['num_authors']=((df_congress.author.str.count(',') + 1)/2).astype('int')

In [21]:
df_congress['delta_days']=df_congress.date_lastUpdate-df_congress.date_filed

In [22]:
df_congress['delta_days']=df_congress['delta_days'].astype('timedelta64[D]')

In [23]:
print(f"median: {df_congress[df_congress['passed']==True]['delta_days'].median()}")
print(f"mean: {df_congress[df_congress['passed']==True]['delta_days'].mean()}")
print(f"mode: {df_congress[df_congress['passed']==True]['delta_days'].mode()[0]}")

median: 306.5
mean: 374.64611872146116
mode: 546.0


In [24]:
bound = df_congress[df_congress['passed']==True]['delta_days'].mean()
df_congress['upper']=np.where(df_congress.delta_days>bound,True,False)

In [25]:
df_congress['mon']=df_congress.date_filed.apply(lambda x: x.month)

In [26]:
df_congress.loc[(df_congress.mon.isin([1,2,3])), 'quarter']= 1
df_congress.loc[df_congress.mon.isin([4,5,6]), 'quarter']= 2
df_congress.loc[df_congress.mon.isin([7,8,9]), 'quarter']= 3
df_congress.loc[df_congress.mon.isin([10,11,12]), 'quarter']= 4

In [27]:
df_congress[df_congress['passed']==True].mon.value_counts(sort=False)

1     40
2     34
3     30
4      4
5     53
6     34
7     33
8     26
9     44
10    30
11    70
12    40
Name: mon, dtype: int64

In [28]:
df_congress[df_congress['passed']==True].quarter.value_counts(sort=False)

2.0     91
1.0    104
4.0    140
3.0    103
Name: quarter, dtype: int64

In [29]:
df_congress.scope.value_counts()

National    511
Local        18
Name: scope, dtype: int64

In [30]:
df_congress['scope_national']=np.where(df_congress.scope=='National',True,False)

In [31]:
df_congress.scope_national.value_counts()

True     511
False     18
Name: scope_national, dtype: int64

In [32]:
df_congress[df_congress['passed']==True].Bloc.value_counts(dropna=False)

Majority    353
Minority     85
Name: Bloc, dtype: int64

In [33]:
df_congress[df_congress['passed']==False].Bloc.value_counts(dropna=False)

Majority    75
Minority    16
Name: Bloc, dtype: int64

In [34]:
df_congress['majority_bloc']=np.where(df_congress['Bloc']=='Majority',True,False)

In [35]:
df_congress.dtypes

bill_id                             object
num                                 object
congress                             int64
long_title                          object
date_filed                  datetime64[ns]
scope                               object
status                              object
author                              object
date_lastUpdate             datetime64[ns]
passed                                bool
Full Name Primary Author            object
Party                               object
Bloc                                object
Years of Service                   float64
num_authors                          int64
delta_days                         float64
upper                                 bool
mon                                  int64
quarter                            float64
scope_national                        bool
majority_bloc                         bool
dtype: object

In [40]:
df_congress['len_desc']=df_congress.long_title.str.len()

In [48]:
df_congress.head(2)

Unnamed: 0,bill_id,num,congress,long_title,date_filed,scope,status,author,date_lastUpdate,passed,...,Bloc,Years of Service,num_authors,delta_days,upper,mon,quarter,scope_national,majority_bloc,len_desc
1,17SBN-2234,SBN-2234,17,an act authorizing the sale of certain parcels...,2019-05-27,National,Approved by the President of the Philippines,"Recto, Ralph G., Sotto III, Vicente C., Escude...",2019-06-30,True,...,Majority,18.0,3,34.0,False,5,2.0,True,True,300
2,17SBN-2233,SBN-2233,17,an act increasing the excise tax on tobacco pr...,2019-05-27,National,Consolidated with Approved Bill,"Pacquiao, Emmanuel ""Manny"" D., Ejercito, Josep...",2019-06-30,True,...,Majority,3.0,6,34.0,False,5,2.0,True,True,426


In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [50]:
ohe = OneHotEncoder(sparse=False)

In [56]:
X = ohe.fit_transform(df_congress[['Party']])

In [57]:
X.shape

(529, 15)

In [58]:
df_congress.shape

(529, 22)

In [52]:
ohe.categories_

[array(['Akbayan', 'Bagumbayan-VNP', 'Independent', 'LDP', 'Lakas',
        'Lakas-CMD', 'Lakas-Kampi-CMD', 'Liberal', 'NPC', 'Nacionalista',
        'PDP-Laban', 'PDP–Laban', 'PMP', 'PRP', 'UNA'], dtype=object)]

In [54]:
log_reg = LogisticRegression(solver='lbfgs')

In [55]:
from sklearn.model_selection import cross_val_score

In [59]:
y = df_congress['passed']

In [60]:
cross_val_score(log_reg,X, y, cv=5, scoring='accuracy').mean()

0.9127915627545322

In [63]:
y.value_counts()

True     438
False     91
Name: passed, dtype: int64

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=43)

In [68]:
svc = SVC(gamma='auto', kernel='linear', random_state = 42)
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [69]:
predicted = svc.predict(X_test)

In [70]:
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

       False       0.92      0.85      0.88        26
        True       0.96      0.98      0.97       107

    accuracy                           0.95       133
   macro avg       0.94      0.91      0.93       133
weighted avg       0.95      0.95      0.95       133

