# Exploratory Data Analytics on Philippine Bills from 13th Congress to 17th Congress

## Importing Python Packages

In [1]:
# For loading, manipulating dataframe.
import pandas as pd
import sqlite3, datetime, re

# For Text Preprocessing
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

# Vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Visualizations
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

# Hide Warning messages
import warnings
warnings.filterwarnings('ignore')

In [2]:
con = sqlite3.connect("phBills.db")
df = pd.read_sql_query("SELECT * from senateBills", con, index_col = None, parse_dates = ['date_filed','date_lastUpdate'])
con.close()

In [3]:
df_senators = pd.read_excel('Bills with Authors.xlsx',sheet_name='Sheet2')

In [4]:
df['Passed']=np.where(df.ra!='',True, False)
df['long_title']=df['long_title'].str.lower()
df_13_17_all = df[df['congress']!=18]
df_num_rows, df_num_col = df.shape
print(f"There are {df_num_rows} records and {df_num_col} features from our original data frame.")
bill_passage_rate = (df_13_17_all['ra']!='').sum()/((df_13_17_all['ra']!='').sum() + (df_13_17_all['ra']=='').sum())
print(f"From 13th Congress to 17th Congress, the PH Bill Passage Percentage is {(bill_passage_rate*100):.2f}%")

There are 16284 records and 16 features from our original data frame.
From 13th Congress to 17th Congress, the PH Bill Passage Percentage is 2.89%


In [5]:
df_merged = pd.merge(df_13_17_all,df_senators[['bill_id','Bloc','Party']],how='left',on=['bill_id'])

In [6]:
df_merged[df_merged['Passed']==True].Party.value_counts()

Independent        97
Nacionalista       74
NPC                58
Liberal            58
PMP                40
LDP                34
PDP–Laban          16
PRP                15
Lakas-Kampi-CMD    12
Lakas               7
UNA                 7
PDP-Laban           6
Bagumbayan-VNP      6
Lakas-CMD           5
Akbayan             3
Name: Party, dtype: int64

In [7]:
df_merged=df_merged[df_merged['status']!='Withdrawn']

In [8]:
df_merged['num_authors']=((df_merged.author.str.count(',') + 1)/2).astype('int')

In [10]:
df_merged['lapsed_day']=df_merged.date_lastUpdate-df_merged.date_filed

In [11]:
df_merged.Bloc.fillna('Majority',inplace=True)

In [12]:
passed_ave = df_merged[df_merged['Passed']==True]['lapsed_day'].median()
df_merged['upper']=np.where(df_merged.lapsed_day>passed_ave,True,False)

In [13]:
df_merged[df_merged['Passed']==True]['lapsed_day'].median()

Timedelta('306 days 12:00:00')

In [14]:
df_merged[df_merged['Passed']==True]['lapsed_day'].mean()

Timedelta('374 days 15:30:24.657534')

In [15]:
df_merged[df_merged['Passed']==True]['lapsed_day'].mode()[0]

Timedelta('546 days 00:00:00')

In [16]:
df_merged['mon']=df_merged.date_filed.apply(lambda x: x.month)

In [17]:
df_merged[df_merged['Passed']==True].mon.value_counts()

11    70
5     53
9     44
12    40
1     40
6     34
2     34
7     33
10    30
3     30
8     26
4      4
Name: mon, dtype: int64

In [18]:
print(df_merged[df_merged['Passed']==True].scope.value_counts(dropna=False))
print(f"\nOnly {(((df_merged.Passed==True)&\
(df_merged.scope=='Local')).sum())/((df_merged.Passed==True).sum()):.2%} of local senate bills are passed.")

National    421
Local        17
Name: scope, dtype: int64

Only 3.88% of local senate bills are passed.


In [19]:
df_merged.Bloc.value_counts(dropna=False)

Majority    9860
Minority    5218
Name: Bloc, dtype: int64

In [20]:
df_merged[df_merged['Passed']==True].Bloc.value_counts(dropna=False)

Majority    351
Minority     87
Name: Bloc, dtype: int64

In [21]:
df_merged['majority']=np.where(df_merged['Bloc']=='Majority',True,False)

In [22]:
df_merged=df_merged[(df_merged.status=='Sent to the Archives') | (df_merged.Passed==True)]

In [23]:
cols_of_interest = ['num_authors','lapsed_day','upper','mon','majority','Passed']
data_set = df_merged[cols_of_interest]
data_set['lapsed_day'] = data_set.lapsed_day.astype('int')

In [24]:
data_set.corr()

Unnamed: 0,num_authors,lapsed_day,upper,mon,majority,Passed
num_authors,1.0,-0.091366,-0.080328,0.073247,-0.054739,0.334389
lapsed_day,-0.091366,1.0,0.832508,0.247615,-0.098108,0.197407
upper,-0.080328,0.832508,1.0,0.192588,-0.042831,0.2125
mon,0.073247,0.247615,0.192588,1.0,-0.005113,0.370561
majority,-0.054739,-0.098108,-0.042831,-0.005113,1.0,-0.021736
Passed,0.334389,0.197407,0.2125,0.370561,-0.021736,1.0


In [25]:
# plt.style.use('seaborn-dark')
# sns.kdeplot(df_13_17_all['lapsed_day'].astype('int'), shade=True, label='Number of Days from Filing to Passing of Law');

In [26]:
df_merged['long_title'] = df_merged['long_title'].map(word_tokenize)

In [27]:
df_merged['long_title'] = df_merged['long_title'].map(pos_tag)

In [28]:
df_merged['long_title'] = df_merged['long_title'].map(lambda words: 
                            [word for word in words if (word[1].startswith('N') or word[1].startswith('J'))])

In [29]:
stoppers = stopwords.words('english')
stoppers.extend(['act', 'presidential', 'ii', 'eo', 'code', 'government', 'provide', 'purpose', 'therefor', 'penal', 
           'thereof', 'revised', 'article', 'provision', 'amended', 'therefore', 'ng', 'ra', 'b', 'san', 'sa', 
           'del', 'pd', 'decree', 'rano', 'fund', 'program', 'national', 'law', 'republic', 'philippine', 'otherwise', 
           'section','philippines','xxii','xviii','xiv','xiii','xii','wvcst','ix','iv'])

In [30]:
df_merged['long_title']=df_merged['long_title'].map(lambda tags: 
                                                                [tag[0] for tag in tags if tag[0] not in stoppers])

In [31]:
lemmatizer = WordNetLemmatizer()
df_merged['long_title'] = df_merged['long_title'].map(lambda words: 
                                                                [lemmatizer.lemmatize(word) for word in words])

In [32]:
def allWordChar(word):
    return not(any(char.isdigit() for char in word))

In [33]:
df_merged['long_title'] = df_merged['long_title'].apply(lambda words:
                                                      [word for word in words if allWordChar(word)])

In [34]:
df_merged['processed'] = df_merged['long_title'].apply(lambda x: ' '.join(x))

In [35]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_out = tfidf_vectorizer.fit_transform(df_merged['processed'])

In [36]:
df_vect_words = pd.DataFrame(sorted(tfidf_vectorizer.vocabulary_.items(), key=lambda kv: kv[1], reverse=True),
                  columns = ['Word', 'Count'])

In [37]:
# plt.style.use('seaborn-dark')
# plt.figure(figsize=(15,6))
# plt.bar(df_vect_words['Word'].head(30), df_vect_words['Count'].head(30))

## Modeling

In [38]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler 
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [39]:
ds = pd.DataFrame(tfidf_out.todense())

In [40]:
cols_of_interest = ['num_authors','lapsed_day','upper','mon','majority','Passed']

In [53]:
ds['num_authors']=data_set['num_authors'].tolist()
ds['lapsed_day']=data_set['lapsed_day'].tolist()
ds['Passed']=data_set['Passed'].tolist()
# ds['mon']=data_set['mon'].tolist()
# ds['majority']=data_set['majority'].tolist()

In [65]:
dstest[(dstest.Passed>0.8).any()]

0             -0.003277
1              1.000000
2             -0.002481
3             -0.003510
4             -0.005990
                 ...   
1494          -0.002481
1495          -0.004494
num_authors   -0.022102
lapsed_day    -0.032719
Passed        -0.080398
Name: 1, Length: 1499, dtype: float64

In [62]:
dstest = ds.corr()
ds.corr()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1489,1490,1491,1492,1493,1494,1495,num_authors,lapsed_day,Passed
0,1.000000,-0.003277,-0.002502,-0.003538,-0.006039,-0.003406,-0.002502,-0.003155,-0.002502,-0.004283,...,-0.002502,-0.004023,-0.002502,-0.003541,-0.002502,-0.002502,-0.004531,-0.013733,0.023160,0.026201
1,-0.003277,1.000000,-0.002481,-0.003510,-0.005990,-0.003378,-0.002481,-0.003130,-0.002481,-0.004249,...,-0.002481,-0.003990,-0.002481,-0.003512,-0.002481,-0.002481,-0.004494,-0.022102,-0.032719,-0.080398
2,-0.002502,-0.002481,1.000000,-0.002679,-0.004572,-0.002579,-0.001894,-0.002389,-0.001894,-0.003243,...,-0.001894,-0.003046,-0.001894,-0.002681,-0.001894,-0.001894,-0.003430,0.053706,0.049598,0.019837
3,-0.003538,-0.003510,-0.002679,1.000000,-0.006467,-0.003647,-0.002679,-0.003379,-0.002679,-0.004587,...,-0.002679,-0.004308,-0.002679,-0.003792,-0.002679,-0.002679,-0.004852,-0.030581,0.053186,0.028057
4,-0.006039,-0.005990,-0.004572,-0.006467,1.000000,-0.006225,-0.004572,-0.005767,-0.004572,-0.007828,...,-0.004572,-0.007352,-0.004572,-0.006471,-0.004572,-0.004572,-0.008280,0.025340,-0.031661,-0.007153
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1494,-0.002502,-0.002481,-0.001894,-0.002679,-0.004572,-0.002579,-0.001894,-0.002389,-0.001894,-0.003243,...,-0.001894,-0.003046,-0.001894,-0.002681,-0.001894,1.000000,-0.003430,-0.011232,-0.026155,0.019837
1495,-0.004531,-0.004494,-0.003430,-0.004852,-0.008280,-0.004670,-0.003430,-0.004327,-0.003430,-0.005873,...,-0.003430,-0.005516,-0.003430,-0.004855,-0.003430,-0.003430,1.000000,-0.029703,-0.010814,0.035926
num_authors,-0.013733,-0.022102,0.053706,-0.030581,0.025340,0.022544,-0.011232,-0.034760,-0.011232,0.017008,...,-0.032878,0.036841,-0.032878,-0.046536,-0.000409,-0.011232,-0.029703,1.000000,-0.091366,0.334389
lapsed_day,0.023160,-0.032719,0.049598,0.053186,-0.031661,-0.045887,-0.012706,-0.039014,-0.017132,-0.030861,...,-0.038922,0.028329,0.034618,0.030122,-0.007600,-0.026155,-0.010814,-0.091366,1.000000,0.197407


ModuleNotFoundError: No module named 'feature_selector'

In [None]:
fs = FeatureSelector(data = ds, labels = train_lab

In [55]:
fs.identify_collinear(correlation_threshold = 0.98)

AttributeError: 'DataFrame' object has no attribute 'identify_collinear'

In [None]:
ds.drop('Passed',axis=1,inplace=True)

In [42]:
scaler = StandardScaler()
ds_scaled = scaler.fit_transform(ds)

In [43]:
X_train, X_test, y_train, y_test = train_test_split(ds_scaled, df_merged['Passed'], test_size = 0.25, random_state=43)
print(f"Our train set has {X_train.shape[0]:,} data points whilst our test set has {X_test.shape[0]:,} data points.")
print(f"\n{y_test.sum()/X_test.shape[0]:.2%} from the test set are actual positives.")

Our train set has 396 data points whilst our test set has 133 data points.

80.45% from the test set are actual positives.


In [44]:
# Under sample the majority target variable, so that we get about equal number of observations from both classes.
rus = RandomUnderSampler(random_state=42)
nx_train, ny_train = rus.fit_resample(X_train, y_train)
print(f"Our new train set has {ny_train.shape[0]} data points, {np.sum(ny_train)} of which are actual positives.")

Our new train set has 130 data points, 65 of which are actual positives.


In [49]:
svd = TruncatedSVD(n_components=100, algorithm='arpack', random_state = 42)
nx_train_svd = svd.fit_transform(nx_train)
x_test_svd = svd.transform(X_test)

In [50]:
svc = SVC(gamma='auto', kernel='linear', random_state = 42)
svc.fit(nx_train_svd, ny_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [51]:
predicted = svc.predict(x_test_svd)

In [52]:
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

       False       0.30      0.54      0.39        26
        True       0.86      0.70      0.77       107

    accuracy                           0.67       133
   macro avg       0.58      0.62      0.58       133
weighted avg       0.75      0.67      0.70       133

