# Exploratory Data Analysis on Philippine Bills from the 13th Congress to the 17th Congress

## Importing Python Packages

In [32]:
# For loading and manipulating dataframes.
import datetime
import re
import sqlite3

import numpy as np
import pandas as pd

# For Text Preprocessing
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

# Vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Visualizations
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

# Hide Warning messages
import warnings
warnings.filterwarnings('ignore')

In [33]:
# Load every row of the senateBills table, parsing the two date columns as datetimes.
# try/finally guarantees the connection is closed even when the query raises.
con = sqlite3.connect("phBills.db")
try:
    df = pd.read_sql_query(
        "SELECT * from senateBills",
        con,
        index_col=None,
        parse_dates=['date_filed', 'date_lastUpdate'],
    )
finally:
    con.close()

In [34]:
# Per-bill author metadata; its bill_id, Bloc and Party columns are merged into the bills later.
df_senators = pd.read_excel(
    io='Bills with Authors.xlsx',
    sheet_name='Sheet2',
)

In [35]:
# Flag a bill as passed when its `ra` field is non-empty.
# The plain comparison already yields booleans, so np.where(cond, True, False) is not
# needed and this cell no longer depends on numpy being in scope.
df['Passed'] = df['ra'] != ''
df['long_title'] = df['long_title'].str.lower()

# Keep every bill whose congress is not 18.
df_13_17_all = df[df['congress'] != 18]

df_num_rows, df_num_col = df.shape
print(f"There are {df_num_rows} records and {df_num_col} features from our original data frame.")

# Mean of the boolean mask == passed / (passed + not passed).
bill_passage_rate = (df_13_17_all['ra'] != '').mean()
print(f"From 13th Congress to 17th Congress, the PH Bill Passage Percentage is {(bill_passage_rate*100):.2f}%")

There are 16284 records and 16 features from our original data frame.
From 13th Congress to 17th Congress, the PH Bill Passage Percentage is 2.89%


In [36]:
# Attach Bloc and Party to each bill by bill_id; the left join keeps bills that have
# no matching row in df_senators.
df_merged = df_13_17_all.merge(
    df_senators[['bill_id', 'Bloc', 'Party']],
    on='bill_id',
    how='left',
)

In [37]:
# Number of passed bills per Party value.
df_merged.loc[df_merged['Passed'], 'Party'].value_counts()

Independent        97
Nacionalista       74
NPC                58
Liberal            58
PMP                40
LDP                34
PDP–Laban          16
PRP                15
Lakas-Kampi-CMD    12
Lakas               7
UNA                 7
PDP-Laban           6
Bagumbayan-VNP      6
Lakas-CMD           5
Akbayan             3
Name: Party, dtype: int64

In [38]:
# Drop withdrawn bills. .copy() makes df_merged an independent frame, so the column
# assignments in the following cells write to it directly rather than to a filtered
# slice of the merge result (which triggers SettingWithCopyWarning).
df_merged = df_merged[df_merged['status'] != 'Withdrawn'].copy()

In [39]:
# Author count derived from the commas in `author`: (commas + 1) / 2, cast to int.
# NOTE(review): presumably each author is written with exactly one internal comma — confirm.
comma_count = df_merged['author'].str.count(',')
df_merged['num_authors'] = ((comma_count + 1) / 2).astype('int')

In [40]:
# Elapsed time per bill: passed bills are measured from filing to their last update,
# every other bill from filing to the moment this cell runs.
# NOTE(review): the second branch depends on the run date, so values for non-passed
# bills change between runs.
filed_on = df_merged['date_filed']
df_merged['lapsed_day'] = np.where(
    df_merged['Passed'],
    df_merged['date_lastUpdate'] - filed_on,
    pd.to_datetime('today') - filed_on,
)

In [41]:
# Bills with a missing Bloc are assigned to 'Majority'.
# The result is assigned back to the column: fillna(inplace=True) on an attribute access
# operates on an intermediate object and leaves df_merged unchanged under pandas
# copy-on-write.
df_merged['Bloc'] = df_merged['Bloc'].fillna('Majority')

In [42]:
# Median lapsed_day among passed bills (despite the "ave" in the name, this is the median).
passed_ave = df_merged.loc[df_merged['Passed'], 'lapsed_day'].median()
# True where a bill's lapsed_day is strictly greater than that median.
df_merged['upper'] = df_merged['lapsed_day'] > passed_ave

In [43]:
# Median lapsed_day of passed bills.
df_merged.loc[df_merged['Passed'], 'lapsed_day'].median()

Timedelta('306 days 12:00:00')

In [44]:
# Mean lapsed_day of passed bills.
df_merged.loc[df_merged['Passed'], 'lapsed_day'].mean()

Timedelta('374 days 15:30:24.657534')

In [45]:
# First entry of the mode of lapsed_day among passed bills.
df_merged.loc[df_merged['Passed'], 'lapsed_day'].mode().iloc[0]

Timedelta('546 days 00:00:00')

In [46]:
# Calendar month (1-12) in which each bill was filed, taken with the vectorized
# datetime accessor instead of a per-row Python lambda.
df_merged['mon'] = df_merged['date_filed'].dt.month

In [47]:
# Number of passed bills per filing month.
df_merged.loc[df_merged['Passed'], 'mon'].value_counts()

11    70
5     53
9     44
12    40
1     40
6     34
2     34
7     33
10    30
3     30
8     26
4      4
Name: mon, dtype: int64

In [48]:
# Scope breakdown of passed bills (missing values counted as their own bucket).
passed_bills = df_merged[df_merged['Passed']]
print(passed_bills['scope'].value_counts(dropna=False))

# The ratio is (passed AND local) / (all passed), i.e. the share of passed bills whose
# scope is 'Local'. The message states exactly that; it previously read as the pass rate
# of local bills, which would require the count of all local bills as the denominator.
local_share_of_passed = (passed_bills['scope'] == 'Local').mean()
print(f"\nOnly {local_share_of_passed:.2%} of passed senate bills are local in scope.")

National    421
Local        17
Name: scope, dtype: int64

Only 3.88% of local senate bills are passed.


In [49]:
# Number of bills per Bloc, with missing values counted.
df_merged['Bloc'].value_counts(dropna=False)

Majority    9860
Minority    5218
Name: Bloc, dtype: int64

In [50]:
# Number of passed bills per Bloc, with missing values counted.
df_merged.loc[df_merged['Passed'], 'Bloc'].value_counts(dropna=False)

Majority    351
Minority     87
Name: Bloc, dtype: int64

In [51]:
# Boolean flag: True for bills whose Bloc is 'Majority'.
df_merged['majority'] = df_merged['Bloc'].eq('Majority')

In [53]:
cols_of_interest = ['num_authors', 'lapsed_day', 'upper', 'mon', 'majority', 'Passed']
# .copy() gives data_set its own data, so the conversion below is not a chained
# assignment on a slice of df_merged.
data_set = df_merged[cols_of_interest].copy()
# Timedelta -> integer nanoseconds so the column can take part in corr(). 'int64' is
# spelled out because the integer width behind 'int' is platform-dependent.
data_set['lapsed_day'] = data_set['lapsed_day'].astype('int64')

In [54]:
# Pairwise correlation matrix of the columns selected into data_set.
data_set.corr()

Unnamed: 0,num_authors,lapsed_day,upper,mon,majority,Passed
num_authors,1.0,-0.198671,-0.392653,-0.029448,0.042476,0.513843
lapsed_day,-0.198671,1.0,0.275344,0.008457,-0.149643,-0.320637
upper,-0.392653,0.275344,1.0,0.069507,-0.036618,-0.606577
mon,-0.029448,0.008457,0.069507,1.0,0.01135,0.006364
majority,0.042476,-0.149643,-0.036618,0.01135,1.0,0.053608
Passed,0.513843,-0.320637,-0.606577,0.006364,0.053608,1.0


In [55]:
# NOTE(review): disabled plot. `lapsed_day` is created on df_merged, not on df_13_17_all,
# so the column lookup below needs df_merged if this is ever re-enabled.
# plt.style.use('seaborn-dark')
# sns.kdeplot(df_13_17_all['lapsed_day'].astype('int'), shade=True, label='Number of Days from Filing to Passing of Law');

In [56]:
# Replace each title string with its list of tokens.
# This overwrites long_title, so the cell is not safe to run twice on the same frame.
df_merged['long_title'] = df_merged['long_title'].apply(word_tokenize)

In [57]:
# Replace each token list with a list of (token, POS tag) pairs.
df_merged['long_title'] = df_merged['long_title'].apply(pos_tag)

In [58]:
def _keep_n_j_tagged(tagged_tokens):
    """Return the (token, tag) pairs whose tag starts with 'N' or 'J'."""
    return [pair for pair in tagged_tokens if pair[1].startswith(('N', 'J'))]


df_merged['long_title'] = df_merged['long_title'].map(_keep_n_j_tagged)

In [59]:
# Extra stop words appended after NLTK's English list.
domain_stopwords = [
    'act', 'presidential', 'ii', 'eo', 'code', 'government', 'provide', 'purpose',
    'therefor', 'penal', 'thereof', 'revised', 'article', 'provision', 'amended',
    'therefore', 'ng', 'ra', 'b', 'san', 'sa', 'del', 'pd', 'decree', 'rano', 'fund',
    'program', 'national', 'law', 'republic', 'philippine', 'otherwise', 'section',
    'philippines', 'xxii', 'xviii', 'xiv', 'xiii', 'xii', 'wvcst', 'ix', 'iv',
]
stoppers = stopwords.words('english') + domain_stopwords

In [60]:
# Drop the POS tag and keep only tokens that are not stop words.
# Membership is tested against a set built once, instead of scanning the stop-word list
# for every token of every title.
# NOTE(review): this filter runs before lemmatization, so inflected forms of the listed
# words are not removed here — confirm that ordering is intended.
stopper_set = set(stoppers)
df_merged['long_title'] = df_merged['long_title'].map(
    lambda tags: [tag[0] for tag in tags if tag[0] not in stopper_set]
)

In [62]:
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Lemmatize every token with the lemmatizer's default settings."""
    return list(map(lemmatizer.lemmatize, tokens))


df_merged['long_title'] = df_merged['long_title'].map(_lemmatize_tokens)

In [63]:
def allWordChar(word):
    """Return True when `word` contains no digit characters (True for an empty string)."""
    for char in word:
        if char.isdigit():
            return False
    return True

In [64]:
# Keep only the tokens that contain no digit.
df_merged['long_title'] = df_merged['long_title'].map(
    lambda tokens: list(filter(allWordChar, tokens))
)

In [65]:
# Re-join each cleaned token list into one space-separated string.
df_merged['processed'] = df_merged['long_title'].map(' '.join)

In [66]:
# Fit a TF-IDF vocabulary on the cleaned titles and transform them into a document-term matrix.
# NOTE(review): the vectorizer is fitted on every row, including rows that later land in
# the test split.
tfidf_vectorizer = TfidfVectorizer()
tfidf_out = tfidf_vectorizer.fit_transform(df_merged['processed'])

In [67]:
# Rank vocabulary terms by their total TF-IDF weight across all documents.
# vocabulary_ maps each term to its column index in tfidf_out, so the index is used only
# to look up that column's summed weight. Previously the index itself was stored (and
# sorted on) under the 'Weight' column.
term_weights = np.asarray(tfidf_out.sum(axis=0)).ravel()
df_vect_words = pd.DataFrame(
    sorted(
        ((term, term_weights[col]) for term, col in tfidf_vectorizer.vocabulary_.items()),
        key=lambda kv: kv[1],
        reverse=True,
    ),
    columns=['Word', 'Weight'],
)

## EDA

In [68]:
# NOTE(review): disabled plot. df_vect_words is built with the columns 'Word' and 'Weight';
# the 'Count' column referenced below is not created in the visible cells.
# plt.style.use('seaborn-dark')
# plt.figure(figsize=(15,6))
# plt.bar(df_vect_words['Word'].head(30), df_vect_words['Count'].head(30))

## Modeling

In [69]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler 
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [70]:
# Dense copy of the TF-IDF matrix as a DataFrame (one integer-labelled column per term).
ds = pd.DataFrame(tfidf_out.toarray())

In [71]:
# Same column list as the one assigned before data_set was built; re-declared here unchanged.
cols_of_interest = ['num_authors','lapsed_day','upper','mon','majority','Passed']

In [72]:
# Append the selected tabular features to the TF-IDF columns. Values are copied
# positionally (as plain lists), so row order must match between ds and data_set.
for feature in ('num_authors', 'upper'):
    ds[feature] = data_set[feature].tolist()
# ds['mon']=data_set['mon'].tolist()
# ds['majority']=data_set['majority'].tolist()

In [73]:
# Scaler instance; it is fitted and applied to ds in the next cell.
scaler = StandardScaler()

In [74]:
# Standardize every feature column. A float array is passed instead of the DataFrame:
# ds mixes the integer labels of the TF-IDF columns with the string labels
# 'num_authors' / 'upper', and newer scikit-learn releases reject frames whose column
# labels are of mixed types.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics influence the training features — consider fitting on the
# training rows only.
ds_scaled = scaler.fit_transform(ds.to_numpy(dtype='float64'))

In [75]:
# Hold out 25% of the rows as the test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    ds_scaled,
    df_merged['Passed'],
    test_size=0.25,
    random_state=43,
)
n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Our train set has {n_train:,} data points whilst our test set has {n_test:,} data points.")
print(f"\n{y_test.sum()/n_test:.2%} from the test set are actual positives.")

Our train set has 11,308 data points whilst our test set has 3,770 data points.

3.10% from the test set are actual positives.


In [76]:
# Under sample the majority target variable, so that we get about equal number of observations from both classes.
rus = RandomUnderSampler(random_state=42)
nx_train, ny_train = rus.fit_resample(X_train, y_train)
print(f"Our new train set has {ny_train.shape[0]} data points, {np.sum(ny_train)} of which are actual positives.")

Our new train set has 642 data points, 321 of which are actual positives.


In [77]:
# Reduce the feature space to 20 components; the projection is learned on the
# under-sampled training rows and then applied unchanged to the test rows.
svd = TruncatedSVD(
    n_components=20,
    algorithm='arpack',
    random_state=42,
)
nx_train_svd = svd.fit_transform(nx_train)
x_test_svd = svd.transform(X_test)

In [78]:
# Linear-kernel support vector classifier trained on the SVD-projected training rows.
svc = SVC(
    kernel='linear',
    gamma='auto',
    random_state=42,
)
svc.fit(nx_train_svd, ny_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [79]:
# Predict labels for the SVD-projected test features.
predicted = svc.predict(x_test_svd)

In [80]:
# Per-class precision / recall / F1 of the predictions against the held-out labels.
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

       False       0.99      0.94      0.97      3653
        True       0.27      0.68      0.39       117

    accuracy                           0.93      3770
   macro avg       0.63      0.81      0.68      3770
weighted avg       0.97      0.93      0.95      3770

