# Data Cleaning on Senate Bills from the 18th Congress

## Importing Python Packages

In [1]:
# For loading, manipulating dataframe.
import numpy as np
import pandas as pd
import sqlite3, datetime, re

# For Text Preprocessing
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

# Vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Visualizations
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

# Modeling
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Hide Warning messages
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load every row of the senateBills table from the local SQLite file,
# parsing the two date columns into datetimes while reading.
con = sqlite3.connect("phBills.db")
try:
    df_bills = pd.read_sql_query(
        "SELECT * from senateBills",
        con,
        index_col=None,
        parse_dates=['date_filed', 'date_lastUpdate'],
    )
finally:
    # Release the database handle even when the query raises.
    con.close()

In [3]:
# Per-bill author attributes live in a separate workbook; read its 'Sheet2' tab.
df_senators = pd.read_excel(
    io='Bills_18congress_w_authors.xlsx',
    sheet_name='Sheet2',
)

In [4]:
# Keep only the bills whose `congress` value is 18. The explicit copy makes the
# result an independent frame, so the column assignments below (and later
# cells that modify df_bills) never write into a slice of the full table.
df_bills = df_bills[df_bills['congress'] == 18].copy()

# A bill is flagged as passed when its `ra` field is anything other than the
# empty string. The comparison already yields booleans, so no np.where is needed.
# NOTE(review): a NULL/NaN `ra` also compares unequal to '' and would be
# flagged as passed — confirm the table never stores NULL in that column.
df_bills['passed'] = df_bills['ra'] != ''

# Lower-case the long titles.
df_bills['long_title'] = df_bills['long_title'].str.lower()

In [5]:
# Peek at the first two bills.
df_bills.iloc[:2]

Unnamed: 0,bill_id,num,link,congress,short_title,long_title,date_filed,scope,status,author,subject,pri_committee,date_lastUpdate,logs,ra,passed
0,18SBN-1153,SBN-1153,http://www.senate.gov.ph/lis/bill_res.aspx?con...,18,"AMENDING SEC. 65 OF R.A. 11260, GENERAL APPROP...",an act amending section 65 of republic act no....,2019-11-06,National,Pending in the Committee,"Angara, Sonny",General Appropriations Act (GAA),Finance,2019-11-06,11/6/2019\nIntroduced by Senator SONNY ANGARA;...,,False
1,18SBN-1152,SBN-1152,http://www.senate.gov.ph/lis/bill_res.aspx?con...,18,PROFESSIONAL FILIPINO ATHLETES INSURANCE BENEF...,an act providing insurance coverage to profess...,2019-11-06,National,Pending in the Committee,"Lapid, Manuel ""Lito"" M.",AthletesInsuranceGames and Amusements Board,Sports,2019-11-06,"11/6/2019\nIntroduced by Senator MANUEL ""LITO""...",,False


In [6]:
# Spot-check a single bill by its id.
df_bills.loc[df_bills['bill_id'] == '18SBN-1114']

Unnamed: 0,bill_id,num,link,congress,short_title,long_title,date_filed,scope,status,author,subject,pri_committee,date_lastUpdate,logs,ra,passed
39,18SBN-1114,SBN-1114,http://www.senate.gov.ph/lis/bill_res.aspx?con...,18,GREEN ENERGY EDUCATION ACT OF 2019,an act authorizing higher education curriculum...,2019-10-14,National,Withdrawn,"Revilla Jr., Ramon Bong",,,2019-10-28,10/14/2019\nIntroduced by Senator RAMON BONG R...,,False


In [7]:
# Same spot-check against the senators sheet.
df_senators.loc[df_senators['bill_id'] == '18SBN-1114']

Unnamed: 0,Index,bill_id,num,congress,scope,Status2,ra,# of Authors,Primary Auth S,Primary Auth F,...,Party,# of Terms,Term Starts,Term Ends,Bloc,City/municipality of registration,Congress starts,Filing Year,Senate Service start,Years of Service
39,39,18SBN-1114,SBN-1114,18,National,Others,,1,Revilla Jr,Ramon,...,Lakas,1,"June 30, 2019","June 30, 2025",Majority,Bacoor,2004,2019,2004,15


In [8]:
# How many bills sit in each status?
df_bills['status'].value_counts()

Pending in the Committee                            1112
Consolidated/Substituted in the Committee Report      27
Pending Second Reading, Special Order                  6
Withdrawn                                              4
Approved on  Second Reading, with Amendments           2
Pending Conference Committee                           1
Pending Second Reading, Ordinary Business              1
Name: status, dtype: int64

In [9]:
# Discard the columns that are not carried into the merged data set.
# Re-binding the name (instead of inplace=True) avoids mutating a frame that
# was produced by filtering another one.
df_bills = df_bills.drop(
    columns=['link', 'short_title', 'subject', 'pri_committee', 'ra', 'logs']
)

In [10]:
# Per column: does it contain at least one missing value?
df_bills.isnull().any()

bill_id            False
num                False
congress           False
long_title         False
date_filed         False
scope              False
status             False
author             False
date_lastUpdate    False
passed             False
dtype: bool

In [11]:
# Narrow the senators sheet to the join key plus the four author attributes used later.
df_senators = df_senators.loc[
    :,
    ['bill_id', 'Full Name Primary Author', 'Party', 'Bloc', 'Years of Service'],
]

In [12]:
# Peek at the first two rows of the trimmed senators frame.
df_senators.iloc[:2]

Unnamed: 0,bill_id,Full Name Primary Author,Party,Bloc,Years of Service
0,18SBN-1153,Eduardo Angara,LDP,Majority,15
1,18SBN-1152,Lito Lapid,NPC,Majority,15


In [13]:
# Re-check the sample bill after trimming the columns.
df_senators.loc[df_senators['bill_id'] == '18SBN-1114']

Unnamed: 0,bill_id,Full Name Primary Author,Party,Bloc,Years of Service
39,18SBN-1114,Ramon Revilla Jr,Lakas,Majority,15


In [14]:
# Row counts of both frames, printed with thousands separators.
print(f"Bills Data Points: {len(df_bills):,}\nSenators Data Points: {len(df_senators):,}")

Bills Data Points: 1,153
Senators Data Points: 1,153


In [15]:
# Re-check the sample bill after the column drop.
df_bills.loc[df_bills['bill_id'] == '18SBN-1114']

Unnamed: 0,bill_id,num,congress,long_title,date_filed,scope,status,author,date_lastUpdate,passed
39,18SBN-1114,SBN-1114,18,an act authorizing higher education curriculum...,2019-10-14,National,Withdrawn,"Revilla Jr., Ramon Bong",2019-10-28,False


In [16]:
# Left-join the author attributes onto the bills by bill_id, then report the
# resulting row count.
df_congress = df_bills.merge(df_senators, on='bill_id', how='left')
print(f"Congress DF Data Points: {len(df_congress):,}")

Congress DF Data Points: 1,153


In [17]:
# Spot-check the merged row for the sample bill.
df_congress.loc[df_congress['bill_id'] == '18SBN-1114']

Unnamed: 0,bill_id,num,congress,long_title,date_filed,scope,status,author,date_lastUpdate,passed,Full Name Primary Author,Party,Bloc,Years of Service
39,18SBN-1114,SBN-1114,18,an act authorizing higher education curriculum...,2019-10-14,National,Withdrawn,"Revilla Jr., Ramon Bong",2019-10-28,False,Ramon Revilla Jr,Lakas,Majority,15


In [None]:
# Exclude bills whose status is 'Withdrawn'. Later cells assign new columns
# and patch values on df_congress, so take an explicit copy rather than
# keeping a filtered slice of the merged frame.
df_congress = df_congress[df_congress['status'] != 'Withdrawn'].copy()

In [None]:
# Missing values per column before the manual patch.
df_congress.isnull().sum()

In [None]:
# Manually fill the author attributes for rows whose primary author is
# 'Mannuel Villar Jr'. The mask is built once and reused for all three columns.
# NOTE(review): the spelling must match the spreadsheet value exactly — confirm
# 'Mannuel' is how the name appears in the source sheet.
villar_rows = df_congress['Full Name Primary Author'] == 'Mannuel Villar Jr'

# Years of service = filing year minus 2001, computed only for the matching rows.
# NOTE(review): 2001 is presumably the year this senator's service began — confirm.
df_congress.loc[villar_rows, 'Years of Service'] = (
    df_congress.loc[villar_rows, 'date_filed'].dt.year - 2001
)
df_congress.loc[villar_rows, 'Party'] = 'Independent'
df_congress.loc[villar_rows, 'Bloc'] = 'Majority'

In [None]:
# Missing values per column after the manual patch.
df_congress.isnull().sum()

In [None]:
# Bills per party, keeping missing values as their own bucket.
df_congress['Party'].value_counts(dropna=False)

In [None]:
# Bills per bloc, keeping missing values as their own bucket.
df_congress['Bloc'].value_counts(dropna=False)

In [None]:
# Row count of the working frame, printed with thousands separators.
print(f"df_congress Data Points: {len(df_congress):,}")

In [None]:
# Estimate the number of authors from the comma count: (commas + 1) / 2,
# truncated to an integer.
# NOTE(review): this presumes every author is written "Last, First" and that
# authors are themselves comma-separated — verify against multi-author rows.
comma_count = df_congress['author'].str.count(',')
df_congress['num_authors'] = comma_count.add(1).div(2).astype('int')

In [None]:
# Time elapsed between filing and the most recent update.
df_congress['delta_days'] = df_congress['date_lastUpdate'].sub(df_congress['date_filed'])

In [None]:
# Express the filing-to-last-update gap as a number of whole days.
# astype('timedelta64[D]') is rejected by pandas 2.x (only s/ms/us/ns
# resolutions are supported), so use the .dt.days accessor instead. The value
# is recomputed from the two date columns so the cell can be re-run safely,
# and cast to float64 to match what the old astype call produced.
df_congress['delta_days'] = (
    (df_congress['date_lastUpdate'] - df_congress['date_filed'])
    .dt.days
    .astype('float64')
)

In [None]:
# Calendar month (1-12) in which each bill was filed, taken with the
# vectorised datetime accessor instead of a per-row Python lambda.
df_congress['mon'] = df_congress['date_filed'].dt.month

In [None]:
# Bucket the filing month into calendar quarters, one month range at a time.
df_congress.loc[df_congress['mon'].between(1, 3), 'quarter'] = 1
df_congress.loc[df_congress['mon'].between(4, 6), 'quarter'] = 2
df_congress.loc[df_congress['mon'].between(7, 9), 'quarter'] = 3
df_congress.loc[df_congress['mon'].between(10, 12), 'quarter'] = 4

In [None]:
# Filing-month distribution of the bills flagged as passed (unsorted).
df_congress.loc[df_congress['passed'].eq(True), 'mon'].value_counts(sort=False)

In [None]:
# Filing-quarter distribution of the bills flagged as passed (unsorted).
df_congress.loc[df_congress['passed'].eq(True), 'quarter'].value_counts(sort=False)

In [None]:
# Which scope values occur, and how often?
df_congress['scope'].value_counts()

In [None]:
# Boolean flag: True when the bill's scope is 'National'. The comparison
# already yields booleans, so wrapping it in np.where (numpy is not imported
# by this notebook) is unnecessary.
df_congress['scope_national'] = df_congress['scope'] == 'National'

In [None]:
# National vs. non-national bill counts.
df_congress['scope_national'].value_counts()

In [None]:
# Bloc breakdown of the bills flagged as passed, missing values included.
df_congress.loc[df_congress['passed'].eq(True), 'Bloc'].value_counts(dropna=False)

In [None]:
# Bloc breakdown of the bills not flagged as passed, missing values included.
df_congress.loc[df_congress['passed'].eq(False), 'Bloc'].value_counts(dropna=False)

In [None]:
# Boolean flag: True when the primary author's bloc is 'Majority' (a missing
# bloc compares as False). The comparison already yields booleans, so
# wrapping it in np.where (numpy is not imported by this notebook) is unnecessary.
df_congress['majority_bloc'] = df_congress['Bloc'] == 'Majority'

In [None]:
# Show the dtype of every column in the working frame.
df_congress.dtypes

In [None]:
# Character length of each bill's long title.
df_congress['len_desc'] = df_congress['long_title'].str.len()

In [None]:
# Peek at the first two rows of the finished frame.
df_congress.iloc[:2]

In [None]:
# Write the working frame (index included) to CSV in the current directory.
df_congress.to_csv(path_or_buf='congress18_data_set.csv')