# Modeling – Passed and Archived Bills Only Using 'Party' Feature

## Importing Python Packages

In [1]:
# For loading, manipulating dataframe.
import pandas as pd
import sqlite3, datetime, sklearn

# Visualizations
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

# Text Preprocessing
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

# Sampling
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler 

# Vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Feature Processing 
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

# Modeling
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

# Hide Warning messages
import warnings
warnings.filterwarnings('ignore')

In [2]:
df_congress = pd.read_csv('congress_data_set.csv',index_col=0)

In [3]:
df_congress.shape

(15078, 22)

In [4]:
df_congress.head(2)

Unnamed: 0,bill_id,num,congress,long_title,date_filed,scope,status,author,date_lastUpdate,passed,...,Bloc,Years of Service,num_authors,delta_days,upper,mon,quarter,scope_national,majority_bloc,len_desc
0,17SBN-2235,SBN-2235,17,an act establishing the fiscal regime for the ...,2019-05-28,National,"Pending Second Reading, Special Order","Drilon, Franklin M., Recto, Ralph G., Sotto II...",2019-05-28,False,...,Minority,24.0,4,0.0,False,5,2.0,True,False,61
1,17SBN-2234,SBN-2234,17,an act authorizing the sale of certain parcels...,2019-05-27,National,Approved by the President of the Philippines,"Recto, Ralph G., Sotto III, Vicente C., Escude...",2019-06-30,True,...,Majority,18.0,3,34.0,False,5,2.0,True,True,300


In [5]:
# df_congress=df_congress[(df_congress.status=='Sent to the Archives') | (df_congress.passed==True)]
print(f"df_congress Data Points: {df_congress.shape[0]:,}")

df_congress Data Points: 15,078


In [6]:
df_congress.columns

Index(['bill_id', 'num', 'congress', 'long_title', 'date_filed', 'scope',
       'status', 'author', 'date_lastUpdate', 'passed',
       'Full Name Primary Author', 'Party', 'Bloc', 'Years of Service',
       'num_authors', 'delta_days', 'upper', 'mon', 'quarter',
       'scope_national', 'majority_bloc', 'len_desc'],
      dtype='object')

In [7]:
cols_of_interest = ['Party','Years of Service','num_authors','delta_days','upper', 'mon', 'quarter',
       'scope_national', 'majority_bloc', 'len_desc','passed']
df_tocorr = df_congress[cols_of_interest]

In [8]:
df_tocorr.shape

(15078, 11)

In [9]:
columnsToEncode=df_tocorr.select_dtypes(include=[object]).columns
df_tocorr = pd.get_dummies(df_tocorr, columns=columnsToEncode, drop_first=True)

In [10]:
df_tocorr.shape

(15078, 24)

In [11]:
pd.options.display.max_columns = None

In [12]:
df_tocorr.corr()

Unnamed: 0,Years of Service,num_authors,delta_days,upper,mon,quarter,scope_national,majority_bloc,len_desc,passed,Party_Bagumbayan-VNP,Party_Independent,Party_LDP,Party_Lakas,Party_Lakas-CMD,Party_Lakas-Kampi-CMD,Party_Liberal,Party_NPC,Party_Nacionalista,Party_PDP-Laban,Party_PDP–Laban,Party_PMP,Party_PRP,Party_UNA
Years of Service,1.0,0.061176,0.050841,0.051702,-0.025919,0.005906,0.100215,0.045473,-0.122234,0.047726,-0.06031,-0.20845,-0.04221,-0.029496,-0.083944,-0.145623,-0.093199,0.143544,-0.117751,0.140967,-0.024637,-0.188851,0.556195,-0.062415
num_authors,0.061176,1.0,0.115971,0.088836,-0.029448,-0.05148,0.001802,0.035937,0.045183,0.513843,0.034861,-0.018207,0.019813,-0.01656,0.014943,-0.006164,0.069302,0.043443,0.031089,-0.018218,0.008902,-0.018955,-0.073148,0.002358
delta_days,0.050841,0.115971,1.0,0.906815,0.014041,0.020326,-0.023326,0.033741,0.071377,0.22522,0.016334,0.013784,0.027818,-0.007884,0.004537,0.004736,0.036637,0.037379,0.05284,0.002106,0.015824,-0.015437,-0.103853,-0.022736
upper,0.051702,0.088836,0.906815,1.0,0.012563,0.014753,-0.011375,0.02824,0.062537,0.187966,0.013875,0.018845,0.018887,-0.007798,0.008354,0.000417,0.034312,0.036782,0.044603,-0.012029,0.013991,-0.01649,-0.090336,-0.00967
mon,-0.025919,-0.029448,0.014041,0.012563,1.0,0.940177,-0.107234,0.033186,0.020488,0.006364,-0.015743,0.027458,-0.037882,-0.020462,0.011969,0.002547,-0.023047,-0.000888,-0.008337,-0.053537,-0.055045,0.019026,0.047014,-0.002121
quarter,0.005906,-0.05148,0.020326,0.014753,0.940177,1.0,-0.085505,0.052842,0.002321,-0.015706,-0.026212,-0.012211,-0.04638,-0.000227,-0.018581,-0.001831,-0.030049,0.012601,0.04967,-0.11706,-0.060324,0.027388,0.077969,-0.015473
scope_national,0.100215,0.001802,-0.023326,-0.011375,-0.107234,-0.085505,1.0,-0.068267,-0.150979,-0.000251,-0.008296,-0.13573,0.002002,-0.002467,-0.026221,-0.028663,0.015063,0.001945,-0.004119,0.004571,-0.023674,0.064498,0.086782,0.019944
majority_bloc,0.045473,0.035937,0.033741,0.02824,0.033186,0.052842,-0.068267,1.0,0.048924,0.046367,0.05704,0.222837,0.057747,0.017906,0.105664,0.113565,-0.002651,0.184651,-0.100718,-0.208826,0.084239,-0.186846,-0.126782,-0.034047
len_desc,-0.122234,0.045183,0.071377,0.062537,0.020488,0.002321,-0.150979,0.048924,1.0,0.081711,-0.006845,0.04894,0.012716,0.017081,0.027925,0.016176,0.057267,0.039555,0.038458,0.025164,0.04677,0.042824,-0.243092,0.013602
passed,0.047726,0.513843,0.22522,0.187966,0.006364,-0.015706,-0.000251,0.046367,0.081711,1.0,0.014214,0.020719,0.031227,-0.013351,-0.013608,0.000646,0.034432,0.044413,0.026381,-0.018938,0.030826,-0.032079,-0.071327,-0.003395


In [13]:
X = df_tocorr.drop('passed',axis=1)

In [14]:
X.shape

(15078, 23)

In [15]:
y = df_tocorr['passed']

In [16]:
y.shape

(15078,)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=43)

In [18]:
print(f"Our new train set has {X_train.shape[0]} data points, {np.sum(y_train)} of which are actual positives.")

Our new train set has 11308 data points, 321 of which are actual positives.


In [19]:
svc = SVC(gamma='auto', kernel='linear', random_state = 42)
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [20]:
predicted = svc.predict(X_test)

In [21]:
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

       False       0.97      1.00      0.99      3653
        True       0.69      0.17      0.27       117

    accuracy                           0.97      3770
   macro avg       0.83      0.58      0.63      3770
weighted avg       0.97      0.97      0.96      3770

