# Prediction Model –  Using Text Features to Predict Bill's Passage

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
df_billText = pd.read_csv('billText_Processed_n.csv',index_col=0)
df_billText.head()

Unnamed: 0,bill_id,long_title,bill_status
0,17SBN-2235,fiscal regime mining industry,Pending
1,17SBN-2234,sale certain land barangay city university cit...,Passed
2,17SBN-2233,excise tax tobacco tax incremental tobacco exc...,Passed
3,17SBN-2232,institutionalization development training orga...,Pending
4,17SBN-2231,bank,Pending


In [3]:
df_billText.bill_status.value_counts()

Pending     14167
Passed        422
Archived       89
Name: bill_status, dtype: int64

In [4]:
df_billText = df_billText[df_billText.bill_status!='Pending']

In [5]:
df_billText['target']=np.where(df_billText.bill_status=='Passed',True,False)

In [6]:
df_billText.head()

Unnamed: 0,bill_id,long_title,bill_status,target
1,17SBN-2234,sale certain land barangay city university cit...,Passed,True
2,17SBN-2233,excise tax tobacco tax incremental tobacco exc...,Passed,True
7,17SBN-2228,nature park barangay city province responsible...,Passed,True
40,17SBN-2195,court community service lieu imprisonment chap...,Passed,True
47,17SBN-2188,bed capacity memorial hospital medical center ...,Passed,True


In [7]:
df_billText.target.value_counts(dropna=False)

True     422
False     89
Name: target, dtype: int64

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
tfidf = TfidfVectorizer(stop_words="english")
tfidf_v = tfidf.fit_transform(df_billText.long_title)

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_v, df_billText.target, test_size = 0.25, random_state=43)
print(f"Our train set has {X_train.shape[0]:,} data points whilst our test set has {X_test.shape[0]:,} data points.")
print(f"\n{y_test.sum()/X_test.shape[0]:.2%} from the test set are actual positives.")

Our train set has 383 data points whilst our test set has 128 data points.

78.12% from the test set are actual positives.


In [12]:
print(f"\n{y_train.sum()/X_train.shape[0]:.2%} from the test set are actual positives.")


84.07% from the test set are actual positives.


In [13]:
from imblearn.under_sampling import RandomUnderSampler 

In [14]:
rus = RandomUnderSampler(random_state=42)
nx, ny = rus.fit_resample(X_train, y_train)
print(f"Our new data set has {ny.shape[0]} data points.")

Our new data set has 122 data points.


In [15]:
from sklearn.decomposition import TruncatedSVD

In [16]:
svd = TruncatedSVD(n_components=20, algorithm='arpack', random_state = 42)
nx_train_svd = svd.fit_transform(nx)
x_test_svd = svd.transform(X_test)

In [17]:
from sklearn.svm import SVC

In [18]:
svc = SVC(gamma='auto', kernel='linear', random_state = 42)
svc.fit(nx_train_svd, ny)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [19]:
predicted = svc.predict(x_test_svd)

In [20]:
from sklearn.metrics import classification_report

In [21]:
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

       False       0.27      0.89      0.41        28
        True       0.91      0.32      0.47       100

    accuracy                           0.45       128
   macro avg       0.59      0.61      0.44       128
weighted avg       0.77      0.45      0.46       128

