# Pre-processing Bills' Text Data

## Import Python Packages

In [1]:
# For loading, manipulating dataframe.
import numpy as np
import pandas as pd
import sqlite3

# For Text Preprocessing
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer

# For visualization
from wordcloud import WordCloud
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns


# ML Modeling packages
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler 
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection as ms
import sklearn.metrics
from sklearn.metrics import classification_report

# Hide Warning messages
import warnings
warnings.filterwarnings('ignore')

## Load Philippines' Bills Data

In [2]:
# Read the entire senateBills table into a DataFrame indexed by bill_id,
# parsing the two date columns as datetimes.
con = sqlite3.connect("phBills.db")
try:
    df = pd.read_sql_query(
        "SELECT * from senateBills",
        con,
        index_col='bill_id',
        parse_dates=['date_filed', 'date_lastUpdate'],
    )
finally:
    # Always release the database handle, even when the query raises.
    con.close()

In [3]:
# Target label: a bill counts as passed when its `ra` field is non-empty.
df['Passed'] = df['ra'] != ''
df['long_title'] = df['long_title'].str.lower()

# Keep every congress except the 18th. The explicit .copy() makes this an independent
# frame, so the column assignments in later cells modify it reliably instead of
# writing to a temporary slice of `df`.
df_13_17_all = df[df['congress'] != 18].copy()

df_num_rows, df_num_col = df.shape
print(f"There are {df_num_rows} records and {df_num_col} features from our original data frame.")

# Share of passed bills among the retained congresses (mean of the boolean label).
bill_passage_rate = df_13_17_all['Passed'].mean()
print(f"From 13th Congress to 17th Congress, the PH Bill Passage Rate is {(bill_passage_rate*100):.2f}%")

There are 16284 records and 15 features on our original data frame.
From 13th Congress to 17th Congress, the PH Bill Passage Rate is 2.89%


## Text Preprocessing

### Tokenizer

In [4]:
# Replace each lower-cased title string with its list of word tokens.
# Note: this overwrites `long_title` in place, so the cell is meant to run once per kernel.
tokenized_titles = df_13_17_all['long_title'].apply(word_tokenize)
df_13_17_all['long_title'] = tokenized_titles

### Lemmatizer

In [5]:
# Single WordNet lemmatizer instance, reused for every token in the next cell.
lemmatizer = WordNetLemmatizer()

In [6]:
# Lemmatize every token of a title and glue the results back into one
# space-separated string, which is the input format the vectorizer expects.
df_13_17_all['long_title_lem'] = df_13_17_all['long_title'].apply(
    lambda tokens: ' '.join([lemmatizer.lemmatize(token) for token in tokens])
)

## Machine Learning Modeling

### Preparing our Data Sets

#### Vectorizing Train Text Feature

In [7]:
vectorizer_model = TfidfVectorizer(min_df=10, max_df=0.90)
x_matrix = vectorizer_model.fit_transform(df_13_17_all['long_title_lem'].values.astype('U'))

#### Splitting Data Into Train & Test Sets

In [8]:
X_train, X_test, y_train, y_test = train_test_split(x_matrix, df_13_17_all['Passed'], test_size = 0.25, random_state=43)
print(f"Our train set has {X_train.shape[0]:,} data points whilst our test set has {X_test.shape[0]:,} data points.")

Our train set has 11,348 data points whilst our test set has 3,783 data points.


#### Sampling Technique

In [9]:
# Balance the two classes by randomly discarding majority-class rows.
# Only the training split is resampled; the test split is left untouched.
rus = RandomUnderSampler(random_state=42)
nx_train, ny_train = rus.fit_resample(X_train, y_train)

resampled_total = ny_train.shape[0]
resampled_positives = np.sum(ny_train)
print(f"Our new train set has {resampled_total} data points, {resampled_positives} of which are actual positives.")

Our new train set has 644 data points, 322 of which are actual positives.


#### Truncated SVD to Reduce Dimension

In [10]:
# Reduce the sparse TF-IDF features to 400 components. The decomposition is learned
# from the undersampled training matrix and then applied as-is to the test matrix.
svd = TruncatedSVD(
    n_components=400,
    algorithm='arpack',
    random_state=42,
)
nx_train_svd = svd.fit_transform(nx_train)
x_test_svd = svd.transform(X_test)

### Using SVC

In [11]:
# Linear-kernel support vector classifier.
# NOTE(review): per the scikit-learn docs `gamma` applies to the rbf/poly/sigmoid kernels,
# so gamma='auto' presumably has no effect with kernel='linear' — confirm before tuning it.
svc = SVC(gamma='auto', kernel='linear', random_state = 42)

#### Fit Model Using Train Set

In [12]:
# Train on the undersampled, SVD-reduced training features and their labels.
svc.fit(nx_train_svd, ny_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

#### Predictions Using New Model

##### On Train Set

In [13]:
# Predictions on the same (undersampled) rows the model was fitted on; used for the train-set report.
predicted_train = svc.predict(nx_train_svd)

##### On Test Set

In [14]:
# Predictions on the SVD-reduced test split, which was not resampled.
predicted = svc.predict(x_test_svd)

#### Model Evaluation

##### On Train Set

In [15]:
# Per-class precision/recall/F1 on the balanced training rows.
print(classification_report(ny_train, predicted_train))

              precision    recall  f1-score   support

       False       0.85      0.85      0.85       322
        True       0.85      0.85      0.85       322

    accuracy                           0.85       644
   macro avg       0.85      0.85      0.85       644
weighted avg       0.85      0.85      0.85       644



##### On Test Set

In [16]:
# Per-class precision/recall/F1 on the test split; `predicted` holds the SVC test predictions here.
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

       False       0.98      0.66      0.79      3667
        True       0.05      0.57      0.09       116

    accuracy                           0.66      3783
   macro avg       0.52      0.61      0.44      3783
weighted avg       0.95      0.66      0.77      3783



### Using Decision Tree Classifier

In [17]:
# Decision tree with library-default hyperparameters, seeded with random_state=42.
tree_model = DecisionTreeClassifier(random_state=42)

#### Fit Model Using Train Set

In [18]:
# Train on the same undersampled, SVD-reduced features used for the SVC.
tree_model.fit(nx_train_svd, ny_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

#### Predictions Using New Model

In [19]:
# Test-split predictions from the decision tree.
# Note: this rebinds `predicted`, replacing the SVC test predictions from the earlier cell.
predicted = tree_model.predict(x_test_svd)

#### Model Evaluation

In [20]:
# Per-class precision/recall/F1 on the test split; `predicted` holds the decision-tree predictions here.
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

       False       0.98      0.63      0.76      3667
        True       0.05      0.58      0.09       116

    accuracy                           0.63      3783
   macro avg       0.51      0.60      0.43      3783
weighted avg       0.95      0.63      0.74      3783



### Using Random Forest Classifier

In [21]:
# Random forest of 100 trees, each capped at depth 5, seeded with random_state=42.
rf_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)

#### Fit Model Using Train Set

In [22]:
# Train on the same undersampled, SVD-reduced features used for the other models.
rf_model.fit(nx_train_svd, ny_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

#### Predictions Using New Model

In [23]:
# Test-split predictions from the random forest.
# Note: this rebinds `predicted`, replacing the decision-tree predictions from the earlier cell.
predicted = rf_model.predict(x_test_svd)

#### Model Evaluation

In [24]:
# Per-class precision/recall/F1 on the test split; `predicted` holds the random-forest predictions here.
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

       False       0.98      0.38      0.55      3667
        True       0.04      0.78      0.07       116

    accuracy                           0.39      3783
   macro avg       0.51      0.58      0.31      3783
weighted avg       0.95      0.39      0.54      3783

