# Prediction Modeling – Using Topics and Contextual Data

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

### Load Text Data

In [2]:
# Read the pickled bill-text frame from the local data_files directory.
text_pickle_path = os.path.join('data_files', 'lt_processed.pkl')
df_text = pd.read_pickle(text_pickle_path)

In [3]:
# Sanity check: (rows, columns) of the text frame right after loading.
df_text.shape

(14786, 3)

In [4]:
# Preview the first two rows of the text frame.
df_text.head(2)

Unnamed: 0,bill_id,long_title,bill_status
0,17SBN-2235,"[fiscal, regime, mining, industry]",Pending
1,17SBN-2234,"[sale, certain, land, barangay, na, ligas, cit...",Passed


Drop Pending Senate Bills

In [5]:
# Count bills per status value before any rows are filtered out.
df_text.bill_status.value_counts()

Pending     14270
Passed        427
Archived       89
Name: bill_status, dtype: int64

In [6]:
# Pending bills have no final outcome yet, so remove them from df_text in place.
is_pending = df_text.bill_status == 'Pending'
df_text.drop(index=df_text.loc[is_pending].index, inplace=True)

Set Target Variable 'Pass'

In [7]:
# Target: True where the bill's status is exactly 'Passed', False otherwise.
df_text['Pass'] = df_text.bill_status.eq('Passed').values

In [8]:
# bill_status is now encoded in 'Pass', so the raw column is removed in place.
df_text.drop(columns=['bill_status'], inplace=True)

In [9]:
# Sanity check: (rows, columns) after dropping pending bills and swapping
# bill_status for the Pass target.
df_text.shape

(516, 3)

In [10]:
# Preview the filtered text frame with the new Pass column.
df_text.head()

Unnamed: 0,bill_id,long_title,Pass
1,17SBN-2234,"[sale, certain, land, barangay, na, ligas, cit...",True
2,17SBN-2233,"[excise, tax, tobacco, subject, tax, increment...",True
7,17SBN-2228,"[nature, park, barangay, city, province, bataa...",True
40,17SBN-2195,"[court, community, service, lieu, imprisonment...",True
47,17SBN-2188,"[bed, capacity, memorial, hospital, medical, c...",True


### Load Context Data

In [11]:
# Read the pickled context-feature frame from the local data_files directory.
context_pickle_path = os.path.join('data_files', 'encoded.pkl')
df_context = pd.read_pickle(context_pickle_path)

In [12]:
# df_text already carries the Pass target, so drop the copy stored in the
# context frame (in place), then show the resulting (rows, columns).
df_context.drop(columns='Pass', inplace=True)
df_context.shape

(529, 31)

### Merged Data: Text + Context

In [13]:
# Left join on bill_id: every row of df_text is kept and the context columns
# are attached where a matching bill_id exists in df_context.
df = df_text.merge(df_context, on='bill_id', how='left')

In [14]:
# Sanity check: (rows, columns) of the merged frame.
df.shape

(516, 33)

In [15]:
# Preview the first two rows of the merged text + context frame.
df.head(2)

Unnamed: 0,bill_id,long_title,Pass,Years of Service,num_authors,delta_days,scope_national,majority_bloc,len_desc,Party_Bagumbayan-VNP,...,month_December,month_February,month_January,month_July,month_June,month_March,month_May,month_November,month_October,month_September
0,17SBN-2234,"[sale, certain, land, barangay, na, ligas, cit...",True,18,3,34,True,True,300,0,...,0,0,0,0,0,0,1,0,0,0
1,17SBN-2233,"[excise, tax, tobacco, subject, tax, increment...",True,3,6,34,True,True,426,0,...,0,0,0,0,0,0,1,0,0,0


### Load Topic Modeling Results

In [16]:
# Load the pickled topic dictionary from the local data_files directory.
# It is indexed by the integer topic ids 0-9 further down; presumably each
# entry is that topic's keyword list -- confirm against the notebook that
# wrote topic_dict.pkl.
# The context manager closes the file handle deterministically; the previous
# pickle.load(open(...)) form left it open until garbage collection.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
topic_dict_path = os.path.join('data_files', 'topic_dict.pkl')
with open(topic_dict_path, 'rb') as topic_file:
    topic_dict = pickle.load(topic_file)

In [17]:
def topic_mapper(lst1, lst2):
    """Flag whether two collections have at least one element in common.

    Parameters
    ----------
    lst1, lst2 : iterable of hashable
        The collections to compare; duplicates and ordering are irrelevant.

    Returns
    -------
    int
        1 if any element appears in both collections, otherwise 0.
    """
    shared = set(lst1).intersection(lst2)
    if shared:
        return 1
    return 0

In [18]:
# Add one binary indicator column per topic (t_0 ... t_9): 1 when the bill's
# long_title shares at least one element with that topic's entry in
# topic_dict, else 0 (see topic_mapper).
# A loop replaces ten copy-pasted assignments so the topic count is stated
# once and the columns cannot drift out of sync.
N_TOPICS = 10
for topic_id in range(N_TOPICS):
    topic_terms = topic_dict[topic_id]
    # Bind topic_terms as a default argument so the lambda never depends on
    # the loop variable's later values.
    df[f't_{topic_id}'] = df.long_title.map(
        lambda title, terms=topic_terms: topic_mapper(title, terms)
    )

In [21]:
# Report, for each topic indicator column, how many bills were flagged.
for topic_id in range(10):
    flagged = df[f't_{topic_id}'].sum()
    print(f" N-Topic {topic_id} Obs: {flagged}")

 N-Topic 0 Obs: 261
 N-Topic 1 Obs: 227
 N-Topic 2 Obs: 77
 N-Topic 3 Obs: 221
 N-Topic 4 Obs: 259
 N-Topic 5 Obs: 59
 N-Topic 6 Obs: 256
 N-Topic 7 Obs: 226
 N-Topic 8 Obs: 37
 N-Topic 9 Obs: 249


### Combined Data: Merged Data + Topic

In [24]:
# The identifier and the title column are no longer needed once the topic
# indicators exist; drop both in place.
df.drop(columns=['bill_id', 'long_title'], inplace=True)

In [25]:
# Preview the combined frame: target, context features and topic indicators.
df.head()

Unnamed: 0,Pass,Years of Service,num_authors,delta_days,scope_national,majority_bloc,len_desc,Party_Bagumbayan-VNP,Party_Independent,Party_LDP,...,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,t_9
0,True,18,3,34,True,True,300,0,0,0,...,1,0,0,0,1,1,0,0,1,0
1,True,3,6,34,True,True,426,0,0,0,...,1,1,1,1,1,0,1,1,0,1
2,True,6,3,40,True,True,201,0,0,0,...,1,1,0,1,1,0,1,1,0,1
3,True,12,4,151,True,False,258,0,0,0,...,0,1,1,0,0,0,0,0,0,0
4,True,9,2,73,False,True,354,0,0,0,...,1,1,1,1,1,0,1,1,0,1


In [26]:
# Inspect the dtype of every column in the combined frame.
df.dtypes

Pass                      bool
Years of Service         int64
num_authors              int64
delta_days               int64
scope_national            bool
majority_bloc             bool
len_desc                 int64
Party_Bagumbayan-VNP     uint8
Party_Independent        uint8
Party_LDP                uint8
Party_Lakas              uint8
Party_Lakas-CMD          uint8
Party_Lakas-Kampi-CMD    uint8
Party_Liberal            uint8
Party_NPC                uint8
Party_Nacionalista       uint8
Party_PDP_Laban          uint8
Party_PMP                uint8
Party_PRP                uint8
Party_UNA                uint8
month_August             uint8
month_December           uint8
month_February           uint8
month_January            uint8
month_July               uint8
month_June               uint8
month_March              uint8
month_May                uint8
month_November           uint8
month_October            uint8
month_September          uint8
t_0                      int64
t_1     

## Word Vectorizer

In [None]:
# Fit a TF-IDF vectorizer (English stop words removed) on the long_title
# column and keep the resulting document-term matrix in tfidf_v.
# NOTE(review): df_billText is not defined in any cell visible above; on a
# fresh kernel this raises NameError unless it is created elsewhere -- confirm.
# NOTE(review): if long_title holds token lists (as df_text.long_title does
# above) rather than raw strings, the default analyzer will likely fail --
# verify the expected input type.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_v = tfidf.fit_transform(df_billText.long_title)

In [None]:
import pickle

In [None]:
# Persist the fitted vectorizer to tfidf.pkl in the working directory.
# The context manager flushes and closes the file deterministically; the
# previous pickle.dump(..., open(...)) form never closed the handle.
with open("tfidf.pkl", "wb") as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

In [None]:
# Build a dense document-term DataFrame: one row per document, one column per
# vocabulary term, indexed like df_billText.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when present and fall back for older versions.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
data_dtm = pd.DataFrame(tfidf_v.toarray(), columns=feature_names)
data_dtm.index = df_billText.index

In [None]:
# Tabulate the fitted vocabulary, ordered by its mapped value, largest first.
# NOTE(review): per the scikit-learn docs, vocabulary_ maps each term to its
# feature (column) index, so the 'Weight' column appears to hold indices
# rather than TF-IDF weights -- confirm before relying on it as a weight.
vocab_desc = sorted(tfidf.vocabulary_.items(),
                    key=lambda item: item[1],
                    reverse=True)
df_vect_words = pd.DataFrame(vocab_desc, columns=['Word', 'Weight'])

In [None]:
# Write the vocabulary table to vectorizer_voc.csv in the working directory.
df_vect_words.to_csv('vectorizer_voc.csv')

In [None]:
# Persist the document-term DataFrame to dtm.pkl (working directory) for later use.
data_dtm.to_pickle("dtm.pkl")

In [None]:
# Export df_billText to billText_Processed_n.csv in the working directory.
# NOTE(review): df_billText is not defined in any cell visible above -- confirm its origin.
df_billText.to_csv('billText_Processed_n.csv')