# Text Processing – Long Titles from Philippine Senate Bills

In [35]:
# Hide warning messages.
# NOTE(review): filterwarnings('ignore') with no category silences every warning
# for the rest of the session, including deprecation and pandas copy warnings
# raised by later cells — confirm that this is intended.
import warnings
warnings.filterwarnings('ignore')

## Setting Up Pandas Options

In [36]:
# For loading, manipulating dataframe.
import pandas as pd
import numpy as np

In [37]:
# Show every column and row, and never truncate cell contents, so that the
# long bill titles are fully visible in the previews below.
# max_colwidth uses None ("no limit"); the legacy value -1 is deprecated and
# rejected by current pandas releases.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## Loading the Data

In [38]:
# Read the bill-level data set; the first CSV column is used as the row index.
df_congress = pd.read_csv(
    'congress_data_set.csv',
    index_col=0,
)
# Report the frame's dimensions, then preview the first record.
print(f"df_congress has {df_congress.shape[0]:,} rows by {df_congress.shape[1]} columns.")
df_congress.head(1)

df_congress has 15,078 rows by 22 columns.


Unnamed: 0,bill_id,num,congress,long_title,date_filed,scope,status,author,date_lastUpdate,passed,Full Name Primary Author,Party,Bloc,Years of Service,num_authors,delta_days,upper,mon,quarter,scope_national,majority_bloc,len_desc
0,17SBN-2235,SBN-2235,17,an act establishing the fiscal regime for the mining industry,2019-05-28,National,"Pending Second Reading, Special Order","Drilon, Franklin M., Recto, Ralph G., Sotto III, Vicente C., Angara, Juan Edgardo ""Sonny"" M.",2019-05-28,False,Franklin Drilon,Liberal,Minority,24.0,4,0.0,False,5,2.0,True,False,61


In [39]:
# Label each bill with one of three outcomes. Conditions are checked in order:
# a passed bill is "Passed"; otherwise a bill whose status is
# 'Sent to the Archives' is "Archived"; everything else is "Pending".
df_congress['bill_status'] = np.select(
    condlist=[
        df_congress['passed'] == True,
        df_congress['status'] == 'Sent to the Archives',
    ],
    choicelist=["Passed", "Archived"],
    default="Pending",
)

In [40]:
# Class balance of the derived label (missing values, if any, are counted too).
df_congress['bill_status'].value_counts(dropna=False)

Pending     14549
Passed      438  
Archived    91   
Name: bill_status, dtype: int64

## Cleaning Data

In [41]:
# Keep only the columns needed for text processing. The explicit .copy() makes
# df_billText an independent frame, so the column assignments in later cells
# modify it directly rather than a slice of df_congress (which pandas flags
# with SettingWithCopyWarning).
df_billText = df_congress[['bill_id','long_title', 'bill_status']].copy()
df_billText.head(20)

Unnamed: 0,bill_id,long_title,bill_status
0,17SBN-2235,an act establishing the fiscal regime for the mining industry,Pending
1,17SBN-2234,"an act authorizing the sale of certain parcels of land in barangay krus na ligas, quezon city by the university of the philippines to the quezon city government amending for the purpose republic act no. 9500, otherwise known as the university of the philippines charter of 2008 and for other purposes",Passed
2,17SBN-2233,"an act increasing the excise tax on tobacco products, the penalties for violations of provisions on articles subject to excise tax, and earmarking incremental tobacco excise tax for human resource development programs for health professionals, amending for this purpose sections 145, 164, 260, 262, 263, 265 and 288(c) of the national internal revenue code of 1997, as amended by republic act no. 10963, and for other purposes",Passed
3,17SBN-2232,"an act mandating the institutionalization, development, training, organization and administration of basic reserve officers training corps (rotc) in grades 11 and 12 in public and private educational institutions, further amending republic act no. 7077 , otherwise known as the citizen armed force or armed forces of the philippines reservist act , as amended by republic act no. 9163 , otherwise known as the national service training program and appropriating funds therefor",Pending
4,17SBN-2231,an act strengthening the al-amanah islamic bank,Pending
5,17SBN-2230,"an act designating the 3rd sunday of november annually as the philippine day of remembrance for road crash victims, survivors and families",Pending
6,17SBN-2229,"an act designating the third sunday of november as the philippine day of remembrance for road crash victims, survivors and families",Pending
7,17SBN-2228,"an act declaring the balanga wetland and nature park located in barangay tortugas, city of balanga, province of bataan as a responsible, community-based ecotourism zone and appropriating funds therefor",Passed
8,17SBN-2227,"an act amending republic act no. 7042, otherwise known as the foreign investments act of 1991, as amended by republic act no. 8179, and for other purposes",Pending
9,17SBN-2226,"an act allowing disclosure of deposits, repealing for the purpose the pertinent laws on secrecy of deposits",Pending


### Punctuation and Number Removal

In [42]:
import re, string

In [43]:
def clean_text_round1(text):
    '''First-pass normalisation of a bill title.

    Steps, applied in order:
      1. lowercase the text;
      2. drop anything enclosed in square brackets (brackets included);
      3. delete every character listed in ``string.punctuation``;
      4. delete every token that contains at least one digit;
      5. delete curly quotes and the ellipsis character;
      6. delete newline characters.

    Characters are deleted, not replaced by a space, so hyphenated terms are
    fused ("al-amanah" becomes "alamanah") and runs of spaces may remain
    where a token was removed.

    Parameters
    ----------
    text : str
        Raw title text.

    Returns
    -------
    str
        The cleaned text.
    '''
    text = text.lower()
    # Raw string literals keep the regex backslashes from being parsed as
    # (invalid) Python string escapes, which newer interpreters warn about.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = text.replace('\n', '')
    return text


# Alias kept for the cells that call ``round1``; a lambda wrapper adds nothing.
round1 = clean_text_round1

In [44]:
# Run the first cleaning pass over every title, then preview the result.
df_billText['long_title'] = df_billText['long_title'].map(round1)
df_billText.head(20)

Unnamed: 0,bill_id,long_title,bill_status
0,17SBN-2235,an act establishing the fiscal regime for the mining industry,Pending
1,17SBN-2234,an act authorizing the sale of certain parcels of land in barangay krus na ligas quezon city by the university of the philippines to the quezon city government amending for the purpose republic act no otherwise known as the university of the philippines charter of and for other purposes,Passed
2,17SBN-2233,an act increasing the excise tax on tobacco products the penalties for violations of provisions on articles subject to excise tax and earmarking incremental tobacco excise tax for human resource development programs for health professionals amending for this purpose sections and of the national internal revenue code of as amended by republic act no and for other purposes,Passed
3,17SBN-2232,an act mandating the institutionalization development training organization and administration of basic reserve officers training corps rotc in grades and in public and private educational institutions further amending republic act no otherwise known as the citizen armed force or armed forces of the philippines reservist act as amended by republic act no otherwise known as the national service training program and appropriating funds therefor,Pending
4,17SBN-2231,an act strengthening the alamanah islamic bank,Pending
5,17SBN-2230,an act designating the sunday of november annually as the philippine day of remembrance for road crash victims survivors and families,Pending
6,17SBN-2229,an act designating the third sunday of november as the philippine day of remembrance for road crash victims survivors and families,Pending
7,17SBN-2228,an act declaring the balanga wetland and nature park located in barangay tortugas city of balanga province of bataan as a responsible communitybased ecotourism zone and appropriating funds therefor,Passed
8,17SBN-2227,an act amending republic act no otherwise known as the foreign investments act of as amended by republic act no and for other purposes,Pending
9,17SBN-2226,an act allowing disclosure of deposits repealing for the purpose the pertinent laws on secrecy of deposits,Pending


In [45]:
# Persist the cleaned titles (still plain strings at this point) for reuse.
df_billText.to_pickle(path="clean_billtext.pkl")

### Tokenization of Words

In [46]:
# Text Preprocessing
from nltk import word_tokenize

In [47]:
# Split each cleaned title into a list of word tokens.
df_billText['long_title'] = df_billText['long_title'].apply(word_tokenize)
df_billText.head(10)

Unnamed: 0,bill_id,long_title,bill_status
0,17SBN-2235,"[an, act, establishing, the, fiscal, regime, for, the, mining, industry]",Pending
1,17SBN-2234,"[an, act, authorizing, the, sale, of, certain, parcels, of, land, in, barangay, krus, na, ligas, quezon, city, by, the, university, of, the, philippines, to, the, quezon, city, government, amending, for, the, purpose, republic, act, no, otherwise, known, as, the, university, of, the, philippines, charter, of, and, for, other, purposes]",Passed
2,17SBN-2233,"[an, act, increasing, the, excise, tax, on, tobacco, products, the, penalties, for, violations, of, provisions, on, articles, subject, to, excise, tax, and, earmarking, incremental, tobacco, excise, tax, for, human, resource, development, programs, for, health, professionals, amending, for, this, purpose, sections, and, of, the, national, internal, revenue, code, of, as, amended, by, republic, act, no, and, for, other, purposes]",Passed
3,17SBN-2232,"[an, act, mandating, the, institutionalization, development, training, organization, and, administration, of, basic, reserve, officers, training, corps, rotc, in, grades, and, in, public, and, private, educational, institutions, further, amending, republic, act, no, otherwise, known, as, the, citizen, armed, force, or, armed, forces, of, the, philippines, reservist, act, as, amended, by, republic, act, no, otherwise, known, as, the, national, service, training, program, and, appropriating, funds, therefor]",Pending
4,17SBN-2231,"[an, act, strengthening, the, alamanah, islamic, bank]",Pending
5,17SBN-2230,"[an, act, designating, the, sunday, of, november, annually, as, the, philippine, day, of, remembrance, for, road, crash, victims, survivors, and, families]",Pending
6,17SBN-2229,"[an, act, designating, the, third, sunday, of, november, as, the, philippine, day, of, remembrance, for, road, crash, victims, survivors, and, families]",Pending
7,17SBN-2228,"[an, act, declaring, the, balanga, wetland, and, nature, park, located, in, barangay, tortugas, city, of, balanga, province, of, bataan, as, a, responsible, communitybased, ecotourism, zone, and, appropriating, funds, therefor]",Passed
8,17SBN-2227,"[an, act, amending, republic, act, no, otherwise, known, as, the, foreign, investments, act, of, as, amended, by, republic, act, no, and, for, other, purposes]",Pending
9,17SBN-2226,"[an, act, allowing, disclosure, of, deposits, repealing, for, the, purpose, the, pertinent, laws, on, secrecy, of, deposits]",Pending


### Removal of Non-English Words

In [48]:
import nltk

In [49]:
# Vocabulary of the NLTK "words" corpus, stored as a set for membership tests.
eng_words = set(nltk.corpus.words.words())

# Keep only the tokens that appear in that vocabulary.
df_billText['long_title'] = df_billText['long_title'].apply(
    lambda tokens: [token for token in tokens if token in eng_words]
)
df_billText.head(20)

Unnamed: 0,bill_id,long_title,bill_status
0,17SBN-2235,"[an, act, the, fiscal, regime, for, the, mining, industry]",Pending
1,17SBN-2234,"[an, act, the, sale, of, certain, of, land, in, barangay, na, ligas, city, by, the, university, of, the, to, the, city, government, for, the, purpose, republic, act, no, otherwise, known, as, the, university, of, the, charter, of, and, for, other]",Passed
2,17SBN-2233,"[an, act, increasing, the, excise, tax, on, tobacco, the, for, of, on, subject, to, excise, tax, and, incremental, tobacco, excise, tax, for, human, resource, development, for, health, for, this, purpose, and, of, the, national, internal, revenue, code, of, as, by, republic, act, no, and, for, other]",Passed
3,17SBN-2232,"[an, act, the, institutionalization, development, training, organization, and, administration, of, basic, reserve, training, corps, in, and, in, public, and, private, educational, further, republic, act, no, otherwise, known, as, the, citizen, armed, force, or, armed, of, the, reservist, act, as, by, republic, act, no, otherwise, known, as, the, national, service, training, program, and, funds, therefor]",Pending
4,17SBN-2231,"[an, act, strengthening, the, bank]",Pending
5,17SBN-2230,"[an, act, the, of, annually, as, the, day, of, remembrance, for, road, crash, and]",Pending
6,17SBN-2229,"[an, act, the, third, of, as, the, day, of, remembrance, for, road, crash, and]",Pending
7,17SBN-2228,"[an, act, the, and, nature, park, in, barangay, city, of, province, of, bataan, as, a, responsible, zone, and, funds, therefor]",Passed
8,17SBN-2227,"[an, act, republic, act, no, otherwise, known, as, the, foreign, act, of, as, by, republic, act, no, and, for, other]",Pending
9,17SBN-2226,"[an, act, disclosure, of, for, the, purpose, the, pertinent, on, secrecy, of]",Pending


### Word Lemmatization

In [50]:
from nltk.stem import WordNetLemmatizer

In [51]:
# Reduce every token to its WordNet lemma (lemmatizer's default part of speech).
lemmatizer = WordNetLemmatizer()
df_billText['long_title'] = df_billText['long_title'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)
df_billText.head(20)

Unnamed: 0,bill_id,long_title,bill_status
0,17SBN-2235,"[an, act, the, fiscal, regime, for, the, mining, industry]",Pending
1,17SBN-2234,"[an, act, the, sale, of, certain, of, land, in, barangay, na, ligas, city, by, the, university, of, the, to, the, city, government, for, the, purpose, republic, act, no, otherwise, known, a, the, university, of, the, charter, of, and, for, other]",Passed
2,17SBN-2233,"[an, act, increasing, the, excise, tax, on, tobacco, the, for, of, on, subject, to, excise, tax, and, incremental, tobacco, excise, tax, for, human, resource, development, for, health, for, this, purpose, and, of, the, national, internal, revenue, code, of, a, by, republic, act, no, and, for, other]",Passed
3,17SBN-2232,"[an, act, the, institutionalization, development, training, organization, and, administration, of, basic, reserve, training, corp, in, and, in, public, and, private, educational, further, republic, act, no, otherwise, known, a, the, citizen, armed, force, or, armed, of, the, reservist, act, a, by, republic, act, no, otherwise, known, a, the, national, service, training, program, and, fund, therefor]",Pending
4,17SBN-2231,"[an, act, strengthening, the, bank]",Pending
5,17SBN-2230,"[an, act, the, of, annually, a, the, day, of, remembrance, for, road, crash, and]",Pending
6,17SBN-2229,"[an, act, the, third, of, a, the, day, of, remembrance, for, road, crash, and]",Pending
7,17SBN-2228,"[an, act, the, and, nature, park, in, barangay, city, of, province, of, bataan, a, a, responsible, zone, and, fund, therefor]",Passed
8,17SBN-2227,"[an, act, republic, act, no, otherwise, known, a, the, foreign, act, of, a, by, republic, act, no, and, for, other]",Pending
9,17SBN-2226,"[an, act, disclosure, of, for, the, purpose, the, pertinent, on, secrecy, of]",Pending


### Part of Speech Tagging

In [52]:
from nltk import pos_tag

In [53]:
# Tag each token list with parts of speech and keep only the (word, tag) pairs
# whose tag starts with N, J, R or V.
df_billText['long_title'] = df_billText['long_title'].apply(
    lambda tokens: [
        tagged
        for tagged in pos_tag(tokens)
        if tagged[1].startswith(('N', 'J', 'R', 'V'))
    ]
)
df_billText.head(20)

Unnamed: 0,bill_id,long_title,bill_status
0,17SBN-2235,"[(act, NN), (fiscal, JJ), (regime, NN), (mining, NN), (industry, NN)]",Pending
1,17SBN-2234,"[(act, NN), (sale, NN), (certain, JJ), (land, NN), (barangay, NN), (na, JJ), (ligas, NN), (city, NN), (university, NN), (city, NN), (government, NN), (purpose, JJ), (republic, JJ), (act, NN), (otherwise, RB), (known, VBN), (university, NN), (charter, NN), (other, JJ)]",Passed
2,17SBN-2233,"[(act, NN), (increasing, VBG), (excise, NN), (tax, NN), (tobacco, NN), (subject, NN), (excise, VB), (tax, NN), (incremental, JJ), (tobacco, NN), (excise, NN), (tax, NN), (human, JJ), (resource, NN), (development, NN), (health, NN), (purpose, NN), (national, JJ), (internal, JJ), (revenue, NN), (code, NN), (republic, JJ), (act, NN), (other, JJ)]",Passed
3,17SBN-2232,"[(act, NN), (institutionalization, NN), (development, NN), (training, NN), (organization, NN), (administration, NN), (basic, JJ), (reserve, NN), (training, VBG), (corp, NN), (public, JJ), (private, JJ), (educational, JJ), (further, RBR), (republic, JJ), (act, NN), (otherwise, RB), (known, VBN), (citizen, NN), (armed, VBD), (force, NN), (armed, VBN), (reservist, NN), (act, NN), (republic, JJ), (act, NN), (otherwise, RB), (known, VBN), (national, JJ), (service, NN), (training, VBG), (program, NN), (fund, NN), (therefor, NN)]",Pending
4,17SBN-2231,"[(act, NN), (strengthening, VBG), (bank, NN)]",Pending
5,17SBN-2230,"[(act, NN), (annually, RB), (day, NN), (remembrance, NN), (road, NN), (crash, NN)]",Pending
6,17SBN-2229,"[(act, NN), (third, JJ), (day, NN), (remembrance, NN), (road, NN), (crash, NN)]",Pending
7,17SBN-2228,"[(act, NN), (nature, NN), (park, NN), (barangay, JJ), (city, NN), (province, NN), (bataan, NN), (responsible, JJ), (zone, NN), (fund, NN), (therefor, NN)]",Passed
8,17SBN-2227,"[(act, NN), (republic, JJ), (act, NN), (otherwise, RB), (known, VBN), (foreign, JJ), (act, NN), (republic, JJ), (act, NN), (other, JJ)]",Pending
9,17SBN-2226,"[(act, JJ), (disclosure, NN), (purpose, NN), (pertinent, NN), (secrecy, NN)]",Pending


### Removal of Stop Words

In [54]:
from nltk.corpus import stopwords

In [55]:
# Corpus-specific terms to drop in addition to NLTK's English stop words.
# A few entries repeat (e.g. 'presidential', 'purpose', 'therefor', 'pia');
# that is harmless because the list is only used for membership tests.
useless = ['act','presidential','code','government','provide','purpose','therefor','penal','thereof','article',
           'provision','therefore','ra','b','san','sa','decree','fund','program','national','law','republic',
           'otherwise','section','ae','biri','han','hereof','hereon','mandate','met','ni','nineteen','ninety',
           'non','papa','par','para','pia','po','poblacion','pst','pup','purpose','pia','maria','mary','rosario',
           'sangley','sec','second','secondary','secondhand','seventeen','seventeenth','seventh','seventy','si',
           'sixteen','sixteenth','sixth','sora','st','sate','stateless','statement','sub','subject','subparagraph',
           'sur','taft','tala','talisay','tecnology','tecum','therefor','therefrom','thereon','thereto','thereunder',
           'therewith','thing','thirteen','thirteenth','thirty','thousand','ti','title','toa','ton','triangle','tuba',
           'tuber','twelfth','unit','usage','use','used','useful','user','utilize','vice','wanton','whomsoever','year',
           'yearly','yer','yam','wa','unto','unsought','type','tubig','tort','thank','th','tenth','sun','subtitle','solo',
           'set','session','red','rattan','president','presidential','political','policy','plenary','plebiscite','pili',
           'pilar','peter','peso','pedro','pascual','party','panglima','pandan','palay','pablo','ordinance','order','orderly',
           'official','officer','office','oath','nipa','ninth','naga','naa','murphy','municipality','municipal','multiple',
           'mount','mosser','morong','monthly','month','monte','middle','micro','mayor','martin','marikina','march','manila',
           'madrasah','luna','fe','da','ce','bos','bogo','amang','bataan','na','ligas']

# Start from NLTK's English stop words and append the custom terms in place.
stoppers = stopwords.words('english')
stoppers += useless

In [56]:
# Drop stop words and unwrap each (word, tag) pair back to the bare word.
# Membership is tested against a set built once from `stoppers`, rather than
# scanning the whole list for every token; the result is identical.
stop_set = set(stoppers)
df_billText['long_title'] = df_billText['long_title'].map(
    lambda tags: [tag[0] for tag in tags if tag[0] not in stop_set]
)
df_billText.head(20)

Unnamed: 0,bill_id,long_title,bill_status
0,17SBN-2235,"[fiscal, regime, mining, industry]",Pending
1,17SBN-2234,"[sale, certain, land, barangay, city, university, city, known, university, charter]",Passed
2,17SBN-2233,"[increasing, excise, tax, tobacco, excise, tax, incremental, tobacco, excise, tax, human, resource, development, health, internal, revenue]",Passed
3,17SBN-2232,"[institutionalization, development, training, organization, administration, basic, reserve, training, corp, public, private, educational, known, citizen, armed, force, armed, reservist, known, service, training]",Pending
4,17SBN-2231,"[strengthening, bank]",Pending
5,17SBN-2230,"[annually, day, remembrance, road, crash]",Pending
6,17SBN-2229,"[third, day, remembrance, road, crash]",Pending
7,17SBN-2228,"[nature, park, barangay, city, province, responsible, zone]",Passed
8,17SBN-2227,"[known, foreign]",Pending
9,17SBN-2226,"[disclosure, pertinent, secrecy]",Pending


In [57]:
# Titles left with no tokens after filtering are marked as missing.
is_empty = df_billText['long_title'].map(len) == 0
df_billText.loc[is_empty, 'long_title'] = np.nan
df_billText.head(20)

Unnamed: 0,bill_id,long_title,bill_status
0,17SBN-2235,"[fiscal, regime, mining, industry]",Pending
1,17SBN-2234,"[sale, certain, land, barangay, city, university, city, known, university, charter]",Passed
2,17SBN-2233,"[increasing, excise, tax, tobacco, excise, tax, incremental, tobacco, excise, tax, human, resource, development, health, internal, revenue]",Passed
3,17SBN-2232,"[institutionalization, development, training, organization, administration, basic, reserve, training, corp, public, private, educational, known, citizen, armed, force, armed, reservist, known, service, training]",Pending
4,17SBN-2231,"[strengthening, bank]",Pending
5,17SBN-2230,"[annually, day, remembrance, road, crash]",Pending
6,17SBN-2229,"[third, day, remembrance, road, crash]",Pending
7,17SBN-2228,"[nature, park, barangay, city, province, responsible, zone]",Passed
8,17SBN-2227,"[known, foreign]",Pending
9,17SBN-2226,"[disclosure, pertinent, secrecy]",Pending


In [58]:
# Number of bills whose title was emptied by the filtering steps.
df_billText['long_title'].isna().sum()

127

In [59]:
# Remove every row that has a missing value in any column, then check the size.
df_billText = df_billText.dropna(axis='index', how='any')
df_billText.shape

(14951, 3)

In [60]:
# Lemmatize the surviving tokens again.
# NOTE(review): the tokens already went through the same lemmatizer in the
# "Word Lemmatizer" step above, so this pass looks redundant — confirm.
df_billText['long_title'] = df_billText['long_title'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

In [61]:
# Collapse each token list back into one space-separated string.
df_billText['long_title'] = df_billText['long_title'].map(' '.join)
df_billText.head(2)

Unnamed: 0,bill_id,long_title,bill_status
0,17SBN-2235,fiscal regime mining industry,Pending
1,17SBN-2234,sale certain land barangay city university city known university charter,Passed


## Word Vectorizer

In [62]:
# Vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer

In [63]:
# Fit a TF-IDF vectorizer (with scikit-learn's built-in English stop-word list)
# on the processed titles and keep the resulting document-term matrix.
v = TfidfVectorizer(stop_words="english")
data_v = v.fit_transform(df_billText['long_title'])

In [64]:
# scikit-learn renamed get_feature_names() to get_feature_names_out() and later
# removed the old method; use whichever this installation provides.
if hasattr(v, 'get_feature_names_out'):
    feature_names = v.get_feature_names_out()
else:
    feature_names = v.get_feature_names()

# Dense document-term matrix: one row per bill (sharing df_billText's index)
# and one column per vocabulary term.
data_dtm = pd.DataFrame(data_v.toarray(), columns=feature_names, index=df_billText.index)

In [65]:
# Table of the vectorizer's vocabulary, ordered by descending value.
# NOTE(review): vocabulary_ maps each term to its column index in the matrix,
# so the 'Weight' column holds feature indices rather than TF-IDF weights —
# confirm whether idf_ was intended.
vocab_pairs = list(v.vocabulary_.items())
vocab_pairs.sort(key=lambda pair: pair[1], reverse=True)
df_vect_words = pd.DataFrame(vocab_pairs, columns=['Word', 'Weight'])

In [66]:
# Write the vocabulary table to disk.
df_vect_words.to_csv(path_or_buf='vectorizer_voc.csv')

In [67]:
# Pickle the document-term matrix for later use.
data_dtm.to_pickle(path="dtm.pkl")

In [68]:
# Export the fully processed titles together with their ids and status labels.
df_billText.to_csv(path_or_buf='billText_Processed.csv')