In [4]:
import nltk
import re
import string
import json
from nltk import FreqDist
from nltk import sent_tokenize, word_tokenize
from nltk.stem.porter import PorterStemmer

# Load data
def load_txt(text):
    # open the file as read only
    file = open(text,'r')
    # read all text
    txt = file.read()
    # close the file
    file.close()
    return txt

dt = load_txt('marcosspeech.txt')
print(dt)

I have come to report to you on a nation transformed. Twelve months ago, I rendered to you and to the people an account of how we began to translate the Filipino dream of national greatness into reality. The central theme of that report was the achieving spirit that animated the vigorous partnership between the government and the people. Today, I ask you to witness with me the metamorphosis which the new achieving spirit of the Filipino people has brought about. The Crises of 1966 In 1966, when I assumed the stewardship of our government, all levels of our national life were wracked by traumatic crisis. With your help, we not only succeeded in overcoming those crises, we also began to stride forcefully and hopefully towards real progress. The Breakthrough of 1967 In 1967, the energies of the nation were spent on the construction of the physical supports of our economy and society. Our heroic efforts were rewarded by numerous breakthroughs, principally in food production, public works, 

In [5]:
len(dt)

192009

# Text Tokenization and Cleaning

* Split into Words
* Convert to lowercase.
* Remove punctuation from each token.
* Filter out remaining tokens that are not alphabetic.
* Filter out tokens that are stop words (English)

In [6]:
with open('engstopwords.json', 'r') as e:
    stopwords_eng = json.load(e)
    
def clean_txt(docs):
    # split into words
    speech_words = nltk.word_tokenize(docs)
    # convert to lower case
    lower_text = [w.lower() for w in speech_words]
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    stripped = [re_punc.sub('', w) for w in lower_text]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    words = [w for w in words if not w in  stopwords_eng]
    return words

tokens = clean_txt(dt)
print(tokens)

['report', 'nation', 'transformed', 'twelve', 'months', 'ago', 'rendered', 'people', 'account', 'began', 'translate', 'filipino', 'dream', 'national', 'greatness', 'reality', 'central', 'theme', 'report', 'achieving', 'spirit', 'animated', 'vigorous', 'partnership', 'government', 'people', 'today', 'witness', 'metamorphosis', 'achieving', 'spirit', 'filipino', 'people', 'brought', 'crises', 'assumed', 'stewardship', 'government', 'levels', 'national', 'life', 'wracked', 'traumatic', 'crisis', 'succeeded', 'overcoming', 'crises', 'began', 'stride', 'forcefully', 'real', 'progress', 'breakthrough', 'energies', 'nation', 'spent', 'construction', 'physical', 'supports', 'economy', 'society', 'heroic', 'efforts', 'rewarded', 'numerous', 'breakthroughs', 'principally', 'food', 'production', 'public', 'works', 'construction', 'education', 'year', 'expansion', 'expanded', 'scope', 'include', 'public', 'services', 'foreign', 'relations', 'domestic', 'foreign', 'commerce', 'industry', 'year', 'd

In [7]:
len(tokens)

13802

# Frequency of words

In [9]:
dist = FreqDist(tokens)
dist

FreqDist({'program': 133, 'development': 123, 'national': 113, 'government': 111, 'year': 95, 'country': 64, 'system': 58, 'projects': 55, 'administration': 54, 'public': 51, ...})

In [10]:
len(dist)

4149

# Stemming

In [12]:
porter = nltk.PorterStemmer()
Stem_words = [porter.stem(w) for w in tokens]
Stem_words

['report',
 'nation',
 'transform',
 'twelv',
 'month',
 'ago',
 'render',
 'peopl',
 'account',
 'began',
 'translat',
 'filipino',
 'dream',
 'nation',
 'great',
 'realiti',
 'central',
 'theme',
 'report',
 'achiev',
 'spirit',
 'anim',
 'vigor',
 'partnership',
 'govern',
 'peopl',
 'today',
 'wit',
 'metamorphosi',
 'achiev',
 'spirit',
 'filipino',
 'peopl',
 'brought',
 'crise',
 'assum',
 'stewardship',
 'govern',
 'level',
 'nation',
 'life',
 'wrack',
 'traumat',
 'crisi',
 'succeed',
 'overcom',
 'crise',
 'began',
 'stride',
 'forc',
 'real',
 'progress',
 'breakthrough',
 'energi',
 'nation',
 'spent',
 'construct',
 'physic',
 'support',
 'economi',
 'societi',
 'heroic',
 'effort',
 'reward',
 'numer',
 'breakthrough',
 'princip',
 'food',
 'product',
 'public',
 'work',
 'construct',
 'educ',
 'year',
 'expans',
 'expand',
 'scope',
 'includ',
 'public',
 'servic',
 'foreign',
 'relat',
 'domest',
 'foreign',
 'commerc',
 'industri',
 'year',
 'distinguish',
 'confirm',

# POS Tagging

In [11]:
nltk.pos_tag(tokens)

[('report', 'NN'),
 ('nation', 'NN'),
 ('transformed', 'VBD'),
 ('twelve', 'JJ'),
 ('months', 'NNS'),
 ('ago', 'RB'),
 ('rendered', 'VBD'),
 ('people', 'NNS'),
 ('account', 'VBP'),
 ('began', 'VBD'),
 ('translate', 'JJ'),
 ('filipino', 'NN'),
 ('dream', 'VBP'),
 ('national', 'JJ'),
 ('greatness', 'NN'),
 ('reality', 'NN'),
 ('central', 'JJ'),
 ('theme', 'JJ'),
 ('report', 'NN'),
 ('achieving', 'VBG'),
 ('spirit', 'NN'),
 ('animated', 'VBD'),
 ('vigorous', 'JJ'),
 ('partnership', 'NN'),
 ('government', 'NN'),
 ('people', 'NNS'),
 ('today', 'NN'),
 ('witness', 'JJ'),
 ('metamorphosis', 'NN'),
 ('achieving', 'VBG'),
 ('spirit', 'JJ'),
 ('filipino', 'NN'),
 ('people', 'NNS'),
 ('brought', 'VBD'),
 ('crises', 'NNS'),
 ('assumed', 'VBD'),
 ('stewardship', 'JJ'),
 ('government', 'NN'),
 ('levels', 'NNS'),
 ('national', 'JJ'),
 ('life', 'NN'),
 ('wracked', 'VBD'),
 ('traumatic', 'JJ'),
 ('crisis', 'NN'),
 ('succeeded', 'VBD'),
 ('overcoming', 'VBG'),
 ('crises', 'NNS'),
 ('began', 'VBD'),
 ('s