## Chapter 3. Processing Raw Text

In [10]:
%pip install feedparser




In [11]:
#required libraries
import nltk, re, pprint
from nltk import word_tokenize #a word tokenizer
from urllib import request #read in file from url
from bs4 import BeautifulSoup #removing html from text
import feedparser #access content of a blog

In [12]:
# Access "crime punishment" text from Project Gutenberg
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
response = request.urlopen(url)
raw_text = response.read().decode('utf8')
len(raw_text) #our text has 1,176,812 characters

1176812

*Tokenization* is the process of splitting a string of text into a list of words and throwing away some characters such as punctuations. 

In [13]:
# Tokenizing our text
tokens = word_tokenize(raw_text)
print("the raw text:", type(raw_text))
print("after tokenizing the raw text:", type(tokens))
print(len(raw_text))

the raw text: <class 'str'>
after tokenizing the raw text: <class 'list'>
1176812


In [14]:
tokens[:5]

['\ufeffThe', 'Project', 'Gutenberg', 'eBook', 'of']

In [15]:
text = nltk.Text(tokens)
text.collocations() #pairs that usually occur together in the text


Katerina Ivanovna; Pyotr Petrovitch; Pulcheria Alexandrovna; Avdotya
Romanovna; Rodion Romanovitch; Marfa Petrovna; Sofya Semyonovna; old
woman; Project Gutenberg-tm; Porfiry Petrovitch; Amalia Ivanovna;
great deal; young man; Nikodim Fomitch; Project Gutenberg; Ilya
Petrovitch; Andrey Semyonovitch; Hay Market; Dmitri Prokofitch; Good
heavens


### Dealing with HTML

In [16]:
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = request.urlopen(url).read().decode("utf8")
html[:60]


'<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN'

In [17]:
raw = BeautifulSoup(html, 'html.parser').get_text()
token = word_tokenize(raw)
token = token[110:390]
text = nltk.Text(token)
text.concordance('gene') #searchin word 'gene' and surrounding context

Displaying 5 of 5 matches:
hey say too few people now carry the gene for blondes to last beyond the next 
blonde hair is caused by a recessive gene . In order for a child to have blond
 have blonde hair , it must have the gene on both sides of the family in the g
ere is a disadvantage of having that gene or by chance . They do n't disappear
des would disappear is if having the gene was a disadvantage and I do not thin


### Processing RSS Feeds

In [56]:
#importing libraries
from bs4 import BeautifulSoup
import feedparser

#read in blog feed
all_feed = feedparser.parse("https://eugenebaraka.github.io/feed.xml")
print("BLOG TITLE:", all_feed['feed']['title']) #blog title
print("NUMBER OF POSTS:", len(all_feed.entries), "posts") #number of posts so far

latest_post = all_feed.entries[:-1][0] #the latest post
print("LATEST POST'S TITLE:", latest_post.title) #title of my latest blog post

# get the post content, parse and tokenize it
post_content = latest_post.content[0].value 
raw_text = BeautifulSoup(post_content, "html.parser").get_text()
tokenized_text = word_tokenize(raw_text)

print("BEFORE TOKENIZATION:", raw_text[:50])
print("AFTER TOKENIZATION:", tokenized_text[:15])

BLOG TITLE: Eugene Baraka
NUMBER OF POSTS: 3 posts
LATEST POST'S TITLE: Understanding Population Variance and Sample Variance Formulas
BEFORE TOKENIZATION: Understanding Population Variance and Sample Varia
AFTER TOKENIZATION: ['Understanding', 'Population', 'Variance', 'and', 'Sample', 'Variance', 'Formulas', 'In', 'my', 'data', 'science', 'learning', 'journey', ',', 'I']


In [44]:
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
latest_post

{'title': 'Understanding Population Variance and Sample Variance Formulas',
 'title_detail': {'type': 'text/html',
  'language': None,
  'base': 'https://eugenebaraka.github.io/feed.xml',
  'value': 'Understanding Population Variance and Sample Variance Formulas'},
 'links': [{'href': 'https://eugenebaraka.github.io/blog/2022/07/03/variance-explained/',
   'rel': 'alternate',
   'type': 'text/html',
   'title': 'Understanding Population Variance and Sample Variance Formulas'}],
 'link': 'https://eugenebaraka.github.io/blog/2022/07/03/variance-explained/',
 'published': '2022-07-03T00:00:00+00:00',
 'published_parsed': time.struct_time(tm_year=2022, tm_mon=7, tm_mday=3, tm_hour=0, tm_min=0, tm_sec=0, tm_wday=6, tm_yday=184, tm_isdst=0),
 'updated': '2022-07-03T00:00:00+00:00',
 'updated_parsed': time.struct_time(tm_year=2022, tm_mon=7, tm_mday=3, tm_hour=0, tm_min=0, tm_sec=0, tm_wday=6, tm_yday=184, tm_isdst=0),
 'id': 'https://eugenebaraka.github.io/blog/2022/07/03/variance-explained'

In [8]:
wordlist = ['input1', 'output1', 'input2', 'output']
[w for w in wordlist if re.search(".*[0-9]+$", w)] #all strings ending with number


['input1', 'output1', 'input2']