In [1]:
# HTML scraping and parsing
from urllib import request
from bs4 import BeautifulSoup

# RSS 
import feedparser

# NLP
import nltk as nlp

### Book file

In [2]:
# Plain-text edition of the book on Project Gutenberg
url = "http://www.gutenberg.org/files/2554/2554.txt"

# Fetch the file. The `with` block closes the HTTP connection as soon as the
# body has been read instead of leaving it open until garbage collection.
with request.urlopen(url) as response:
    # The payload arrives as bytes; decode it as UTF-8 to obtain a str
    raw = response.read().decode('utf8')

In [3]:
# peek at the first 20 characters of the downloaded text
raw[0:20]

'The Project Gutenber'

In [4]:
# split the raw string into word and punctuation tokens
tokens = nlp.tokenize.word_tokenize(raw)

In [5]:
# The download also carries details about the author, a summary, etc., so
# keep only the desired content. The two marker strings below were chosen by
# inspecting the text manually.
begin = raw.find("PART I")
end = raw.rfind("End of Project Gutenberg's Crime")

# str.find / str.rfind return -1 when the marker is missing. Used directly as
# a slice bound, -1 silently cuts the text in the wrong place (for example,
# re-running this cell on already-trimmed text would drop the last character
# each time), so fall back to the corresponding end of the string instead.
if begin == -1:
    begin = 0
if end == -1:
    end = len(raw)

raw = raw[begin:end]

### HTML

In [6]:
# page to scrape
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"

# Download the page and decode the bytes as UTF-8. The `with` block closes
# the connection once the body has been read.
with request.urlopen(url) as response:
    html = response.read().decode('utf8')

In [7]:
# parse the page with the html5lib parser and keep only its text as one string
raw = BeautifulSoup(html, features="html5lib").get_text()

# tokenization is left disabled for this example
# tokens = nlp.word_tokenize(raw)

### RSS 

In [8]:
# fetch the Language Log feed from its URL and parse it with feedparser
llog = feedparser.parse(
    "http://languagelog.ldc.upenn.edu/nll/?feed=atom"
)

In [9]:
# feed-level metadata is available on the parsed result; show the feed title
llog.feed.title

'Language Log'

In [10]:
# feed title again (repeats the lookup from the cell above)
llog["feed"]["title"]

'Language Log'

In [11]:
# pick a single post: the entry at index 2 of the feed
post = llog['entries'][2]

In [12]:
# display the selected feed entry in full
post

{'author': 'Mark Liberman',
 'author_detail': {'href': 'http://ling.upenn.edu/~myl',
  'name': 'Mark Liberman'},
 'authors': [{'href': 'http://ling.upenn.edu/~myl', 'name': 'Mark Liberman'}],
 'content': [{'base': 'http://languagelog.ldc.upenn.edu/nll/?p=33344',
   'language': 'en-US',
   'type': 'text/html',
   'value': '<div id="fb-root"></div>\n<p>&#8220;<a href="http://www.espn.com/nfl/story/_/id/19678652/supreme-court-gives-washington-redskins-boost-name-fight" target="_blank" rel="noopener">Supreme Court rules government can&#8217;t refuse disparaging trademarks</a>&#8220;, ESPN:</p>\n<p style="padding-left: 30px;"><span style="color: #000080;">The Supreme Court on Monday struck down part of a law that bans offensive trademarks in a ruling that is expected to help the Redskins in their legal fight over the team name. </span></p>\n<p style="padding-left: 30px;"><span style="color: #000080;">The justices ruled that the 71-year-old trademark law barring disparaging terms infringes f

In [13]:
# the text we want: the value of the post's first content element
content = post['content'][0]['value']

In [14]:
# use BeautifulSoup to strip the HTML tags from the post body
raw = BeautifulSoup(content, features='html5lib').get_text()

In [16]:
# split the cleaned post text into tokens
tokens = nlp.tokenize.word_tokenize(raw)

In [17]:
# display the token list produced from the post text
tokens

['“',
 'Supreme',
 'Court',
 'rules',
 'government',
 'can',
 '’',
 't',
 'refuse',
 'disparaging',
 'trademarks',
 '“',
 ',',
 'ESPN',
 ':',
 'The',
 'Supreme',
 'Court',
 'on',
 'Monday',
 'struck',
 'down',
 'part',
 'of',
 'a',
 'law',
 'that',
 'bans',
 'offensive',
 'trademarks',
 'in',
 'a',
 'ruling',
 'that',
 'is',
 'expected',
 'to',
 'help',
 'the',
 'Redskins',
 'in',
 'their',
 'legal',
 'fight',
 'over',
 'the',
 'team',
 'name',
 '.',
 'The',
 'justices',
 'ruled',
 'that',
 'the',
 '71-year-old',
 'trademark',
 'law',
 'barring',
 'disparaging',
 'terms',
 'infringes',
 'free',
 'speech',
 'rights',
 '.',
 'The',
 'ruling',
 'is',
 'a',
 'victory',
 'for',
 'the',
 'Asian-American',
 'rock',
 'band',
 'called',
 'the',
 'Slants',
 ',',
 'but',
 'the',
 'case',
 'was',
 'closely',
 'watched',
 'for',
 'the',
 'impact',
 'it',
 'would',
 'have',
 'on',
 'the',
 'separate',
 'dispute',
 'involving',
 'the',
 'Washington',
 'football',
 'team',
 '.',
 'The',
 'opinion',
 '

### Local Files

In [None]:
# Reading a local file needs nothing beyond plain Python; once the raw text is
# available, tokenization works exactly as in the examples above.

# `with` guarantees the file handle is closed when the block ends.
with open('aeneid.txt') as f:
    raw = f.read()

    # f.read() leaves the file position at end-of-file, so iterating over `f`
    # straight afterwards would yield no lines; rewind before looping.
    f.seek(0)
    for line in f:
        print(line.strip())

In [None]:
# display the text read from the local file
raw

In [None]:
# Plain Python needs .strip() on each line to drop the trailing newline when
# printing. Here nltk.data.find() is used to look the file up by resource name.
# NOTE(review): the lookup presumably only succeeds when 'aeneid.txt' is
# reachable through NLTK's data search path — confirm where the file lives.

path = nlp.data.find('aeneid.txt')

# NOTE: the 'rU' mode below was removed in Python 3.11; a plain open(path)
# should be used if this line is re-enabled.
#raw = open(path, 'rU').read()