In [None]:
### Processing Raw Text

## Let's try on some things first:
## convert text to lowercase - notice that you don't need any modules for this


input_str = 'The 5 biggest countries by population in 2017 are China, India, United States, Indonesia, and Brazil.'
input_str = input_str.lower()
print(input_str)

In [None]:
## let's remove numbers in a text

import re
input_str = 'Box A contains 3 red and 5 white balls, while Box B contains 4 red and 2 blue balls.'
result = re.sub(r'\d+', '', input_str)
print(result)

Box A contains  red and  white balls, while Box B contains  red and  blue balls.


In [None]:
## let's remove punctuation

## In simple terms, maketrans() method is a static method that creates a one to one mapping of a character to its translation/replacement.
## It creates a Unicode representation of each character for translation.
## This translation mapping is then used for replacing a character to its mapped character when used in translate() method.

import string
input_str = 'This &is [an] example? {of} string. with.? punctuation!!!!'
result = input_str.translate(str.maketrans('','', string.punctuation))
print(result)

This is an example of string with punctuation


In [None]:
## another example of the use of translate()

intab = "aeiou"
outtab = "12345"
trantab = str.maketrans(intab, outtab)

str = "this is string example....wow!!!"
print (str.translate(trantab))

th3s 3s str3ng 2x1mpl2....w4w!!!


In [None]:
## remove white spaces

input_str = ' \t a string example\t '
input_str = input_str.strip()
input_str

'a string example'

In [None]:
## remove stop words

import nltk
from nltk.corpus import stopwords

input_str = 'NLTK is a leading platform for building Python programs to work with human language data.'
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
tokens = word_tokenize(input_str)
result = [i for i in tokens if not i in stop_words]
print (result)

['NLTK', 'leading', 'platform', 'building', 'Python', 'programs', 'work', 'human', 'language', 'data', '.']


In [None]:
## remove stopwords without NLTK

import spacy ## use pip install or conda install if you don't already have it imported
import sklearn

from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
## stemming

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
stemmer= PorterStemmer()
input_str='There are several types of stemming algorithms.'
input_str=word_tokenize(input_str)
for word in input_str:
    print(stemmer.stem(word))

there
are
sever
type
of
stem
algorithm
.


In [None]:
## lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemmatizer=WordNetLemmatizer()
input_str= 'been had done languages cities mice'
input_str=word_tokenize(input_str)
for word in input_str:
    print(lemmatizer.lemmatize(word))

been
had
done
language
city
mouse


In [None]:
## parts of speech (POS)

import textblob ## use pip install or conda install if you don't already have it imported

input_str= 'Parts of speech examples: an article, to write, interesting, easily, and, of'
from textblob import TextBlob
result = TextBlob(input_str)
print(result.tags)

[('Parts', 'NNS'), ('of', 'IN'), ('speech', 'NN'), ('examples', 'NNS'), ('an', 'DT'), ('article', 'NN'), ('to', 'TO'), ('write', 'VB'), ('interesting', 'VBG'), ('easily', 'RB'), ('and', 'CC'), ('of', 'IN')]


In [None]:
## chunking (shallow parsing)

input_str= 'A black television and a white stove were bought for the new apartment of John.'
from textblob import TextBlob
result = TextBlob(input_str)
print(result.tags) ## run these separately

reg_exp = 'NP: {<DT>?<JJ>*<NN>}'
rp = nltk.RegexpParser(reg_exp)
result = rp.parse(result.tags)
print(result)

[('A', 'DT'), ('black', 'JJ'), ('television', 'NN'), ('and', 'CC'), ('a', 'DT'), ('white', 'JJ'), ('stove', 'NN'), ('were', 'VBD'), ('bought', 'VBN'), ('for', 'IN'), ('the', 'DT'), ('new', 'JJ'), ('apartment', 'NN'), ('of', 'IN'), ('John', 'NNP')]
(S
  (NP A/DT black/JJ television/NN)
  and/CC
  (NP a/DT white/JJ stove/NN)
  were/VBD
  bought/VBN
  for/IN
  (NP the/DT new/JJ apartment/NN)
  of/IN
  John/NNP)


In [None]:
## named entity recognition

from nltk import word_tokenize, pos_tag, ne_chunk
input_str = 'Bill works for Apple so he went to Boston for a conference.'
print (ne_chunk(pos_tag(word_tokenize(input_str))))

(S
  (PERSON Bill/NNP)
  works/VBZ
  for/IN
  Apple/NNP
  so/IN
  he/PRP
  went/VBD
  to/TO
  (GPE Boston/NNP)
  for/IN
  a/DT
  conference/NN
  ./.)


In [1]:
from __future__ import division ## in your textbook this is specified for python2 users only, but actually it is working with python3 as well
import nltk, re, pprint
from nltk import word_tokenize

In [2]:
### Let's look at electronic books

from urllib import request
url = 'http://www.gutenberg.org/files/2554/2554-0.txt'
response = request.urlopen(url)
raw = response.read().decode('utf8')
type(raw)  ## you need to run each of these separately

len(raw)

raw[:75]

## why is this output \ufeff and \r ?

'*** START OF THE PROJECT GUTENBERG EBOOK 2554 ***\n\n\n\n\nCRIME AND PUNISHMENT\n'

In [3]:
## we want to break up the string into words and punctuation, as we saw above.
## This step is called tokenization, and it produces our familiar structure, a list of words and punctuation.

tokens = word_tokenize(raw) ## make sure you run each of these separately

len(tokens)

tokens[:10]

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
text = nltk.Text(tokens)

text[1024:1062]

['an',
 'exceptionally',
 'hot',
 'evening',
 'early',
 'in',
 'July',
 'a',
 'young',
 'man',
 'came',
 'out',
 'of',
 'the',
 'garret',
 'in',
 'which',
 'he',
 'lodged',
 'in',
 'S.',
 'Place',
 'and',
 'walked',
 'slowly',
 ',',
 'as',
 'though',
 'in',
 'hesitation',
 ',',
 'towards',
 'K.',
 'bridge',
 '.',
 'He',
 'had',
 'successfully']