# Text Processing

## Capturing Text Data

### Plain Text

In [None]:
import os

# Load the whole plain-text file into memory, then echo it.
# The handle is only needed for the read, so the print happens after the
# `with` block has closed the file.
text_path = os.path.join("data", "hieroglyph.txt")
with open(text_path, "r") as f:
    text = f.read()
print(text)

### Tabular Data

In [None]:
import pandas as pd

# Extract text column from a dataframe.
# A bare expression is rendered only when it is the LAST statement of a cell,
# so the "before" preview has to go through display() (provided by the
# IPython/Jupyter kernel) or it is silently discarded.
df = pd.read_csv(os.path.join("data", "news.csv"))
display(df[['publisher', 'title']].head())

# Convert text column to lowercase (vectorised via the .str accessor)
df['title'] = df['title'].str.lower()

# "After" preview: last expression of the cell, rendered as the cell output
df[['publisher', 'title']].head()

### Online Resource

In [None]:
import requests
import json

# Fetch data from a REST API.
# requests has no default timeout; without one an unreachable host would
# block the kernel indefinitely.
r = requests.get(
    "https://quotes.rest/qod.json",
    timeout=10,
)
# Surface HTTP errors (4xx/5xx) here, with the status code, instead of
# failing later on JSON decoding or a missing "contents" key.
r.raise_for_status()
res = r.json()
print(json.dumps(res, indent=4))

# Extract relevant object and field:
# the first entry of the "quotes" list nested under "contents"
q = res["contents"]["quotes"][0]
print(q["quote"], "\n--", q["author"])

## Cleaning

In [1]:
import requests

# Fetch a web page.
# requests has no default timeout; set one so an unreachable host cannot
# hang the kernel.
r = requests.get("https://news.ycombinator.com", timeout=10)
# Stop here on an HTTP error rather than cleaning/parsing an error page
# in the cells that reuse `r`.
r.raise_for_status()
print(r.text)

<html op="news"><head><meta name="referrer" content="origin"><meta name="viewport" content="width=device-width, initial-scale=1.0"><link rel="stylesheet" type="text/css" href="news.css?xDJZ1aWhiD4MZBrpGsuq">
            <link rel="shortcut icon" href="favicon.ico">
          <link rel="alternate" type="application/rss+xml" title="RSS" href="rss">
        <title>Hacker News</title></head><body><center><table id="hnmain" border="0" cellpadding="0" cellspacing="0" width="85%" bgcolor="#f6f6ef">
        <tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" width="100%" style="padding:2px"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img src="y18.gif" width="18" height="18" style="border:1px white solid;"></a></td>
                  <td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
              <a href="newest">new</a> | <a href="front">past</a> | <a href="newco

In [2]:
import re

# Strip HTML tags with a regular expression.
# '<.*?>' is a non-greedy match for anything shaped like <...>; every match
# is replaced by the empty string. Character entities are left untouched.
pattern = re.compile(r'<.*?>')
html_source = r.text
tagless_text = pattern.sub('', html_source)
print(tagless_text)


            
          
        Hacker News
        
                  Hacker News
              new | past | comments | ask | show | jobs | submit            
                              login
                          
              

              
      1.      Advanced Data Structures (mit.edu)
        315 points by rjammala 2 hours ago  | hide | 27&nbsp;comments              
      
                
      2.      Google to restrict modern ad blocking Chrome extensions to enterprise users (9to5google.com)
        221 points by estranhosidade 3 hours ago  | hide | 119&nbsp;comments              
      
                
      3.      Roc – Real-Time streaming over the network (gavv.github.io)
        33 points by gavv42 1 hour ago  | hide | 9&nbsp;comments              
      
                
      4.      Exponential economist meets finite physicist (ucsd.edu)
        42 points by chepaslaaa 2 hours ago  | hide | 26&nbsp;comments              
      
                
      5.  

In [3]:
from bs4 import BeautifulSoup

# Strip HTML tags with Beautiful Soup: parse the page using the "html5lib"
# parser, then pull out only the text nodes.
soup = BeautifulSoup(r.text, "html5lib")
page_text = soup.get_text()
print(page_text)


            
          
        Hacker News
        
                  Hacker News
              new | past | comments | ask | show | jobs | submit            
                              login
                          
              

              
      1.      Advanced Data Structures (mit.edu)
        315 points by rjammala 2 hours ago  | hide | 27 comments              
      
                
      2.      Google to restrict modern ad blocking Chrome extensions to enterprise users (9to5google.com)
        221 points by estranhosidade 3 hours ago  | hide | 119 comments              
      
                
      3.      Roc – Real-Time streaming over the network (gavv.github.io)
        33 points by gavv42 1 hour ago  | hide | 9 comments              
      
                
      4.      Exponential economist meets finite physicist (ucsd.edu)
        42 points by chepaslaaa 2 hours ago  | hide | 26 comments              
      
                
      5.      0x Launch Kit – 

In [4]:
# Every story is a <tr> whose class is "athing"; collect them all
summaries = soup.find_all("tr", attrs={"class": "athing"})

# Show the first row's markup as the cell output
first_summary = summaries[0]
first_summary

<tr class="athing" id="20044876">
      <td align="right" class="title" valign="top"><span class="rank">1.</span></td>      <td class="votelinks" valign="top"><center><a href="vote?id=20044876&amp;how=up&amp;goto=news" id="up_20044876"><div class="votearrow" title="upvote"></div></a></center></td><td class="title"><a class="storylink" href="https://courses.csail.mit.edu/6.851/fall17/">Advanced Data Structures</a><span class="sitebit comhead"> (<a href="from?site=mit.edu"><span class="sitestr">mit.edu</span></a>)</span></td></tr>

In [None]:
# CSS-selector variant of the extraction.
# Fixes: `get_texxt` was a typo for `get_text`, and the first expression's
# value was discarded because only a cell's last expression is rendered.
# The row markup printed above contains neither an <h3> nor a
# div[data-course-short-summary], so select_one() returns None for this
# page; guard against that instead of raising AttributeError.
heading_link = summaries[0].select_one("h3 a")
short_summary = summaries[0].select_one("div[data-course-short-summary]")

heading_text = heading_link.get_text().strip() if heading_link is not None else None
summary_text = short_summary.get_text().strip() if short_summary is not None else None

# Display both results (None means the selector matched nothing)
(heading_text, summary_text)

In [5]:
# The headline is the text of the <a class="storylink"> inside the row
first_story_link = summaries[0].find("a", class_="storylink")
first_story_link.get_text().strip()

'Advanced Data Structures'

In [6]:
# Gather the headline text of every story row on the page
summaries = soup.find_all("tr", class_="athing")
articles = [
    row.find("a", class_="storylink").get_text().strip()
    for row in summaries
]

print(len(articles), "Article summaries found. Sample:")
print(articles[0])

30 Article summaries found. Sample:
Advanced Data Structures


## Normalization

### Case Normalization

In [None]:
# Sample text used by the normalization cells below
# (adjacent string literals are concatenated into one string)
text = (
    "The first time you see The Second Renaissance it may look boring. "
    "Look at it at least twice and definitely watch part 2. "
    "It will change your view of the matrix. "
    "Are the human people the ones who started the war ? "
    "Is AI a bad thing ?"
)
print(text)

In [None]:
# Case normalization: fold everything to lowercase so that e.g. "The" and
# "the" become the same token
lowercased = text.lower()
text = lowercased
print(text)

### Punctuation Removal

In [None]:
import re

# Replace every character that is not an ASCII letter or digit with a space
# (this also turns existing whitespace into plain spaces)
text = re.sub(pattern=r"[^a-zA-Z0-9]", repl=" ", string=text)
print(text)

## Tokenization

In [None]:
# Tokenize on runs of whitespace; sep=None is str.split's default mode,
# which also drops leading/trailing blanks and never yields empty tokens
words = text.split(sep=None)
print(words)

### NLTK: Natural Language Toolkit

In [27]:
import os
import nltk
# Fetch the tokenizer models and the stop-word lists
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Also let NLTK search an "nltk_data" folder under the current working directory
local_nltk_data = os.path.join(os.getcwd(), "nltk_data")
nltk.data.path.append(local_nltk_data)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ckd16\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ckd16\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [28]:
# Second sample: contains an abbreviation ("Dr.") and a comma, which naive
# whitespace/period splitting would handle poorly
text = (
    "Dr. Smith graduated from the University of Washington. "
    "He later started an analytics firm called Lux, "
    "which catered to enterprise customers."
)
print(text)

Dr. Smith graduated from the University of Washington. He later started an analytics firm called Lux, which catered to enterprise customers.


In [29]:
from nltk.tokenize import word_tokenize

# Word-level tokenization with NLTK ("english" is the function's default
# language, spelled out here for clarity)
words = word_tokenize(text, language="english")
print(words)

['Dr.', 'Smith', 'graduated', 'from', 'the', 'University', 'of', 'Washington', '.', 'He', 'later', 'started', 'an', 'analytics', 'firm', 'called', 'Lux', ',', 'which', 'catered', 'to', 'enterprise', 'customers', '.']


In [31]:
from nltk.tokenize import sent_tokenize

# Sentence-level tokenization with NLTK ("english" is the function's default
# language, spelled out here for clarity)
sentences = sent_tokenize(text, language="english")
print(sentences)

['Dr. Smith graduated from the University of Washington.', 'He later started an analytics firm called Lux, which catered to enterprise customers.']


In [32]:
# Show NLTK's built-in English stop-word list
from nltk.corpus import stopwords

english_stop_list = stopwords.words("english")
print(english_stop_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# Start again from the first sample sentence
text = (
    "The first time you see The Second Renaissance it may look boring. "
    "Look at it at least twice and definitely watch part 2. "
    "It will change your view of the matrix. "
    "Are the human people the ones who started the war ? "
    "Is AI a bad thing ?"
)

# Normalize: lowercase first, then blank out everything that is not a
# letter or digit
lowered = text.lower()
text = re.sub(r"[^a-zA-Z0-9]", " ", lowered)

# Tokenize on whitespace
words = text.split()
print(words)

In [33]:
# Remove stop words.
# Look the stop list up once and keep it in a set: the original form called
# stopwords.words("english") again for every token and then scanned the
# resulting list linearly.
# The comparison is case-sensitive and the stop list is lowercase, so run
# this on lowercased tokens; otherwise capitalised stop words (e.g. "He")
# are kept.
english_stop_words = set(stopwords.words("english"))
words = [w for w in words if w not in english_stop_words]
print(words)

['Dr.', 'Smith', 'graduated', 'University', 'Washington', '.', 'He', 'later', 'started', 'analytics', 'firm', 'called', 'Lux', ',', 'catered', 'enterprise', 'customers', '.']


### Sentence Parsing

In [None]:
import nltk

# Toy context-free grammar. It is ambiguous on purpose: a prepositional
# phrase may attach to the verb phrase (VP -> VP PP) or to the noun phrase
# (NP -> Det N PP).
my_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")
parser = nltk.ChartParser(my_grammar)

# Tokenize the sentence, then print every parse tree the chart parser yields
sentence = word_tokenize("I shot an elephant in my pajamas")
parse_trees = parser.parse(sentence)
for tree in parse_trees:
    print(tree)

In [15]:
import os

import nltk
from nltk import pos_tag
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

# Fetch the tokenizer, stop-word and POS-tagger resources, then also let
# NLTK search an "nltk_data" folder under the current working directory.
for resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)
nltk.data.path.append(os.path.join(os.getcwd(), "nltk_data"))

# Toy context-free grammar; a PP may attach to either the VP or the NP,
# so the sentence below has more than one parse.
my_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")
parser = nltk.ChartParser(my_grammar)

sentence = word_tokenize("I shot an elephant in my pajamas")

# Print each parse tree (tree.draw() can be called here for a graphical view)
for tree in parser.parse(sentence):
    print(tree)

# Part-of-speech tags for the same tokens, shown as the cell result
pos_tag(sentence)

(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ckd16\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ckd16\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ckd16\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('I', 'PRP'),
 ('shot', 'VBP'),
 ('an', 'DT'),
 ('elephant', 'NN'),
 ('in', 'IN'),
 ('my', 'PRP$'),
 ('pajamas', 'NN')]

In [None]:
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize

## Stemming & Lemmatization

### Stemming

In [19]:
from nltk.stem.porter import PorterStemmer

words = ['first', 'time', 'see', 'second', 'change']

# Reduce words to their stems.
# Build the stemmer once and reuse it; the original constructed a new
# PorterStemmer for every word.
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in words]
print(stemmed)

['first', 'time', 'see', 'second', 'chang']


### Lemmatization

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer

# Reduce words to their root form.
# Build the lemmatizer once and reuse it; the original constructed a new
# WordNetLemmatizer for every word.
# NOTE(review): the lemmatizer needs NLTK's WordNet corpus, and none of the
# download calls visible in this notebook fetch 'wordnet' — confirm it is
# available (e.g. in the local nltk_data folder) before running.
lemmatizer = WordNetLemmatizer()
lemmed = [lemmatizer.lemmatize(w) for w in words]
print(lemmed)

In [None]:
# Lemmatize verbs by specifying pos.
# Runs on the output of the previous lemmatization pass (`lemmed`).
# Build the lemmatizer once and reuse it; the original constructed a new
# WordNetLemmatizer for every word.
verb_lemmatizer = WordNetLemmatizer()
lemmed = [verb_lemmatizer.lemmatize(w, pos='v') for w in lemmed]
print(lemmed)

In [13]:
import nltk
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize

# Resources for NLTK's named-entity chunker
for resource in ('maxent_ne_chunker', 'words'):
    nltk.download(resource)

# Tokenize and POS-tag a sample sentence; shown as the cell result
ner_sentence = "Antonio joined Udacity Inc. in California."
temp = pos_tag(word_tokenize(ner_sentence))
temp
# Named-entity chunking step, left disabled:
# tree = ne_chunk(pos_tag(word_tokenize("Antonio joined Udacity Inc. in California.")))
# tree.draw()

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\ckd16\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ckd16\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


[('Antonio', 'NNP'),
 ('joined', 'VBD'),
 ('Udacity', 'NNP'),
 ('Inc.', 'NNP'),
 ('in', 'IN'),
 ('California', 'NNP'),
 ('.', '.')]