# Counting Words

In [None]:
import re
"""Count words."""

def count_words(text):
    """Count how many times each unique word occurs in text.

    The text is lowercased first, so counting is case-insensitive. A word is
    any maximal run of the characters ``a``-``z`` and ``-``; every other
    character (digits, whitespace, punctuation) acts as a separator. Note
    that a run made only of hyphens is therefore also counted as a word.

    Args:
        text: The string to analyse.

    Returns:
        A dictionary of { <word>: <count> } pairs; empty if no word is found.
    """
    # Normalise case. str.lower() returns the lowercased string, whereas
    # str.islower() only returns a bool, which re.findall cannot search.
    lowered = text.lower()

    # Split into words using a regex.
    words = re.findall(r"[a-z\-]+", lowered)

    # Aggregate word counts using a dictionary.
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1

    return counts

def test_run():
    """Print the 10 most and 10 least frequent words found in input.txt.

    Reads the whole file, counts its words with count_words, orders the
    (word, count) pairs by descending count and prints two tab-separated
    tables: the first ten pairs and the last ten pairs of that ordering.
    """
    with open("input.txt", "r") as handle:
        contents = handle.read()

    frequencies = count_words(contents)

    # Order by count, highest first; ties keep their dictionary order.
    ranked = list(frequencies.items())
    ranked.sort(key=lambda item: item[1], reverse=True)

    sections = (
        ("10 most common words:\nWord\tCount", ranked[:10]),
        ("\n10 least common words:\nWord\tCount", ranked[-10:]),
    )
    for heading, rows in sections:
        print(heading)
        for token, total in rows:
            print("{}\t{}".format(token, total))


# Run the word-count demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test_run()

# Text Processing

### Cleaning

In [None]:
import requests

# Fetch a web page.
# A timeout is passed explicitly because requests does not time out by
# default; without it a stalled connection would hang this cell indefinitely.
r = requests.get("https://news.ycombinator.com", timeout=10)
print(r.text)

In [None]:
import re

# Remove HTML tags using RegEx
# The non-greedy `.*?` stops at the first '>' so each tag is matched on its
# own. Because re.DOTALL is not set, '.' does not match a newline, so a tag
# that spans several lines is left untouched.
pattern = re.compile(r'<.*?>')  # tags look like <...>
# `r` is the response object produced by the page-fetching cell.
print(pattern.sub('', r.text))  # replace each tag with an empty string

In [None]:
from bs4 import BeautifulSoup

# Remove HTML tags using Beautiful Soup library
# Parse the fetched HTML (`r` comes from the page-fetching cell) with the
# "html5lib" parser, then print only the text content of the document.
soup = BeautifulSoup(r.text, "html5lib")
print(soup.get_text())

In [None]:
# Find all articles: collect every <tr> element with the CSS class "athing"
# from the parsed page (`soup` is created in the Beautiful Soup cell).
summaries = soup.find_all("tr", class_="athing")
# Bare expression: shown as the cell output when run in a notebook.
summaries[0]

In [None]:
# Extract title: take the first <a class="storylink"> inside the first
# summary row and return its text with surrounding whitespace removed.
# NOTE(review): find() returns None when no such link exists, which would
# make .get_text() fail -- confirm the page still uses class "storylink".
summaries[0].find("a", class_="storylink").get_text().strip()

In [None]:
# Find all articles, extract titles
articles = []  # one title string per summary row
summaries = soup.find_all("tr", class_="athing")
for summary in summaries:
    # Text of the row's <a class="storylink"> link, whitespace-trimmed.
    # NOTE(review): raises AttributeError if a row has no such link.
    title = summary.find("a", class_="storylink").get_text().strip()
    articles.append((title))  # the extra parentheses do not create a tuple

print(len(articles), "Article summaries found. Sample:")
# NOTE(review): raises IndexError when no rows were found.
print(articles[0])

### Normalization

In [None]:
# Sample text used as the running example; later cells reassign `text`
# in place as each normalization step is applied.
text = "The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ?"
print(text)

In [None]:
# Convert to lowercase (overwrites `text` with the lowercased copy)
text = text.lower() 
print(text)

### Punctuation Removal

In [None]:
import re

# Remove punctuation characters
# Every character that is not an ASCII letter or digit (including existing
# whitespace) is replaced by a single space; `text` is overwritten.
text = re.sub(r"[^a-zA-Z0-9]", " ", text) 
print(text)

### Tokenization

In [None]:
# Split text into tokens (words)
# split() with no argument splits on runs of whitespace and drops empty
# strings, so consecutive spaces do not produce empty tokens.
words = text.split()
print(words)

In [None]:
# Using NLTK
import os
import nltk
# Download the 'punkt' resource (presumably the tokenizer models used by
# the NLTK tokenizers below -- confirm against the NLTK version in use).
nltk.download('punkt')
# Also let NLTK look for data in ./nltk_data under the current directory.
nltk.data.path.append(os.path.join(os.getcwd(), "nltk_data"))

from nltk.tokenize import word_tokenize

# Split text into words using NLTK (overwrites `words`)
words = word_tokenize(text)
print(words)

In [None]:
from nltk.tokenize import sent_tokenize

# Split text into sentences
# NOTE(review): if the punctuation-removal cell has already run, `text` no
# longer contains '.' or '?', so this will likely yield a single sentence --
# confirm which version of `text` is intended here.
sentences = sent_tokenize(text)
print(sentences)

In [None]:
# List stop words
# Relies on `nltk` having been imported by an earlier cell.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Print the English stop-word list provided by the downloaded corpus.
print(stopwords.words("english"))

### Sentence Parsing

In [None]:
import nltk

# Define a custom grammar
# Each rule is written as `LHS -> alternatives`; quoted strings are the
# literal words the grammar accepts. Both `NP -> Det N PP` and `VP -> VP PP`
# can consume a trailing prepositional phrase.
my_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")
parser = nltk.ChartParser(my_grammar)

# Parse a sentence
# `word_tokenize` is not imported in this cell; it comes from the earlier
# NLTK tokenization cell.
sentence = word_tokenize("I shot an elephant in my pajamas")
# Print every parse tree the parser yields for the sentence.
for tree in parser.parse(sentence):
    print(tree)

### Stemming and Lemmatization

In [None]:
from nltk.stem.porter import PorterStemmer

# Reduce words to their stems
# Build the stemmer once and reuse it for every word instead of
# constructing a new PorterStemmer for each list element.
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in words]
print(stemmed)

In [None]:
# Download the WordNet resource before lemmatizing
# (relies on `nltk` having been imported by an earlier cell).
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

# Reduce words to their root form
# Build the lemmatizer once and reuse it for every word instead of
# constructing a new WordNetLemmatizer for each list element.
lemmatizer = WordNetLemmatizer()
lemmed = [lemmatizer.lemmatize(w) for w in words]
print(lemmed)

In [None]:
# Lemmatize verbs by specifying pos (part of speech); pos='v' means verb.
# This runs over the output of the previous lemmatization pass (`lemmed`)
# and overwrites it. The lemmatizer is built once and reused for every word.
lemmatizer = WordNetLemmatizer()
lemmed = [lemmatizer.lemmatize(w, pos='v') for w in lemmed]
print(lemmed)