## Assignment 3
by Charlie Mei cm3947

In [20]:
from urllib import request
from bs4 import BeautifulSoup
from bs4.element import Comment
import re
from nltk import sent_tokenize, word_tokenize, WordNetLemmatizer, PorterStemmer, ngrams, pos_tag
from nltk.corpus import stopwords
from collections import Counter

### 1. Use urllib or requests package to read this CNBC article

In [2]:
url = 'https://www.cnbc.com/2019/01/17/netflix-price-hike-helps-disney-upcoming-streaming-service-analyst.html'

html = request.urlopen(url).read()
print(html[:100])

b'<!DOCTYPE html><html lang="en" prefix="og=https://ogp.me/ns#" itemType="https://schema.org/WebPage">'


2. Use BeautifulSoup or another HTML parsing package to extract text from the article.

In [3]:
soup = BeautifulSoup(html, 'html.parser')
# Find all instances of text data
data = soup.findAll(text=True)

In [4]:
# Extract all text from the url, using code provided by lecturer in class 3 exercise
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]', 'Comment']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)

In [5]:
text = text_from_html(html)

### 3. Use re (regular expression) package to:
- Find all matches of $ amounts in the article
- Substitute all numbers with # character and print the output
- Count (using regular expressions) ”Netflix” and “Disney” mentions 

#### Find all matches of $ amounts in the article

In [6]:
matcher = re.finditer(r"\$", text)

matches = []
for match in matcher:
    matches.append(match.start())
matches


[3577, 3619]

#### Substitute all numbers with # character and print the output

In [7]:
pattern = "[0-9]"
matcher = re.finditer(pattern, text)

# Find all occurrences of numbers
matches = []
for match in matcher:
    matches.append(match.start())
print(matches)

[605, 606, 607, 608, 1290, 1291, 1292, 1293, 1491, 1492, 1571, 1572, 1574, 1575, 1576, 1577, 1579, 1581, 1582, 1608, 1609, 1611, 1612, 1613, 1614, 1616, 1618, 1619, 1969, 1971, 1972, 1974, 1975, 1977, 1978, 2026, 2027, 2571, 2573, 2833, 2834, 2948, 2950, 2951, 2953, 2954, 2956, 2957, 3104, 3105, 3119, 3120, 3529, 3578, 3579, 3580, 3620, 3621, 3622, 3654, 3655, 3821, 3822, 3841, 3842, 4313, 4314, 4315, 4316, 4414, 4415, 4416, 4417, 4569, 4570, 4571, 4572, 5815, 5816, 5817, 5818, 5934, 5935]


In [8]:
# Replace all numbers with #

# Turn text into temporary list
temp = list(text)

# Replace each match to a number with #
for position in matches:
    temp[position] = '#'
    text = "".join(temp)

text[:1000]

'× LOG IN SIGN UP Keep Me Logged In Skip Navigation SIGN IN Pro Watchlist Make It Select USA INTL Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth Small Business Investing Invest In You Personal Finance Financial Advisors Trading Nation Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity Enterprise Internet Media Mobile Social Media Venture Capital Tech Guide Politics White House Policy Defense Congress #### Elections CNBC TV Live TV Live Audio Latest Video Top Video CEO Interviews Business Day Shows Primetime Shows CNBC World Digital Originals Full Episodes Menu SEARCH QUOTES Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth

#### Count (using regular expressions) "Netflix" and "Disney" mentions

In [9]:
pattern = "Netflix"
matcher = re.finditer(pattern, text)
# Find all occurrences of numbers
matches = []
for match in matcher:
    matches.append(match.start())
print("There are " + str(len(matches)) + " mentions of Netflix.")


There are 13 mentions of Netflix.


In [10]:
pattern = "Disney"
matcher = re.finditer(pattern, text)
# Find all occurrences of numbers
matches = []
for match in matcher:
    matches.append(match.start())
print("There are " + str(len(matches)) + " mentions of Disney.")

There are 7 mentions of Disney.


### 4. Use NTLK and/or Spacy (Links to an external site.) tokenization features to:
- Tokenize sentences and words
- Remove all English stop words
- List and count n-grams for any given input n
- Lemmatize and deduplicate unigrams into a vocabulary of terms.
- Print bigrams and trigrams in the first 5 sentences
- Print POS tags in the first 5 sentences

#### Tokenize sentences and words

In [11]:
sentences = sent_tokenize(text)
for sentence in sentences[:5]:
    print(sentence + '\n')

× LOG IN SIGN UP Keep Me Logged In Skip Navigation SIGN IN Pro Watchlist Make It Select USA INTL Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth Small Business Investing Invest In You Personal Finance Financial Advisors Trading Nation Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity Enterprise Internet Media Mobile Social Media Venture Capital Tech Guide Politics White House Policy Defense Congress #### Elections CNBC TV Live TV Live Audio Latest Video Top Video CEO Interviews Business Day Shows Primetime Shows CNBC World Digital Originals Full Episodes Menu SEARCH QUOTES Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth 

In [13]:
words = word_tokenize(text)
for word in words[:20]:
    print(word)

×
LOG
IN
SIGN
UP
Keep
Me
Logged
In
Skip
Navigation
SIGN
IN
Pro
Watchlist
Make
It
Select
USA
INTL


#### Remove all English stopwords

In [18]:
# Create a set of stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords
filtered_words = [word for word in words if not word in stop_words]

# How many stopwords removed in the text?
len(words) - len(filtered_words)

234

#### List and count n-grams for any given input n

In [56]:
# Define a function that lists and counts n-grams for any n
def ngram_count(tokens, n):
    n_grams = ngrams(tokens, n)
    # List of n_grams to be dictionary keys
    n_grams_list = [' '.join(grams) for grams in n_grams]

    # Now create a frequency count dictionary
    freq_count = {}
    for gram in n_grams_list:
        if gram in freq_count:
            freq_count[gram] += 1
        else:
            freq_count[gram] = 1
    return(freq_count)


#### Lemmatize and deduplicate unigrams into a vocabulary of terms

In [86]:
unigram_terms = list(ngram_count(filtered_words, 1).keys())

# Lemmatize the unigrams
unigram_vocab = []

# Define a lemmatizer
lemmatizer = WordNetLemmatizer()

for term in unigram_terms:
    unigram_vocab.append(lemmatizer.lemmatize(term))

print(unigram_terms[200:250])
print(unigram_vocab[200:250])

['subscribers', 'generate', 'revenue', 'business', 'model', 'look', 'fundamentals', 'really', "n't", 'work', 'alleged', 'Snap', "'take", 'check', 'anyone', 'comes', 'knocking', 'expert', 'Squawk', 'Alley', 'ranges', 'positive', 'view', 'generally', 'shared', 'investment', 'community', 'profit', 'either', 'use', 'let', 'drop', 'Thursday', 'put', 'hold', 'Neflix', 'current', 'levels', 'higher', 'target', '$', 'trading', 'steady', 'around', 'midday', 'since', 'Christmas', 'Eve', 'washout', 'releases']
['subscriber', 'generate', 'revenue', 'business', 'model', 'look', 'fundamental', 'really', "n't", 'work', 'alleged', 'Snap', "'take", 'check', 'anyone', 'come', 'knocking', 'expert', 'Squawk', 'Alley', 'range', 'positive', 'view', 'generally', 'shared', 'investment', 'community', 'profit', 'either', 'use', 'let', 'drop', 'Thursday', 'put', 'hold', 'Neflix', 'current', 'level', 'higher', 'target', '$', 'trading', 'steady', 'around', 'midday', 'since', 'Christmas', 'Eve', 'washout', 'release'

#### Print bigrams and trigrams in the first 5 sentences

In [89]:
# Save the first five sentences
sentences_5 = []
for sentence in sentences[:5]:
    sentences_5.append(sentence)
    print(sentence + '\n')

× LOG IN SIGN UP Keep Me Logged In Skip Navigation SIGN IN Pro Watchlist Make It Select USA INTL Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth Small Business Investing Invest In You Personal Finance Financial Advisors Trading Nation Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity Enterprise Internet Media Mobile Social Media Venture Capital Tech Guide Politics White House Policy Defense Congress #### Elections CNBC TV Live TV Live Audio Latest Video Top Video CEO Interviews Business Day Shows Primetime Shows CNBC World Digital Originals Full Episodes Menu SEARCH QUOTES Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth 

In [90]:
def return_grams(sentence, n):
    # First word tokenize the sentence
    words = word_tokenize(sentence)
    # Parse through to previously created ngra_count function to extract ngrams and counts
    freq_counts = ngram_count(words, n)
    # Return just the ngrams
    return(list(freq_counts.keys()))

In [99]:
for i in range(len(sentences_5)):
    print("Sentence " + str(i+1) + " bigrams and trigrams: ")
    print(return_grams(sentences_5[i], 2))
    print(return_grams(sentences_5[i], 3))
    print('\n')

Sentence 1 bigrams and trigrams: 
['× LOG', 'LOG IN', 'IN SIGN', 'SIGN UP', 'UP Keep', 'Keep Me', 'Me Logged', 'Logged In', 'In Skip', 'Skip Navigation', 'Navigation SIGN', 'SIGN IN', 'IN Pro', 'Pro Watchlist', 'Watchlist Make', 'Make It', 'It Select', 'Select USA', 'USA INTL', 'INTL Markets', 'Markets Pre-Markets', 'Pre-Markets U.S.', 'U.S. Markets', 'Markets Currencies', 'Currencies Cryptocurrency', 'Cryptocurrency Futures', 'Futures &', '& Commodities', 'Commodities Bonds', 'Bonds Funds', 'Funds &', '& ETFs', 'ETFs Watchlist', 'Watchlist Business', 'Business Economy', 'Economy Finance', 'Finance Health', 'Health &', '& Science', 'Science Media', 'Media Real', 'Real Estate', 'Estate Energy', 'Energy Transportation', 'Transportation Industrials', 'Industrials Retail', 'Retail Wealth', 'Wealth Small', 'Small Business', 'Business Investing', 'Investing Invest', 'Invest In', 'In You', 'You Personal', 'Personal Finance', 'Finance Financial', 'Financial Advisors', 'Advisors Trading', 'Trad

#### Print POS tags in the first 5 sentences

In [101]:
for i in range(len(sentences_5)):
    # Word tokenize first
    words = word_tokenize(sentences_5[i])
    print("Sentence " + str(i+1) + " POS tags:")
    print(pos_tag(words))
    print('\n')

Sentence 1 POS tags:
[('×', 'JJ'), ('LOG', 'NNP'), ('IN', 'NNP'), ('SIGN', 'NNP'), ('UP', 'NNP'), ('Keep', 'NNP'), ('Me', 'NNP'), ('Logged', 'NNP'), ('In', 'IN'), ('Skip', 'NNP'), ('Navigation', 'NNP'), ('SIGN', 'NNP'), ('IN', 'NNP'), ('Pro', 'NNP'), ('Watchlist', 'NNP'), ('Make', 'NNP'), ('It', 'PRP'), ('Select', 'NNP'), ('USA', 'NNP'), ('INTL', 'NNP'), ('Markets', 'NNP'), ('Pre-Markets', 'NNP'), ('U.S.', 'NNP'), ('Markets', 'NNP'), ('Currencies', 'NNP'), ('Cryptocurrency', 'NNP'), ('Futures', 'NNP'), ('&', 'CC'), ('Commodities', 'NNP'), ('Bonds', 'NNP'), ('Funds', 'NNP'), ('&', 'CC'), ('ETFs', 'NNP'), ('Watchlist', 'NNP'), ('Business', 'NNP'), ('Economy', 'NNP'), ('Finance', 'NNP'), ('Health', 'NNP'), ('&', 'CC'), ('Science', 'NNP'), ('Media', 'NNP'), ('Real', 'NNP'), ('Estate', 'NNP'), ('Energy', 'NNP'), ('Transportation', 'NNP'), ('Industrials', 'NNP'), ('Retail', 'NNP'), ('Wealth', 'NNP'), ('Small', 'NNP'), ('Business', 'NNP'), ('Investing', 'NNP'), ('Invest', 'NNP'), ('In', 'IN')