## Import Packages

In [1]:
import re
from nltk import word_tokenize, sent_tokenize, ngrams, pos_tag, RegexpParser
from nltk.corpus import stopwords
from collections import Counter
from bs4 import BeautifulSoup
from bs4.element import Comment
from urllib import request
from pprint import pprint

## 1. Use urllib or requests package to read this CNBC article through its URL link.

## 2. Use BeautifulSoup or another HTML parsing package to extract text from the article.

In [2]:
def tag_visible(element):
    """Return True when a parsed text node should count as visible page text.

    A node is rejected when its parent tag is one of the containers listed
    below, or when the node itself is an HTML comment.
    """
    non_rendered_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    if element.parent.name in non_rendered_parents:
        return False
    # Anything left is visible unless it is a comment node.
    return not isinstance(element, Comment)


def text_from_html(body):
    """Extract the visible text of an HTML document as a single string.

    Parameters
    ----------
    body
        Raw HTML markup, passed straight to BeautifulSoup with the
        'html.parser' backend.

    Returns
    -------
    str
        Every text node accepted by ``tag_visible``, stripped of surrounding
        whitespace and joined with single spaces. Whitespace-only nodes
        become empty strings, so runs of several spaces can appear.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); both return every text node in the document.
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    return " ".join(t.strip() for t in visible_texts)

# Download the article and reduce it to its visible text.
url = 'https://www.cnbc.com/2019/01/17/netflix-price-hike-helps-disney-upcoming-streaming-service-analyst.html'
# Use the response as a context manager so the HTTP connection is closed as
# soon as the body has been read; a bare urlopen(url).read() leaves it open.
with request.urlopen(url) as response:
    html = response.read()
result = text_from_html(html)


## 3. Use re (regular expression) package to:  
### 3a.  Find all matches of $ amounts in the article  


In [3]:
# A dollar amount is "$" followed by digits, with optional thousands groups
# ("$1,000") and an optional decimal part ("$12.99").  The pattern r'\$\d+\w'
# required one extra word character after the digits, so it skipped amounts
# such as "$7", "$8.99" and "$1,000" and cut "$12.99" down to "$12".
re.findall(r'\$\d+(?:,\d{3})*(?:\.\d+)?', result)

['$325', '$351']

### 3b. Substitute all numbers with # character and print the output

In [4]:
# Replace every ASCII digit with '#' and print the masked article text.
print(
    re.compile('[0-9]').sub('#', result)
)

Skip Navigation Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Business Economy Finance Health & Science Media Real Estate Energy Climate Transportation Industrials Retail Wealth Life Small Business Investing Invest In You Personal Finance Fintech Financial Advisors Trading Nation Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity Enterprise Internet Media Mobile Social Media CNBC Disruptor ## Tech Guide Politics White House Policy Defense Congress Equity and Opportunity CNBC TV Live TV Live Audio Business Day Shows The News with Shepard Smith Entertainment Shows Full Episodes Latest Video Top Video CEO Interviews CNBC Documentaries CNBC Podcasts CNBC World Digital Originals Live TV Schedule Watchlist PRO Pro News Pro Live Subscribe Sign In Menu Make It USA INTL watch live Search quotes, news & videos SIGN IN Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & 

### 3c. Count (using regular expressions) “Netflix” and “Disney” mentions

In [5]:
# Combined number of occurrences of either company name in the article text.
sum(1 for _ in re.finditer('Disney|Netflix', result))

20

## 4. Use NLTK and/or spaCy tokenization features to:  

### 4a. Tokenize sentences and words

In [6]:
# Split the article text into sentences and, independently, into word tokens.
sentences, words = sent_tokenize(result), word_tokenize(result)

### 4b. Remove all English stop words

In [7]:
# Keep only the tokens whose lower-cased form is not an English stop word.
stop_words = set(stopwords.words('english'))
filtered_words = [
    token
    for token in words
    if token.lower() not in stop_words
]

### 4c. List and count n-grams for any given input n

In [8]:
def countNGrams(sentence, n):
    """Count how often each n-gram occurs in a sequence of tokens.

    Parameters
    ----------
    sentence
        Sequence of tokens handed to ``nltk.ngrams``.
    n
        Size of each n-gram.

    Returns
    -------
    collections.Counter
        Maps each n-gram tuple to its number of occurrences.
    """
    ngram_counts = Counter()
    ngram_counts.update(ngrams(sentence, n))
    return ngram_counts
# Trigram counts over the stop-word-filtered tokens.
countNGrams(sentence=filtered_words, n=3)

Counter({('Skip', 'Navigation', 'Markets'): 1,
         ('Navigation', 'Markets', 'Pre-Markets'): 1,
         ('Markets', 'Pre-Markets', 'U.S.'): 2,
         ('Pre-Markets', 'U.S.', 'Markets'): 2,
         ('U.S.', 'Markets', 'Currencies'): 2,
         ('Markets', 'Currencies', 'Cryptocurrency'): 2,
         ('Currencies', 'Cryptocurrency', 'Futures'): 2,
         ('Cryptocurrency', 'Futures', '&'): 2,
         ('Futures', '&', 'Commodities'): 2,
         ('&', 'Commodities', 'Bonds'): 2,
         ('Commodities', 'Bonds', 'Funds'): 2,
         ('Bonds', 'Funds', '&'): 2,
         ('Funds', '&', 'ETFs'): 2,
         ('&', 'ETFs', 'Business'): 2,
         ('ETFs', 'Business', 'Economy'): 2,
         ('Business', 'Economy', 'Finance'): 2,
         ('Economy', 'Finance', 'Health'): 2,
         ('Finance', 'Health', '&'): 2,
         ('Health', '&', 'Science'): 2,
         ('&', 'Science', 'Media'): 2,
         ('Science', 'Media', 'Real'): 2,
         ('Media', 'Real', 'Estate'): 2,
      

### 4d. Print bigrams and trigrams in the first 5 sentences

In [9]:
# For each of the first five sentences, print its bigram counts and then its
# trigram counts under a banner naming the sentence number.
for number, sentence in enumerate(sentences[:5], start=1):
    tokens = word_tokenize(sentence)
    for size, label in ((2, "BIGRAM"), (3, "TRIGRAM")):
        print(f"============================ SENTENCE {number} {label} ============================\n")
        pprint(Counter(ngrams(tokens, size)))
    print('\n\n\n')


Counter({('TV', 'Live'): 4,
         ('Live', 'TV'): 4,
         ('Share', 'Article'): 4,
         ('Article', 'via'): 4,
         ('Markets', 'Pre-Markets'): 2,
         ('Pre-Markets', 'U.S.'): 2,
         ('U.S.', 'Markets'): 2,
         ('Markets', 'Currencies'): 2,
         ('Currencies', 'Cryptocurrency'): 2,
         ('Cryptocurrency', 'Futures'): 2,
         ('Futures', '&'): 2,
         ('&', 'Commodities'): 2,
         ('Commodities', 'Bonds'): 2,
         ('Bonds', 'Funds'): 2,
         ('Funds', '&'): 2,
         ('&', 'ETFs'): 2,
         ('ETFs', 'Business'): 2,
         ('Business', 'Economy'): 2,
         ('Economy', 'Finance'): 2,
         ('Finance', 'Health'): 2,
         ('Health', '&'): 2,
         ('&', 'Science'): 2,
         ('Science', 'Media'): 2,
         ('Media', 'Real'): 2,
         ('Real', 'Estate'): 2,
         ('Estate', 'Energy'): 2,
         ('Energy', 'Climate'): 2,
         ('Climate', 'Transportation'): 2,
         ('Transportation', 'Industrials

### 4e. Print POS tags in the first 5 sentences

In [10]:
# For each of the first five sentences, tag its tokens with parts of speech
# and pretty-print the (token, tag) pairs under a numbered banner.
for number, sentence in enumerate(sentences[:5], start=1):
    tagged_tokens = pos_tag(word_tokenize(sentence))
    print(f"============================ SENTENCE {number} POS TAGS ============================\n")
    pprint(tagged_tokens)
    print('\n\n\n')


[('Skip', 'JJ'),
 ('Navigation', 'NNP'),
 ('Markets', 'NNP'),
 ('Pre-Markets', 'NNP'),
 ('U.S.', 'NNP'),
 ('Markets', 'NNP'),
 ('Currencies', 'NNP'),
 ('Cryptocurrency', 'NNP'),
 ('Futures', 'NNP'),
 ('&', 'CC'),
 ('Commodities', 'NNP'),
 ('Bonds', 'NNP'),
 ('Funds', 'NNP'),
 ('&', 'CC'),
 ('ETFs', 'NNP'),
 ('Business', 'NNP'),
 ('Economy', 'NNP'),
 ('Finance', 'NNP'),
 ('Health', 'NNP'),
 ('&', 'CC'),
 ('Science', 'NNP'),
 ('Media', 'NNP'),
 ('Real', 'NNP'),
 ('Estate', 'NNP'),
 ('Energy', 'NNP'),
 ('Climate', 'NNP'),
 ('Transportation', 'NNP'),
 ('Industrials', 'NNP'),
 ('Retail', 'NNP'),
 ('Wealth', 'NNP'),
 ('Life', 'NNP'),
 ('Small', 'NNP'),
 ('Business', 'NNP'),
 ('Investing', 'NNP'),
 ('Invest', 'NNP'),
 ('In', 'IN'),
 ('You', 'PRP'),
 ('Personal', 'NNP'),
 ('Finance', 'NNP'),
 ('Fintech', 'NNP'),
 ('Financial', 'NNP'),
 ('Advisors', 'NNPS'),
 ('Trading', 'NNP'),
 ('Nation', 'NN'),
 ('Options', 'NNP'),
 ('Action', 'NNP'),
 ('ETF', 'NNP'),
 ('Street', 'NNP'),
 ('Buffett', 'NNP')