## 0. Text processing (without NLTK)

In [3]:
my_string = 'The quick brown fox jumps over the lazy dog!!!.'

### Lowercase, Punctuation

In [6]:
cleaned = ''.join([char for char in my_string.lower() if char not in '!@.#$%^&*()'])
cleaned

'the quick brown fox jumps over the lazy dog'

### Tokenization

- characters
- words
- sentences


In [7]:
tokenized = cleaned.split()
tokenized 

['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

### Stopwords

In [8]:
# remove stopwords in tokenized

stopwords = ['the', 'and', 'this', 'but']

no_stopwords = [word for word in tokenized if word not in stopwords]

no_stopwords

['quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog']

### Stemming/Lemmatization

#### Stemming
- jumped -> jump
- jumps -> jump
- jumping -> jump

#### Lemmatization
- running -> run
- ran -> run
- flies -> fly
- flew -> fly
- flying -> fly


<div class="alert alert-block alert-info">
<b>rstrip()</b>  method removes any trailing characters (characters at the end a string)</div>

In [9]:
# Stem any words that are plural
stemmed = [word.rstrip('s') for word in no_stopwords]

stemmed

['quick', 'brown', 'fox', 'jump', 'over', 'lazy', 'dog']

## N-grams

#### 2-gram
'i do not like data science' => (i, do), (do, not), (not, like), ... , (data, science)

#### 3-gram
=> (i, do, not), (do, not, like), ... , (like, data, science)

In [10]:
sentence = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

In [12]:
# TODO: 2-gram our list of words
[(sentence[i] , sentence[i+1]) for i in range(len(sentence)-1)]

[('the', 'quick'),
 ('quick', 'brown'),
 ('brown', 'fox'),
 ('fox', 'jumps'),
 ('jumps', 'over'),
 ('over', 'the'),
 ('the', 'lazy'),
 ('lazy', 'dog')]

In [20]:
# Also try it without a loop
    # output => [('quick', 'brown'), ('brown', 'fox'), ...]

#list(zip(sentence[:-1],sentence[1:]))
list(zip(sentence[:-1],sentence[1:]))

[('the', 'quick'),
 ('quick', 'brown'),
 ('brown', 'fox'),
 ('fox', 'jumps'),
 ('jumps', 'over'),
 ('over', 'the'),
 ('the', 'lazy'),
 ('lazy', 'dog')]

## NLTK - Natual Language Toolkit is a leading platform for building Python programs to work with human language data.

## 1. Tokenize Words and Sentences with NLTK

Natural Language toolkit has very important module tokenize which further comprises of sub-modules:

1. word tokenize
2. sentence tokenize

### 1.1 Word tokenize
We use the method word_tokenize() to split a sentence into words. 

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gehadbarakat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from nltk.tokenize import word_tokenize
text = "God is Great! I won a lottery."
print(word_tokenize(text))

['God', 'is', 'Great', '!', 'I', 'won', 'a', 'lottery', '.']


### 1.2 Sentence tokenize

In [4]:
from nltk.tokenize import sent_tokenize
print(sent_tokenize(text))

['God is Great!', 'I won a lottery.']


In [16]:
import requests
from bs4 import BeautifulSoup
import re
import nltk

page = requests.get("http://en.wikipedia.org/wiki/Apple_Inc.")
soup = BeautifulSoup(page.content, 'html.parser')
findings = soup.find(attrs={'id':'mw-content-text'}).find_all('p')

for p in findings:
    text = ''.join(re.split('\[\d+\]',p.text))
    if text.strip()!= '':
        print(nltk.tokenize.sent_tokenize(text.strip()))

['Apple Inc. is an American multinational technology company headquartered in Cupertino, California.', "Apple is the world's largest technology company by revenue, with US$394.3 billion in 2022 revenue.", "As of March\xa02023[update], Apple is the world's biggest company by market capitalization.", 'As of June 2022, Apple is the fourth-largest personal computer vendor by unit sales and the second-largest mobile phone manufacturer in the world.', 'It is considered one of the Big Five American information technology companies, alongside Alphabet (parent company of Google), Amazon, Meta Platforms, and Microsoft.']
["Apple was founded as Apple Computer Company on April 1, 1976, by  Steve Wozniak, Steve Jobs (1955–2011) and Ronald Wayne to develop and sell Wozniak's Apple I personal computer.", 'It was incorporated by Jobs and Wozniak as Apple Computer, Inc. in 1977.', "The company's second computer, the Apple II, became a best seller and one of the first mass-produced microcomputers.", 'Ap

In [17]:
findings

[<p class="mw-empty-elt">
 </p>,
 <p><b>Apple Inc.</b> is an American <a href="/wiki/Multinational_corporation" title="Multinational corporation">multinational</a> <a href="/wiki/Technology_company" title="Technology company">technology company</a> headquartered in <a href="/wiki/Cupertino,_California" title="Cupertino, California">Cupertino, California</a>. Apple is the world's <a href="/wiki/List_of_largest_technology_companies_by_revenue" title="List of largest technology companies by revenue">largest technology company by revenue</a>, with <span style="white-space: nowrap"><a href="/wiki/United_States_dollar" title="United States dollar">US$</a>394.3 billion</span> in 2022 revenue.<sup class="reference" id="cite_ref-6"><a href="#cite_note-6">[6]</a></sup> As of March 2023<sup class="plainlinks noexcerpt noprint asof-tag update" style="display:none;"><a class="external text" href="https://en.wikipedia.org/w/index.php?title=Apple_Inc.&amp;action=edit">[update]</a></sup>, Apple is the

## 2. Removing stop words with NLTK

In [None]:
import nltk
nltk.download('stopwords')

In [26]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example ='''Apple Inc. is an American multinational technology 
company headquartered in Cupertino, California.', 
"Apple is the world's largest technology company by revenue, 
with US$394.3 billion in 2022 revenue.", 
"As of March\xa02023[update], Apple is the world's biggest company by
market capitalization.", 'As of June 2022, Apple is the fourth-largest
personal computer vendor by unit sales and the second-largest mobile 
phone manufacturer in the world.', 'It is considered one of 
the Big Five American information technology companies, alongside
Alphabet (parent company of Google), 
Amazon, Meta Platforms, and Microsoft.
stop_words = set(stopwords.words('english'))
'''
word_tokens = word_tokenize(example)

filtered_sentence = [w for w in word_tokens if not w in stop_words] 

filtered_sentence = [] 
for w in word_tokens: 
    if w not in stop_words:
        filtered_sentence.append(w) 
#print(word_tokens)
print(filtered_sentence) 

['Apple', 'Inc.', 'American', 'multinational', 'technology', 'company', 'headquartered', 'Cupertino', ',', 'California', '.', "'", ',', "''", 'Apple', 'world', "'s", 'largest', 'technology', 'company', 'revenue', ',', 'US', '$', '394.3', 'billion', '2022', 'revenue', '.', '``', ',', "''", 'As', 'March', '2023', '[', 'update', ']', ',', 'Apple', 'world', "'s", 'biggest', 'company', 'market', 'capitalization', '.', '``', ',', "'As", 'June', '2022', ',', 'Apple', 'fourth-largest', 'personal', 'computer', 'vendor', 'unit', 'sales', 'second-largest', 'mobile', 'phone', 'manufacturer', 'world', '.', "'", ',', "'It", 'considered', 'one', 'Big', 'Five', 'American', 'information', 'technology', 'companies', ',', 'alongside', 'Alphabet', '(', 'parent', 'company', 'Google', ')', ',', 'Amazon', ',', 'Meta', 'Platforms', ',', 'Microsoft', '.', 'stop_words', '=', 'set', '(', 'stopwords.words', '(', "'english", "'", ')', ')']
