In [46]:
import requests
from bs4 import BeautifulSoup
import re

In [52]:
import os
import nltk
# Add an "nltk_data" folder under the current working directory to the list of
# locations NLTK searches for corpora and models.
nltk.data.path.append(os.path.join(os.getcwd(), "nltk_data"))

In [2]:
# Download the profile page and extract the tweet text containers.
# NOTE(review): this assumes the page is served as static HTML containing
# 'js-tweet-text-container' divs — confirm the markup still matches.
web = 'https://twitter.com/realDonaldTrump'
# requests waits indefinitely by default; a timeout keeps the cell from
# hanging forever on a stalled connection.
response = requests.get(web, params = {'count': 50}, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page
# (which would just yield an empty `tweets` list further down).
response.raise_for_status()
html_data = response.text
page_content = BeautifulSoup(html_data,'html.parser')
tweets = page_content.find_all('div', class_='js-tweet-text-container')

In [36]:
# Whitespace-trimmed text of the <p> element inside every tweet container.
all_tweets = [container.p.get_text().strip() for container in tweets]

In [35]:
# Spot-check one scraped tweet (index 1) before any normalization.
all_tweets[1]

'The people of Venezuela stand at the threshold of history, ready to reclaim their country – and their future....pic.twitter.com/ajxd1EN64c'

## Normalization

#### Lower case conversion

In [41]:
# Normalize case: replace every tweet with its lower-cased form.
all_tweets = list(map(str.lower, all_tweets))

In [43]:
# Spot-check one tweet (index 2) after lower-casing.
all_tweets[2]

'74 years ago today, marines on iwo jima raised the flag atop mount suribachi.\n\nsemper fidelis.pic.twitter.com/usq2z344xp'

#### Remove punctuation Characters

In [48]:
# Matches any single character that is NOT an ASCII letter or digit, so it
# covers punctuation but also whitespace, newlines and non-ASCII symbols.
punc_pattern = re.compile(
    pattern=r'[^a-zA-Z0-9]',
)

In [50]:
# Replace every character matched by punc_pattern with a single space.
all_tweets = [punc_pattern.sub(' ', text) for text in all_tweets]

In [51]:
# Spot-check the same tweet (index 2) after the punctuation substitution.
all_tweets[2]

'74 years ago today  marines on iwo jima raised the flag atop mount suribachi   semper fidelis pic twitter com usq2z344xp'

## NLTK

In [53]:
from nltk.tokenize import TweetTokenizer

#### Tweet Tokenization

In [54]:
# Shared tokenizer instance used by the tokenization and lemmatization cells;
# preserve_case=False asks it not to keep the original letter casing.
tt = TweetTokenizer(preserve_case=False)

In [56]:
# Show the TweetTokenizer output for every cleaned tweet, one token list per line.
for text in all_tweets:
    tokens = tt.tokenize(text)
    print(tokens)

['there', 's', 'not', 'one', 'shred', 'of', 'evidence', 'that', 'this', 'president', 's', 'done', 'anything', 'constitutionally', 'or', 'anything', 'else', 'wrong', 'graham', 'ledger', 'thank', 'you', 'graham', 'so', 'true']
['the', 'people', 'of', 'venezuela', 'stand', 'at', 'the', 'threshold', 'of', 'history', 'ready', 'to', 'reclaim', 'their', 'country', 'and', 'their', 'future', 'pic', 'twitter', 'com', 'ajxd', '1en64c']
['74', 'years', 'ago', 'today', 'marines', 'on', 'iwo', 'jima', 'raised', 'the', 'flag', 'atop', 'mount', 'suribachi', 'semper', 'fidelis', 'pic', 'twitter', 'com', 'usq', '2z344xp']
['there', 'is', 'far', 'more', 'energy', 'on', 'the', 'right', 'than', 'there', 'is', 'on', 'the', 'left', 'that', 's', 'why', 'we', 'just', 'won', 'the', 'senate', 'and', 'why', 'we', 'will', 'win', 'big', 'in', '2020', 'the', 'fake', 'news', 'just', 'doesn', 't', 'want', 'to', 'report', 'the', 'facts', 'border', 'security', 'is', 'a', 'big', 'factor', 'the', 'under', 'construction', 

#### Stop Words

In [57]:
from nltk.corpus import stopwords
# Fetch the stop-word corpus (the log below shows it is skipped when already up to date).
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amiteshsinha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [58]:
# Display the English stop-word list that the filtering cells below rely on.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [59]:
## Remove Stop words

In [60]:
# Drop English stop words from each whitespace-split tweet and print the rest.
#
# The stop-word list is loaded once, outside the loop, and held in a set:
# evaluating stopwords.words('english') inside the comprehension repeats the
# corpus lookup for every single word and then does a linear list search.
english_stopwords = set(stopwords.words('english'))
for tweet in all_tweets:
    words = tweet.split()
    non_stopWords = [w for w in words if w not in english_stopwords]
    print(non_stopWords)

['one', 'shred', 'evidence', 'president', 'done', 'anything', 'constitutionally', 'anything', 'else', 'wrong', 'graham', 'ledger', 'thank', 'graham', 'true']
['people', 'venezuela', 'stand', 'threshold', 'history', 'ready', 'reclaim', 'country', 'future', 'pic', 'twitter', 'com', 'ajxd1en64c']
['74', 'years', 'ago', 'today', 'marines', 'iwo', 'jima', 'raised', 'flag', 'atop', 'mount', 'suribachi', 'semper', 'fidelis', 'pic', 'twitter', 'com', 'usq2z344xp']
['far', 'energy', 'right', 'left', 'senate', 'win', 'big', '2020', 'fake', 'news', 'want', 'report', 'facts', 'border', 'security', 'big', 'factor', 'construction', 'wall', 'stop', 'gangs', 'drugs', 'crime']
['god', 'bless', 'people', 'venezuela']
['great', 'new', 'book', 'case', 'trump', 'victor', 'davis', 'hanson', 'hoover', 'senior', 'fellow']
['kelly', 'done', 'outstanding', 'job', 'representing', 'nation', 'doubt', 'leadership', 'country', 'represented', 'highest', 'level', 'congratulations', 'kelly', 'entire', 'family']
['pleas

['hope', 'enjoying', 'president', 'day', 'country', 'making', 'unprecedented', 'progress']
['illegal', 'coup', 'attempt', 'president', 'united', 'states', 'dan', 'bongino', 'foxandfriends', 'true']
['great', 'analysis', 'foxandfriends']


#### Stemming

In [61]:
from nltk.stem.porter import PorterStemmer

In [62]:
# Remove English stop words from each whitespace-split tweet, then print the
# Porter stem of every remaining word.
#
# Both the stop-word set and the stemmer are created once up front; the
# original re-read the stop-word list and built a new PorterStemmer for
# every individual word.
english_stopwords = set(stopwords.words("english"))
stemmer = PorterStemmer()
for tweet in all_tweets:
    words = tweet.split()
    new_words = [w for w in words if w not in english_stopwords]
    print([stemmer.stem(w) for w in new_words])

['one', 'shred', 'evid', 'presid', 'done', 'anyth', 'constitut', 'anyth', 'els', 'wrong', 'graham', 'ledger', 'thank', 'graham', 'true']
['peopl', 'venezuela', 'stand', 'threshold', 'histori', 'readi', 'reclaim', 'countri', 'futur', 'pic', 'twitter', 'com', 'ajxd1en64c']
['74', 'year', 'ago', 'today', 'marin', 'iwo', 'jima', 'rais', 'flag', 'atop', 'mount', 'suribachi', 'semper', 'fide', 'pic', 'twitter', 'com', 'usq2z344xp']
['far', 'energi', 'right', 'left', 'senat', 'win', 'big', '2020', 'fake', 'news', 'want', 'report', 'fact', 'border', 'secur', 'big', 'factor', 'construct', 'wall', 'stop', 'gang', 'drug', 'crime']
['god', 'bless', 'peopl', 'venezuela']
['great', 'new', 'book', 'case', 'trump', 'victor', 'davi', 'hanson', 'hoover', 'senior', 'fellow']
['kelli', 'done', 'outstand', 'job', 'repres', 'nation', 'doubt', 'leadership', 'countri', 'repres', 'highest', 'level', 'congratul', 'kelli', 'entir', 'famili']
['pleas', 'announc', 'kelli', 'knight', 'craft', 'current', 'ambassador

#### Lemmatization
https://www.nltk.org/api/nltk.stem.html  
https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html

In [64]:
# Fetch the WordNet corpus (the log below shows it is skipped when already up to date).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/amiteshsinha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [65]:
from nltk.stem.wordnet import WordNetLemmatizer

In [67]:
# Single lemmatizer instance shared by both lemmatization cells below.
wlem = WordNetLemmatizer()

#### <span style='color:green'> Below uses default argument pos = 'n'</span>

In [70]:
# Tokenize each tweet with the TweetTokenizer, drop English stop words, and
# print the lemma of every remaining token using lemmatize()'s default pos.
#
# The stop-word list is loaded once into a set instead of being re-read and
# linearly scanned for every token.
english_stopwords = set(stopwords.words('english'))
for tweet in all_tweets:
    words = tt.tokenize(tweet)
    non_stop_words = [w for w in words if w not in english_stopwords]
    lem_words = [wlem.lemmatize(w) for w in non_stop_words]
    print(lem_words)

['one', 'shred', 'evidence', 'president', 'done', 'anything', 'constitutionally', 'anything', 'else', 'wrong', 'graham', 'ledger', 'thank', 'graham', 'true']
['people', 'venezuela', 'stand', 'threshold', 'history', 'ready', 'reclaim', 'country', 'future', 'pic', 'twitter', 'com', 'ajxd', '1en64c']
['74', 'year', 'ago', 'today', 'marine', 'iwo', 'jima', 'raised', 'flag', 'atop', 'mount', 'suribachi', 'semper', 'fidelis', 'pic', 'twitter', 'com', 'usq', '2z344xp']
['far', 'energy', 'right', 'left', 'senate', 'win', 'big', '2020', 'fake', 'news', 'want', 'report', 'fact', 'border', 'security', 'big', 'factor', 'construction', 'wall', 'stop', 'gang', 'drug', 'crime']
['god', 'bless', 'people', 'venezuela']
['great', 'new', 'book', 'case', 'trump', 'victor', 'davis', 'hanson', 'hoover', 'senior', 'fellow']
['kelly', 'done', 'outstanding', 'job', 'representing', 'nation', 'doubt', 'leadership', 'country', 'represented', 'highest', 'level', 'congratulation', 'kelly', 'entire', 'family']
['ple

#### <span style='color:purple'> Below passes the explicit argument pos = 'v' (verb) instead of the default</span>

In [71]:
# Tokenize each tweet with the TweetTokenizer, drop English stop words, and
# print the lemma of every remaining token with pos='v' passed explicitly.
#
# The stop-word list is loaded once into a set instead of being re-read and
# linearly scanned for every token.
english_stopwords = set(stopwords.words('english'))
for tweet in all_tweets:
    words = tt.tokenize(tweet)
    non_stop_words = [w for w in words if w not in english_stopwords]
    lem_words = [wlem.lemmatize(w,pos = 'v') for w in non_stop_words]
    print(lem_words)

['one', 'shred', 'evidence', 'president', 'do', 'anything', 'constitutionally', 'anything', 'else', 'wrong', 'graham', 'ledger', 'thank', 'graham', 'true']
['people', 'venezuela', 'stand', 'threshold', 'history', 'ready', 'reclaim', 'country', 'future', 'pic', 'twitter', 'com', 'ajxd', '1en64c']
['74', 'years', 'ago', 'today', 'marines', 'iwo', 'jima', 'raise', 'flag', 'atop', 'mount', 'suribachi', 'semper', 'fidelis', 'pic', 'twitter', 'com', 'usq', '2z344xp']
['far', 'energy', 'right', 'leave', 'senate', 'win', 'big', '2020', 'fake', 'news', 'want', 'report', 'facts', 'border', 'security', 'big', 'factor', 'construction', 'wall', 'stop', 'gang', 'drug', 'crime']
['god', 'bless', 'people', 'venezuela']
['great', 'new', 'book', 'case', 'trump', 'victor', 'davis', 'hanson', 'hoover', 'senior', 'fellow']
['kelly', 'do', 'outstanding', 'job', 'represent', 'nation', 'doubt', 'leadership', 'country', 'represent', 'highest', 'level', 'congratulations', 'kelly', 'entire', 'family']
['please',