<a href="https://colab.research.google.com/github/esraa-abdelmaksoud/Shai-Training-Notebooks/blob/main/Twitter_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import packages
import nltk
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
import re
from collections import Counter
from itertools import chain

In [2]:
# Download some tweets
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [3]:
# Select all positive and negative tweets
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

In [4]:
# Print tweets count
print(f'positive count: {len(positive_tweets)}')
print(f'negative count: {len(negative_tweets)}')

positive count: 5000
negative count: 5000


In [5]:
# Check data type
print(type(positive_tweets))

<class 'list'>


In [6]:
# Load the tokenizer
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [7]:
# Load stop words
nltk.download('stopwords')
stopwords_list = stopwords.words('english') 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Load lemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
for i in range(len(positive_tweets)):
  # Convert to lowercase
  positive_tweets[i] = positive_tweets[i].lower()
  negative_tweets[i] = negative_tweets[i].lower()
  # Remove links
  positive_tweets[i] = re.sub('(http)\S+', '', positive_tweets[i])
  negative_tweets[i] = re.sub('(http)\S+', '', negative_tweets[i])
  # Remove RT
  positive_tweets[i] = re.sub('(RT)\s(@)\S+', '', positive_tweets[i])
  negative_tweets[i] = re.sub('(RT)\s(@)\S+', '', negative_tweets[i])
  # Remove mentions
  positive_tweets[i] = re.sub('(@)\S+', '', positive_tweets[i])
  negative_tweets[i] = re.sub('(@)\S+', '', negative_tweets[i])
  # Remove hashtags
  positive_tweets[i] = re.sub('(#)\S+', '', positive_tweets[i])
  negative_tweets[i] = re.sub('(#)\S+', '', negative_tweets[i])
  # Remove HTML tags
  positive_tweets[i] = re.sub('[<](a href)*[/]*\S+[>]', '', positive_tweets[i])
  negative_tweets[i] = re.sub('[<](a href)*[/]*\S+[>]', '', negative_tweets[i])
  # Lemmatize text
  positive_tweets[i] = lemmatizer.lemmatize(positive_tweets[i])
  negative_tweets[i] = lemmatizer.lemmatize(negative_tweets[i])
  # Tokenize tweets, remove emojis, punctuation
  positive_tweets[i] = tokenizer.tokenize(positive_tweets[i])
  negative_tweets[i] = tokenizer.tokenize(negative_tweets[i])
  # Remove stop words
  positive_tweets[i] = [word for word in positive_tweets[i] if word not in stopwords_list]
  negative_tweets[i] = [word for word in negative_tweets[i] if word not in stopwords_list]


In [10]:
# Checking a tweet after text preprocessing
print(positive_tweets[20])

['top', 'new', 'followers', 'community', 'week']


In [11]:
# Flatten lists
pos_words = list(chain(*positive_tweets))
neg_words = list(chain(*negative_tweets))

In [12]:
# Counting words
pos_counts = Counter(pos_words,sorted=True)
neg_counts = Counter(neg_words,sorted=True)

In [14]:
# Getting most common words
mc_pos_keys = [pos_counts.most_common(20)[i][0] for i in range(20)] 
mc_neg_keys = [neg_counts.most_common(20)[i][0] for i in range(20)] 

In [15]:
print(mc_pos_keys)
print(mc_neg_keys)

['thanks', 'follow', 'love', 'u', 'thank', 'good', 'like', 'day', 'happy', 'amp', 'great', 'hi', '3', 'get', 'see', 'back', 'know', 'lt', 'new', 'p']
['please', 'miss', 'want', 'like', 'u', 'get', 'sorry', 'one', 'follow', 'time', 'much', 'go', 'really', 'love', 'know', 'im', 'still', 'sad', 'back', 'today']


In [16]:
lc_pos_keys = list(pos_counts)[-21:]
lc_neg_keys = list(neg_counts)[-21:]

In [17]:
print(lc_pos_keys)
print(lc_neg_keys)

['fledged', 'workplace', 'venue', 'lagos', 'luxord', 'kingdom', 'potatos', 'hundreds', 'cited', 'academic', 'pokiri', '1nenokkadine', 'favs', 'heritage', 'wood', 'beleaf', 'peasant', 'ahahha', 'reminders', 'distant', 'adulthood']
['konami', 'policy', 'pes', 'rantie', 'atm', 'perverse', 'bracelets', 'twins', 'bylfnnz', 'banned', 'press', 'duper', 'waaah', 'jaebum', 'ahmad', 'maslan', 'cooks', 'hull', 'supporter', 'expecting', 'misserable']


In [18]:
# Getting least common words
for i in range(20):
  pos_counts.pop(mc_pos_keys[i])
  neg_counts.pop(mc_neg_keys[i])
  pos_counts.pop(lc_pos_keys[i])
  neg_counts.pop(lc_neg_keys[i])
  pos_words.remove(mc_pos_keys[i])
  neg_words.remove(mc_neg_keys[i])
  pos_words.remove(lc_pos_keys[i])
  neg_words.remove(lc_neg_keys[i])

In [19]:
print(pos_counts.most_common(20))
print(neg_counts.most_common(20))

[('one', 127), ('hope', 123), ('today', 115), ('us', 115), ('time', 113), ('friday', 101), ('nice', 100), ('morning', 98), ('please', 96), ('let', 93), ('much', 89), ('would', 85), ('via', 85), ('go', 82), ('well', 81), ('really', 79), ('gt', 78), ('hey', 77), ('lot', 77), ('1', 75)]
[('followed', 110), ('see', 108), ('amp', 101), ('good', 99), ('feel', 99), ('got', 99), ('day', 96), ('need', 95), ('wanna', 94), ('oh', 92), ('work', 91), ('wish', 88), ('going', 86), ('sleep', 82), ('thanks', 77), ('people', 76), ('would', 72), ('hope', 72), ('3', 72), ('could', 72)]


In [20]:
print(pos_words[:20])
print(neg_words[:20])

['top', 'engaged', 'members', 'community', 'week', 'hey', 'james', 'odd', 'please', 'call', 'contact', 'centre', '02392441234', 'able', 'assist', 'many', 'listen', 'last', 'night', 'bleed']
['hopeless', 'tmr', 'everything', 'kids', 'section', 'ikea', 'cute', 'shame', 'nearly', '19', '2', 'months', 'heart', 'sliding', 'waste', 'basket', 'hate', 'japanese', 'call', 'bani']
