In [None]:
import nltk
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords

import numpy as np

In [None]:
# Fetch the NLTK corpora this notebook relies on: the sample tweets used for
# training/testing and the stopword lists used during preprocessing.
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the tweet texts from the positive and the negative sample files.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')


In [None]:
import re
import string

from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

def process_tweet(tweet):
  """Turn a raw tweet string into a list of cleaned, stemmed tokens.

  Steps: strip stock tickers, a leading retweet marker, hyperlinks and '#'
  characters; tokenize (lower-cased, @handles stripped, elongated words
  shortened); drop English stopwords and punctuation; Porter-stem the rest.

  Args:
    tweet: the tweet text.

  Returns:
    List of stemmed token strings, in the order they appear in the tweet.
  """
  stemmer = PorterStemmer()
  # A set gives O(1) membership tests instead of scanning the list per token.
  stopwords_english = set(stopwords.words('english'))

  # remove the stock market tickers (e.g. $GE)
  tweet = re.sub(r'\$\w*', '', tweet)

  # remove the old style retweet text 'RT' at the start of the tweet
  tweet = re.sub(r'^RT[\s]+', '', tweet)

  # remove the hyperlinks; \S+ stops at the first whitespace so the words
  # that follow a link on the same line are kept (a greedy '.*' would delete
  # the remainder of the line together with the URL)
  tweet = re.sub(r'https?://\S+', '', tweet)

  # remove the # symbol but keep the hashtag word itself
  tweet = re.sub(r'#', '', tweet)

  # Tokenize the tweet
  tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
  tweet_tokens = tokenizer.tokenize(tweet)

  # Drop stopwords and punctuation, then stem what is left.
  # `in string.punctuation` is a substring test: single punctuation characters
  # (and any run that occurs verbatim in string.punctuation) are removed,
  # while tokens such as ':)' are kept.
  return [
      stemmer.stem(word)
      for word in tweet_tokens
      if word not in stopwords_english and word not in string.punctuation
  ]

In [None]:
def count_tweets(tweets, ys):
  """Count how often each (stemmed word, label) pair occurs.

  Args:
    tweets: iterable of raw tweet strings.
    ys: labels aligned with `tweets`; any array-like that flattens to one
      label per tweet (for example shape (m,) or (m, 1)).

  Returns:
    dict mapping (word, label) -> number of occurrences over all tweets.
  """
  # ravel (rather than squeeze) keeps a one-element input as a 1-item list;
  # squeeze would collapse it to a bare scalar that zip() cannot iterate.
  ys_list = np.ravel(ys).tolist()
  freqs = {}

  for y, tweet in zip(ys_list, tweets):
    for word in process_tweet(tweet):
      pair = (word, y)
      freqs[pair] = freqs.get(pair, 0) + 1

  return freqs

In [None]:
def lookup(freqs, word, label):
  """Return the count stored in `freqs` for (word, label), or 0 if absent."""
  return freqs.get((word, label), 0)

In [None]:
# Split each class into a training part (first 4000 tweets) and a test part
# (everything after that).
train_pos = all_positive_tweets[:4000]
test_pos = all_positive_tweets[4000:]

train_neg = all_negative_tweets[:4000]
test_neg = all_negative_tweets[4000:]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

# Label arrays: 1 for positive tweets, 0 for negative ones, in the same order
# in which the tweets were concatenated above.
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
# The block of ones must be sized from test_pos (not test_neg); otherwise the
# labels go out of step with test_x whenever the two test sets differ in size.
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

# Every tweet must have exactly one label.
assert len(train_x) == len(train_y)
assert len(test_x) == len(test_y)

In [None]:
# Build the (word, label) -> count frequency dictionary from the training set
freqs = count_tweets(train_x, train_y)

def train_naive_bayes(freqs, train_x, train_y):
  """Fit the Naive Bayes log prior and per-word log-likelihood ratios.

  Args:
    freqs: dict mapping (word, label) -> count. A label > 0 is counted as
      the positive class, anything else as the negative class.
    train_x: training tweets. Not used by the computation; kept in the
      signature so existing callers keep working.
    train_y: iterable of training labels (> 0 positive, <= 0 negative).

  Returns:
    (logprior, loglikelihood): logprior is log(D_pos) - log(D_neg), where
    D_pos / D_neg are the numbers of positive / negative labels in train_y;
    loglikelihood maps every vocabulary word to
    log(P(word | positive) / P(word | negative)) with add-one smoothing.
  """
  # V: number of distinct words over both classes
  vocab = {word for word, _ in freqs}
  V = len(vocab)

  # N_pos / N_neg: total number of word occurrences in each class
  N_pos = N_neg = 0
  for (_, label), count in freqs.items():
    if label > 0:
      N_pos += count
    else:
      N_neg += count

  # D_pos / D_neg: number of positive / negative documents (tweets)
  D_pos = sum(1 for y in train_y if y > 0)
  D_neg = sum(1 for y in train_y if y <= 0)

  # log of the class ratio; 0.0 when the classes are balanced
  logprior = np.log(D_pos) - np.log(D_neg)

  loglikelihood = {}
  for word in vocab:
    freq_pos = freqs.get((word, 1), 0)
    freq_neg = freqs.get((word, 0), 0)

    # add-one smoothing keeps both probabilities non-zero, so the ratio and
    # its log are always defined
    p_w_pos = (freq_pos + 1) / (N_pos + V)
    p_w_neg = (freq_neg + 1) / (N_neg + V)

    loglikelihood[word] = np.log(p_w_pos / p_w_neg)

  return logprior, loglikelihood

In [None]:
# Train on the training split, then show the log prior and the number of
# words that received a log-likelihood.
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
9086


In [None]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
  """Score a tweet: the log prior plus the log-likelihood of each known token.

  Args:
    tweet: raw tweet text.
    logprior: the prior term added once to every score.
    loglikelihood: mapping from token to its log-likelihood value.

  Returns:
    The accumulated score for the tweet.
  """
  tokens = process_tweet(tweet)
  score = 0 + logprior

  for token in tokens:
    if token not in loglikelihood:
      # tokens without an entry contribute nothing to the score
      continue
    score += loglikelihood[token]

  return score

In [None]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
  """Return the classifier's accuracy on a labelled set of tweets.

  Args:
    test_x: iterable of raw tweet strings.
    test_y: array-like of labels aligned with test_x (1 positive, 0 negative).
    logprior: prior term passed through to naive_bayes_predict.
    loglikelihood: word -> log-likelihood mapping passed through to
      naive_bayes_predict.

  Returns:
    1 minus the mean absolute difference between labels and predictions.
  """
  # A score above 0 is predicted as positive (1), anything else as negative (0).
  y_hats = [
      1 if naive_bayes_predict(tweet, logprior, loglikelihood) > 0 else 0
      for tweet in test_x
  ]

  # With 0/1 labels the mean absolute difference is the share of wrong
  # predictions.
  error = np.mean(np.absolute(np.asarray(test_y) - np.asarray(y_hats)))
  return 1 - error


# Report the accuracy. The call has to sit at cell level: placed after the
# function's `return` it was unreachable and nothing was ever printed.
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_x, test_y, logprior, loglikelihood)))

99,4%

In [None]:
# Spot-check the classifier on two sample texts and print each score.
m=['Same folks said daikon paste could treat a cytokine storm PfizerBioNTech','coronavirus SputnikV AstraZeneca PfizerBioNTech Moderna Covid_19 Russian vaccine is created to last 2-4 years']
for k in m:
  p=naive_bayes_predict(k,logprior, loglikelihood)
  print(p)

1.1654634865206077
-1.980715135296889


In [None]:
# Display `p`; it holds the score most recently assigned to it.
p

-1.980715135296889

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive (requires the Colab runtime).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

In [None]:
# Location of the tweets CSV inside the mounted Drive.
path = '/content/drive/My Drive/Colab Notebooks/vaccination_tweets.csv'

# Only the 'text' column is read from the file.
df = pd.read_csv(path, usecols=['text'])
df.head()

Unnamed: 0,text
0,Same folks said daikon paste could treat a cyt...
1,While the world has been on the wrong side of ...
2,#coronavirus #SputnikV #AstraZeneca #PfizerBio...
3,"Facts are immutable, Senator, even when you're..."
4,Explain to me again why we need a vaccine @Bor...


In [None]:
def cleanTxt(text):
    """Strip Twitter markup from a tweet and return the remaining text.

    Removes, in this order: @mentions, '#' characters (the hashtag word is
    kept), retweet markers ('RT' as a word of its own followed by
    whitespace) and http(s) URLs.

    Args:
        text: the tweet text.

    Returns:
        The text with the markup removed. Whitespace around removed pieces
        is left as it was.
    """
    # '_' is part of the character class so a handle such as @user_name is
    # removed completely instead of leaving "_name" behind.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#', '', text)
    # \b ties 'RT' to the start of a word, so words that merely end in "RT"
    # (e.g. "ART show") are left intact.
    text = re.sub(r'\bRT[\s]+', '', text)
    text = re.sub(r'https?:\/\/\S+', '', text)

    return text


# Replace the 'text' column with its cleaned version, then preview it.
df['text'] = df['text'].apply(cleanTxt)
df.head()

Unnamed: 0,text
0,Same folks said daikon paste could treat a cyt...
1,While the world has been on the wrong side of ...
2,coronavirus SputnikV AstraZeneca PfizerBioNTec...
3,"Facts are immutable, Senator, even when you're..."
4,Explain to me again why we need a vaccine wh...


In [None]:
# Score every tweet in df['text'] with the trained classifier.
addpol=[]
k=df['text'].tolist()
for o in k:
  g= naive_bayes_predict(o,logprior, loglikelihood)
  # The score is stored as a string formatted to 4 decimals, not as a float.
  addpol.append("{:0.4f}".format(g))
# Attach the formatted scores as column 'text8'.
df['text8']=addpol 
# Last expression of the cell: the whole list is shown as the cell output.
addpol


['1.1655',
 '0.0413',
 '-1.2179',
 '2.3755',
 '0.8607',
 '0.3496',
 '-1.9008',
 '2.2082',
 '-0.9095',
 '2.4196',
 '1.2304',
 '1.2780',
 '6.9629',
 '0.2452',
 '-0.8191',
 '-0.7020',
 '1.4179',
 '-1.4587',
 '-3.3062',
 '0.1308',
 '-4.5512',
 '1.6189',
 '-0.7020',
 '2.2939',
 '-0.7343',
 '2.6214',
 '0.7389',
 '3.3376',
 '4.4607',
 '-0.0538',
 '2.4779',
 '1.2865',
 '0.2564',
 '3.9559',
 '-1.3116',
 '-0.1940',
 '-0.8987',
 '1.8476',
 '0.1049',
 '-1.9548',
 '0.9236',
 '0.5647',
 '-0.2569',
 '-0.7020',
 '0.4515',
 '-1.0844',
 '0.0747',
 '3.0208',
 '-1.2225',
 '3.1444',
 '-2.5098',
 '0.5424',
 '-0.0422',
 '3.0413',
 '-0.3059',
 '0.0342',
 '2.9939',
 '2.2692',
 '2.9513',
 '-1.4692',
 '-0.3935',
 '2.0686',
 '-2.0591',
 '-0.1977',
 '2.3489',
 '-0.0690',
 '0.0165',
 '2.1032',
 '-1.4198',
 '-0.1434',
 '1.8993',
 '3.6555',
 '-0.0631',
 '-1.2294',
 '2.2465',
 '1.4762',
 '2.6471',
 '-0.5873',
 '-1.2007',
 '3.8117',
 '1.0015',
 '3.1747',
 '2.6910',
 '-0.2129',
 '-1.4169',
 '-1.7540',
 '0.1526',
 '3.478

In [None]:
# Display the frame, now including the 'text8' score column.
df

Unnamed: 0,text,text8
0,Same folks said daikon paste could treat a cyt...,1.1655
1,While the world has been on the wrong side of ...,0.0413
2,coronavirus SputnikV AstraZeneca PfizerBioNTec...,-1.2179
3,"Facts are immutable, Senator, even when you're...",2.3755
4,Explain to me again why we need a vaccine wh...,0.8607
...,...,...
8077,"Off for my second vaccine, then work. 👋💉\nCure...",-0.0131
8078,“When” appears to be a million-dollar question...,0.9341
8079,Second PfizerBioNTech vaccine shot done! Thank...,2.7372
8080,Second PfizerBioNTech vaccine is officially in...,3.8032


In [None]:
def getAnalysis(score):
  """Map a Naive Bayes score to a sentiment label.

  The classifier counts a score above 0 as the positive class, so high
  scores are labelled 'Positive' and scores below 0 'Negative'.

  Args:
    score: numeric score of a tweet.

  Returns:
    'Positive' if score > 0.5, 'Negative' if score < 0, otherwise 'Neutral'
    (i.e. for scores from 0 to 0.5 inclusive).
  """
  if score > 0.5:
    return 'Positive'
  elif score < 0:
    return 'Negative'
  else:
    return 'Neutral'

In [None]:
# 'text8' holds the scores as formatted strings, so convert them to float
# before mapping each score to its sentiment label in 'text9'.
df['text9']=df['text8'].astype(float).apply(getAnalysis)

In [None]:
# Display the frame, now including the 'text9' label column.
df


Unnamed: 0,text,text3,text8,text9
0,Same folks said daikon paste could treat a cyt...,-0.987422,1.1655,Negative
1,While the world has been on the wrong side of ...,-0.987422,0.0413,Neutral
2,coronavirus SputnikV AstraZeneca PfizerBioNTec...,-0.987422,-1.2179,Positive
3,"Facts are immutable, Senator, even when you're...",-0.987422,2.3755,Negative
4,Explain to me again why we need a vaccine wh...,-0.987422,0.8607,Negative
...,...,...,...,...
8077,"Off for my second vaccine, then work. 👋💉\nCure...",-0.987422,-0.0131,Positive
8078,“When” appears to be a million-dollar question...,-0.987422,0.9341,Negative
8079,Second PfizerBioNTech vaccine shot done! Thank...,-0.987422,2.7372,Negative
8080,Second PfizerBioNTech vaccine is officially in...,-0.987422,3.8032,Negative


In [None]:
# Count the tweets carrying each sentiment label.
Neutral = len(df[df['text9']=='Neutral'])
Negative = len(df[df['text9']=='Negative'])
Positive = len(df[df['text9']=='Positive'])
# labels, values and colors share one order: Negative, Positive, Neutral.
labels = ['Negative','Positive','Neutral']
values = [Negative,Positive,Neutral]
# Pie chart of the label distribution.
import plotly.graph_objects as go
colors = ['darkred','green', 'darkblue' ]

fig = go.Figure(data=[go.Pie(labels=labels,
                             values=values)])
# Show the percentage inside each slice and outline the slices in black.
fig.update_traces(hoverinfo='label+percent', textinfo='percent', textfont_size=20,textposition='inside',
                  marker=dict(colors=colors, line=dict(color='black', width=1)))
fig.show()