<a href="https://colab.research.google.com/github/fagoon1311/LGMVIP-DataScience/blob/main/SentimentLg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import nltk, re, string
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pickle

In [28]:
def process_tweet(tweet):
  stemmer = nltk.PorterStemmer()
  stopwords_english = stopwords.words('english')
  tweet = re.sub(r'\$\w*', '', tweet)
  #The "$" character in the regular expression is used to match the "$" symbol, and "\w*" is used to match any word character (letters, digits, or underscore) zero or more 
  #times after the "$" symbol.
  #So, the regular expression "$\w*" matches any word starting with a "$" symbol, such as $BTC or $ETH.
  tweet = re.sub(r'^RT[\s]+','', tweet)
  #The "^" character in the regular expression is used to match the start of the string, so this pattern only matches "RT" if it is at the beginning of the string. 
  #The "[\s]+" pattern matches one or more 
  #whitespace characters (including spaces, tabs, and line breaks) that follow the "RT" sequence.
  tweet = re.sub(r'https?:\/\/.*[\r\n]*','', tweet)
  #The regular expression pattern starts with "https?://" which matches "http://" or "https://". The "s?" indicates that the "s" character is optional. 
  #Then, ".*" matches any character zero or more times (except for new line characters), until the end of the URL is reached (which is indicated by a space or a line break).

  #The "[\r\n]*" pattern at the end of the regular expression matches any number of carriage return ("\r") or newline ("\n") characters that may follow the URL.
  tweet = re.sub(r'#','', tweet)
  tokenizer = nltk.TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
  tweet_tokens = tokenizer.tokenize(tweet)

  tweets_clean = []
  for word in tweet_tokens:
    if (word not in stopwords_english and word not in string.punctuation):
      stem_word = stemmer.stem(word)
      tweets_clean.append(stem_word)
  return tweets_clean

In [29]:
def build_freqs(tweets, ys):
  """
  build Frequencies
  Input: 
  tweets - a list of tweets
  ys - an mx1 array with sentiment label of each tweet

  Output:
  Freqs: a dictionary mapping each word, sentiment pair to its frequency

  """
  # convert np array to list since zip needs an iterable
  # The sequence is neccesary or the list ends up with one element
  # Also note that this is just a NOP if ys is already a list
  yslist = np.squeeze(ys).tolist()

  # Start with an empty dictionary and populate it with looping over all tweets
  # and over all processed words in each tweet.
  freqs = {}
  for y, tweet in zip(yslist, tweets):
    for word in process_tweet(tweet):
      pair = (word, y)
      if pair in freqs:
        freqs[pair] += 1
      else:
        freqs[pair] = 1
  return freqs

In [30]:
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
# Checking the above code with an example.
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1,0,0,0,0]
res = build_freqs(tweets, ys)
print(res)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}


In [32]:
# select theset of + and - tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [33]:
# splitting data into 2 pieces to train and test
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

In [34]:
train_x = train_pos + train_neg
test_x = test_pos + test_neg

In [35]:
# combine postive and negative labels
# we are building our y- target variable here
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [36]:
# create frequency dictionary
freqs = build_freqs(train_x, train_y)

In [37]:
# checking the op
print("type(freqs = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs = <class 'dict'>
len(freqs) = 11337


In [38]:
# TEST THE FN BELOW
print('This is an example of a positive tweet: \n', train_x[22])
print('\nThis is an example of processed version of the tweet: \n', process_tweet(train_x[22]))

This is an example of a positive tweet: 
 @gculloty87 Yeah I suppose she was lol! Chat in a bit just off out x :))

This is an example of processed version of the tweet: 
 ['yeah', 'suppos', 'lol', 'chat', 'bit', 'x', ':)']


In [39]:
# building a logistic model from the scratch
# Logistic Func
# sigmoid Function
def sigmoid(z):
  zz = np.negative(z)
  h = 1/(1+np.exp(zz))
  return h

In [40]:
def gradientDescent(x, y, theta, alpha, num_iters):
  m = x.shape[0]
  for i in range(0, num_iters):
    z = np.dot(x, theta)
    h = sigmoid(z)
    cost = -1./m* (np.dot(y.transpose(), np.log(h)) + np.dot((1-y).transpose(), np.log(1 - h)))
    theta = theta - (alpha/m) * np.dot(x.transpose(), (h - y))
  cost = float(cost)
  return cost, theta



In [41]:
# extracted the features
def extract_features(tweet, freqs):
  word_1 = process_tweet(tweet)
  x = np.zeros((1,3))
  x[0,0]=1
  for word in word_1:
    # increment the word count for the positive label 1
    x[0,1] += freqs.get((word, 1.0), 0)
    # increment the word count for the negative label 0
    x[0,2] += freqs.get((word, 0.0), 0)

  assert(x.shape == (1,3))
  return x 

In [42]:
# test on training data
tmp1 = extract_features(train_x[22], freqs)
print(tmp1)

[[1.000e+00 3.006e+03 1.240e+02]]


In [43]:
# training the model
# collect the features 'x' and stack them into a matrix x
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
  X[i, :] = extract_features(train_x[i], freqs)

  # training lables corresponding to  x
  Y = train_y
  # apply gradient descent
  # there values are predefined
  J, theta = gradientDescent(X, Y, np.zeros((3,1)), 1e-9, 1500)

In [44]:
def predict_tweet(tweet, freqs, theta):
  x = extract_features(tweet, freqs)
  y_pred = sigmoid(np.dot(x, theta))
  return y_pred
  

In [45]:
def test_logistic_regression(test_x, test_y, freqs, theta):
  y_hat = []
  for tweet in test_x:
    y_pred = predict_tweet(tweet, freqs, theta)
    if y_pred>0.5:
      y_hat.append(1)
    else:
      y_hat.append(0)
  accuracy = (y_hat == np.squeeze(test_y)).sum() / len(test_x)

In [46]:
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(tmp_accuracy)

None


In [47]:
def pre(sentence):
  yhat = predict_tweet(sentence, freqs, theta)
  if yhat > 0.5:
    return 'Positive Sentiment'
  elif yhat==0:
    return 'Neutral sentiment'
  else:
    return 'Negative Sentiment'

In [48]:
my_tweet = 'It is so hot today but it is the perfect day for a beach party'
res = pre(my_tweet)
print(res)

Positive Sentiment
