<a href="https://colab.research.google.com/github/dheerajjoshim/machinelearningcourse/blob/master/Sentiment_Analysis_with_Logistic_Regression_(2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Dataset

In [1]:
import nltk
from nltk.corpus import twitter_samples
import numpy as np
nltk.download('twitter_samples')
positive_tweets =twitter_samples.strings('positive_tweets.json')
negative_tweets =twitter_samples.strings('negative_tweets.json')
example_postive_tweet=positive_tweets[1]
example_negative_tweet=negative_tweets[0]
test_pos = positive_tweets[4000:]
train_pos = positive_tweets[:4000]
test_neg = negative_tweets[4000:]
train_neg = negative_tweets[:4000]
train_x = train_pos + train_neg 
test_x = test_pos + test_neg
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


In [19]:
print("positive tweet-> ",example_postive_tweet)
print("negative tweet-> ",example_negative_tweet)

positive tweet->  @Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!
negative tweet->  hopeless for tmr :(


# Feature Engineering

## Tokenizing

In [3]:
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
tokens = tokenizer.tokenize(example_postive_tweet)
tokens

['@Lamb2ja',
 'Hey',
 'James',
 '!',
 'How',
 'odd',
 ':/',
 'Please',
 'call',
 'our',
 'Contact',
 'Centre',
 'on',
 '02392441234',
 'and',
 'we',
 'will',
 'be',
 'able',
 'to',
 'assist',
 'you',
 ':)',
 'Many',
 'thanks',
 '!']

## Removing Stopwords

In [4]:
from nltk.corpus import stopwords  
nltk.download('stopwords')
stopwords_english = stopwords.words('english') 
stopwords_english[:10]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [5]:
import string
tweet_processsed=[word for word in tokens 
    if word not  in stopwords_english and word not in string.punctuation]
tweet_processsed

['@Lamb2ja',
 'Hey',
 'James',
 'How',
 'odd',
 ':/',
 'Please',
 'call',
 'Contact',
 'Centre',
 '02392441234',
 'able',
 'assist',
 ':)',
 'Many',
 'thanks']

## Stemming

In [6]:
from nltk.stem import PorterStemmer  
stemmer = PorterStemmer() 
tweet_after_stem=[]
for word in tweet_processsed:
    word=stemmer.stem(word)
    tweet_after_stem.append(word)
tweet_after_stem

['@lamb2ja',
 'hey',
 'jame',
 'how',
 'odd',
 ':/',
 'pleas',
 'call',
 'contact',
 'centr',
 '02392441234',
 'abl',
 'assist',
 ':)',
 'mani',
 'thank']

In [20]:
import re                                  
import string
from nltk.corpus import stopwords          
from nltk.stem import PorterStemmer        
from nltk.tokenize import TweetTokenizer
nltk.download('stopwords')
def text_process(tweet):
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    tweet = re.sub(r'@[\w\d]+', '', tweet)
    tokenizer = TweetTokenizer()
    tweet_tokenized = tokenizer.tokenize(tweet)
    stopwords_english = stopwords.words('english') 
    tweet_processsed=[word for word in tweet_tokenized 
    if word not  in stopwords_english and word not in       
    string.punctuation]
    stemmer = PorterStemmer() 
    tweet_after_stem=[]
    for word in tweet_processsed:
        word=stemmer.stem(word)
        tweet_after_stem.append(word)
    return tweet_after_stem

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
print(text_process(example_postive_tweet))

['hey', 'jame', 'how', 'odd', ':/', 'pleas', 'call', 'contact', 'centr', '02392441234', 'abl', 'assist', ':)', 'mani', 'thank']


# Word Encodings

In [21]:
pos_words=[]
for tweet in train_pos:
    tweet=text_process(tweet)
    
    for word in tweet:
        
        pos_words.append(word)
freq_pos={}
for word in pos_words:
    if (word,1) not in freq_pos:
        freq_pos[(word,1)]=1
    else:
        freq_pos[(word,1)]=freq_pos[(word,1)]+1
neg_words=[]
for tweet in train_neg:
    tweet=text_process(tweet)
    
    for word in tweet:
        
        neg_words.append(word)
freq_neg={}
for word in neg_words:
    if (word,0) not in freq_neg:
        freq_neg[(word,0)]=1
    else:
        freq_neg[(word,0)]=freq_neg[(word,0)]+1
freqs_dict = dict(freq_pos)
freqs_dict.update(freq_neg)

In [22]:
def features_extraction(tweet, freqs_dict):
    word_l = text_process(tweet)
    x = np.zeros((1, 3))
    x[0,0] = 1 
    for word in word_l:
        try:
            x[0,1] += freqs_dict[(word,1)]
        except:
            x[0,1] += 0
        try: 
            x[0,2] += freqs_dict[(word,0.0)]
        except:
            x[0,2] += 0
    assert(x.shape == (1, 3))
    return x

# Logistic Regression

In [23]:
def sigmoid(x): 
    h = 1/(1+np.exp(-x))
    return h

def gradientDescent(x, y, theta, alpha, num_iters):
    m = x.shape[0]
    for i in range(0, num_iters):
        z = np.dot(x,theta)
        h = sigmoid(z)
        J = -1/m*(np.dot(y.T,np.log(h))+np.dot((1-y).T,np.log(1-h)))
        theta = theta-(alpha/m)*np.dot(x.T,h-y)
    J = float(J)
    return J, theta

In [24]:
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :]= features_extraction(train_x[i], freqs_dict)
Y = train_y
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)

# Prediction and Model Accuracy

In [25]:
def test_accuracy_with_rule_based_model(test_x, test_y):
  y_hat1 = []
  for each in test_x:
    if each[1]>each[2]:
      y_hat1.append(1)
    else:
      y_hat1.append(0)
  m=len(y_hat1)
  y_hat1=np.array(y_hat1)
  y_hat1=y_hat1.reshape(m)
  test_y=test_y.reshape(m)

  c=y_hat1==test_y
  j=0
  j= len([x for x in c if x==True])
  accuracy1 = j/m
  return accuracy1
accuracy1 = test_accuracy_with_rule_based_model(test_x, test_y)

In [26]:
print(accuracy1*100,'%')

49.95 %


In [27]:
def predict(tweet, freqs_dict, theta):
    x = features_extraction(tweet,freqs_dict)
    y_pred = sigmoid(np.dot(x,theta))
    return y_pred
def test_accuracy_with_logistic_regression(test_x, test_y, freqs_dict, theta):
    y_hat = []
    for tweet in test_x:
        
        y_pred = predict(tweet, freqs_dict, theta)
        
        if y_pred > 0.5:
           
            y_hat.append(1)
        else:
            
            y_hat.append(0)
    m=len(y_hat)
    y_hat=np.array(y_hat)
    y_hat=y_hat.reshape(m)
    test_y=test_y.reshape(m)
    
    c=y_hat==test_y
    j=0
    j= len([x for x in c if x==True])
    accuracy = j/m
    return accuracy
accuracy = test_accuracy_with_logistic_regression(test_x, test_y, freqs_dict, theta)

In [28]:
print(accuracy*100,'%')

97.8 %


In [29]:
def test_your_own_tweet(tweet, freqs_dict, theta):
  y_pred = predict(tweet, freqs_dict, theta)

  if y_pred > 0.5:
      
      print("positve")
  else:
      
      print("negative")
tweet = "I'm happy, not sad"
test_your_own_tweet(tweet, freqs_dict, theta)

negative
