# Sentiment Analysis with Logistic Regression
Given a tweet, decide if it has a positive sentiment or a negative one. 
* Extract features for logistic regression
* Logistic regression implementation
* Error analysis

In [1]:
import nltk
from nltk.corpus import twitter_samples
import numpy as np
import pandas as pd
from utils import process_tweet, build_freqs

# Fetch the NLTK resources this notebook relies on. twitter_samples is read
# directly below; stopwords is presumably needed by process_tweet in utils
# (TODO confirm). The calls only report "already up-to-date" when cached.
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/d062562/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/d062562/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# load the data: raw tweet strings from the two labelled files of the
# NLTK twitter_samples corpus
positive_tweets = twitter_samples.strings("positive_tweets.json")
negative_tweets = twitter_samples.strings("negative_tweets.json")

In [3]:
# split the data to train and test sets
# The first 80% of each class goes to training, the remainder to testing.
n = len(positive_tweets)
train_n = int(0.8 * n)
test_n = n - train_n

train_pos_x = positive_tweets[:train_n]
train_neg_x = negative_tweets[:train_n]
test_pos_x = positive_tweets[train_n:]
test_neg_x = negative_tweets[train_n:]

# positives first, then negatives -- the label arrays below follow the same order
train_x = train_pos_x + train_neg_x
test_x = test_pos_x + test_neg_x

# Size each label block from the slice it labels (not from the positive-class
# count alone) so that x and y stay aligned even if the two classes differ in size.
train_y = np.append(np.ones((len(train_pos_x), 1)), np.zeros((len(train_neg_x), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos_x), 1)), np.zeros((len(test_neg_x), 1)), axis=0)

assert len(train_x) == train_y.shape[0], "train features/labels misaligned"
assert len(test_x) == test_y.shape[0], "test features/labels misaligned"

In [4]:
# sigmoid function
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)).

    Input:
        z: a scalar or numpy array
    Output:
        the sigmoid of z, computed element-wise for arrays
    """
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

In [29]:
# gradient descent
def gradient_descent(x, y, theta, alpha, num_iter):
    """Train logistic regression via batch gradient descent.

    Input:
        x: feature matrix of dim (m, k); this notebook uses k = 3
           (bias, positive count sum, negative count sum)
        y: labels vector of dim (m, 1)
        theta: initial parameters of the logistic regression model, dim (k, 1)
        alpha: learning rate
        num_iter: number of iterations
    Output:
        J: cost as a Python float, evaluated on the hypothesis used for the
           last gradient step (i.e. before that step's update is applied);
           for num_iter < 1 it is the cost of the unchanged input theta
        theta: the parameters after num_iter updates
    """
    m = x.shape[0]
    h = None
    for _ in range(num_iter):
        h = sigmoid(np.dot(x, theta))
        gradient = 1. / float(m) * np.dot(x.T, h - y)
        theta = theta - alpha * gradient

    if h is None:
        # No iteration ran: report the cost of the initial parameters instead
        # of failing on float(None).
        h = sigmoid(np.dot(x, theta))

    # Only the final cost is returned, so compute it once after the loop from
    # the same h the last iteration used.
    J = - 1. / float(m) * (np.dot(y.T, np.log(h)) + np.dot(1. - y.T, np.log(1. - h)))

    # J has shape (1, 1); squeeze to 0-d first because float() on an array with
    # ndim > 0 is deprecated in NumPy >= 1.25.
    J = float(np.squeeze(J))
    return J, theta

In [62]:
# test the gradient descent function
np.random.seed(1)

# Synthetic design matrix: a bias column of ones plus two random features in [0, 2000).
tmp_x = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
# Synthetic binary labels of shape (10, 1).
tmp_y = (np.random.rand(10, 1) > 0.35).astype(float)

tmp_J, tmp_theta = gradient_descent(tmp_x, tmp_y, np.zeros((3, 1)), 1e-8, 700)
print(f"Loss: {tmp_J:.8f}")
# Convert to plain Python floats so the list prints the same under NumPy 1.x and 2.x.
print([round(float(t), 8) for t in np.squeeze(tmp_theta)])

Loss: 0.67094970
[4.1e-07, 0.00035658, 7.309e-05]


In [58]:
def extract_features(tweet, freqs):
    """Build the 3-element feature vector for a single tweet.

    Input:
        tweet: the tweet text; it is tokenised with process_tweet
        freqs: dictionary looked up by (token, label) keys, with label 1. for
               the positive class and 0. for the negative class; missing keys
               count as 0
    Output:
        numpy array [1., positive_total, negative_total], where each total is
        the sum of the freqs entries over the tweet's tokens and the leading
        1. is the bias term
    """
    positive_total = 0.
    negative_total = 0.
    for word in process_tweet(tweet):
        positive_total += freqs.get((word, 1.), 0.)
        negative_total += freqs.get((word, 0.), 0.)

    return np.array([1., positive_total, negative_total])

In [59]:
# build the frequency dictionary from the training set; extract_features
# looks it up by (token, label) key. No model is trained in this cell.
freqs = build_freqs(train_x, train_y)

In [60]:
# sanity check: feature vector [bias, positive total, negative total] of the first training tweet
extract_features(train_x[0], freqs)

array([1.00e+00, 3.02e+03, 6.10e+01])

In [65]:
# sanity check: tokens absent from freqs contribute nothing, leaving only the bias term
extract_features('blorb bleeeeb bloooob', freqs)

array([1., 0., 0.])

In [73]:
# Assemble one feature row per training tweet, then fit theta by gradient descent.
m = len(train_x)
train_features = np.zeros((m, 3))
for row, tweet in zip(train_features, train_x):
    # `row` is a view into train_features, so slice assignment fills the matrix in place
    row[:] = extract_features(tweet, freqs)

initial_theta = np.zeros((3, 1))
loss, theta = gradient_descent(train_features, train_y, initial_theta, 1e-9, 2500)
print(f"Loss: {loss}")
print(theta)

Loss: 0.19017944918474783
[[ 1.22177916e-07]
 [ 7.01963417e-04]
 [-6.94830463e-04]]


In [74]:
def predict_tweet(tweet, freqs, theta):
    """Score a tweet with the logistic regression model.

    Input:
        tweet: the tweet text
        freqs: frequency dictionary passed through to extract_features
        theta: model parameters; this notebook trains them with shape (3, 1)
    Output:
        sigmoid of the dot product of the tweet's feature vector with theta
        (a one-element array when theta has shape (3, 1))
    """
    features = extract_features(tweet, freqs)
    logit = np.dot(features, theta)
    return sigmoid(logit)

In [75]:
# Spot-check predict_tweet on a few hand-written examples
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    # predict_tweet returns a one-element array here; squeeze it to a scalar,
    # because formatting an ndim > 0 array with %f relies on an implicit
    # array-to-float conversion that is deprecated in NumPy >= 1.25.
    probability = float(np.squeeze(predict_tweet(tweet, freqs, theta)))
    print('%s -> %f' % (tweet, probability))

I am happy -> 0.525106
I am bad -> 0.493077
this movie should have been great. -> 0.520928
great -> 0.520898
great great -> 0.541723
great great great -> 0.562403
great great great great -> 0.582869


In [78]:
# test logistic regression on the test dataset
def test_logistic_regression(test_x, test_y, freqs, theta):
    """Compute classification accuracy on a labelled set of tweets.

    Input:
        test_x: iterable of tweets
        test_y: numpy array of labels, one per tweet (flattened before comparison)
        freqs: frequency dictionary passed through to predict_tweet
        theta: model parameters passed through to predict_tweet
    Output:
        number of tweets whose thresholded prediction (1.0 if the score is
        above 0.5, else 0.0) equals the label, divided by test_y.shape[0]
    """
    predicted_labels = np.array(
        [1.0 if predict_tweet(tweet, freqs, theta) > 0.5 else 0.0 for tweet in test_x]
    )
    true_labels = np.reshape(test_y, -1)
    num_correct = np.sum(predicted_labels == true_labels)
    return num_correct / test_y.shape[0]

In [79]:
# accuracy of the trained theta on the held-out test split
tmp_acc = test_logistic_regression(test_x, test_y, freqs, theta)
print(tmp_acc)

0.9945


In [81]:
# error analysis: list every test tweet whose thresholded prediction disagrees with its label
for tweet, label in zip(test_x, test_y):
    predicted_positive = predict_tweet(tweet, freqs, theta) > 0.5
    label_gap = np.abs(label - predicted_positive)
    if label_gap > 0:
        print("mispredicted tweet: ", tweet)

mispredicted tweet:  @MarkBreech Not sure it would be good thing 4 my bottom daring 2 say 2 Miss B but Im gonna be so stubborn on mouth soaping ! #NotHavingit :p
mispredicted tweet:  I'm playing Brain Dots : ) #BrainDots
http://t.co/UGQzOx0huu
mispredicted tweet:  I'm playing Brain Dots : ) #BrainDots http://t.co/aOKldo3GMj http://t.co/xWCM9qyRG5
mispredicted tweet:  I'm playing Brain Dots : ) #BrainDots http://t.co/R2JBO8iNww http://t.co/ow5BBwdEMY
mispredicted tweet:  off to the park to get some sunlight : )
mispredicted tweet:  @msarosh Uff Itna Miss karhy thy ap :p
mispredicted tweet:  @phenomyoutube u probs had more fun with david than me : (
mispredicted tweet:  pats jay : (
mispredicted tweet:  @bae_ts WHATEVER STIL L YOUNG &gt;:-(
mispredicted tweet:  my beloved grandmother : ( https://t.co/wt4oXq5xCf
mispredicted tweet:  @ITVCentral #Midlands Yes thanks for the depressing weather forecast, where the word 'rain' was mentioned several times :-(
