### Twitter sentiment analysis

### using simple linear models

In [11]:
import nltk
import math
from os import getcwd
import pandas as pd
import re
import string
from nltk.corpus import twitter_samples
import matplotlib.pyplot as plt
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


## Preprocessing function (I have shown how I built this function in my nltk_quick start repo)

In [3]:
def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    Steps, in order:
      1. remove ``$``-prefixed tokens (e.g. stock tickers such as ``$GE``),
      2. remove a leading old-style retweet marker ``RT``,
      3. remove hyperlinks,
      4. remove the ``#`` character (the hashtag word itself is kept),
      5. tokenize with NLTK's ``TweetTokenizer`` (lower-casing, stripping
         @handles, shortening long runs of repeated characters),
      6. drop English stopwords and punctuation tokens,
      7. stem every remaining token with the Porter stemmer.

    Args:
        tweet (str): the raw tweet text.

    Returns:
        list[str]: the stemmed tokens that survived the filtering.
    """
    stemmer = PorterStemmer()
    # A set gives constant-time membership tests; the list returned by NLTK
    # would be scanned linearly for every token.
    stopwords_english = set(stopwords.words('english'))

    tweet = re.sub(r'\$\w*', '', tweet)               # $-prefixed tokens
    tweet = re.sub(r'^RT[\s]+', '', tweet)            # leading "RT "
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)  # hyperlinks
    tweet = re.sub(r'#', '', tweet)                   # hash sign only

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # `word not in string.punctuation` is a substring test, so multi-character
    # tokens such as ":)" are kept because they are not a substring of it.
    return [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

## We will use a word-frequency (bag-of-words style) approach to design our features

In [2]:
def build_freqs(tweets, yslist):
    """Count how often each (word, label) pair occurs in a labelled corpus.

    Every tweet is passed through ``process_tweet`` and each resulting token
    is counted under the label of the tweet it came from.

    Args:
        tweets: iterable of raw tweet strings.
        yslist: iterable of labels aligned with ``tweets``. May be a plain
            list or a NumPy array of shape (m,) or (m, 1).

    Returns:
        dict: maps ``(word, label)`` tuples to their occurrence count.

    Raises:
        ValueError: if a NumPy label holds more than one element.
    """
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        # Rows of an (m, 1) array are themselves arrays, which are unhashable
        # and cannot be used inside a dict key; reduce NumPy labels to plain
        # Python scalars (1 and 1.0 hash alike, so lookups are unaffected).
        if isinstance(y, (np.ndarray, np.generic)):
            y = y.item()
        for word in process_tweet(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

## Designing features
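1) bias (always 1)
2) positive frequency count: sum, over the words of the tweet, of how often each word appears in positive tweets
3) negative frequency count: sum, over the words of the tweet, of how often each word appears in negative tweets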

In [10]:
# Load the labelled tweets from NLTK's `twitter_samples` corpus.
positive_samples = twitter_samples.strings("positive_tweets.json")
negative_samples = twitter_samples.strings("negative_tweets.json")
tweets = positive_samples + negative_samples

# Number of tweets per class used for training; the remainder of each class
# is held out for testing.
TRAIN_PER_CLASS = 4000

train_pos = positive_samples[:TRAIN_PER_CLASS]
train_neg = negative_samples[:TRAIN_PER_CLASS]
test_pos = positive_samples[TRAIN_PER_CLASS:]
test_neg = negative_samples[TRAIN_PER_CLASS:]

# Positive tweets come first in both splits, so the labels follow that order.
train_x = train_pos + train_neg
test_x = test_pos + test_neg
train_y = [1] * len(train_pos) + [0] * len(train_neg)
test_y = [1] * len(test_pos) + [0] * len(test_neg)

# Reproduces the output recorded below this cell.
print("Number of tweets in train x", len(train_x))

Number of tweets in train x 8000


In [40]:
# Build the (word, label) -> count table from the training tweets and labels.
freqs = build_freqs(train_x, train_y)
def sigmoid(z):
    """Logistic function ``1 / (1 + e^(-z))``.

    Args:
        z: a scalar or a NumPy array; arrays are processed element-wise.

    Returns:
        The sigmoid of ``z``, with the same shape as ``z``.
    """
    return 1 / (1 + np.exp(-z))
def gradientDescent(x, y, theta, alpha, num_iters):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        x: feature matrix; ``x.shape[0]`` is the number of examples m.
        y: labels, shaped so that ``h - y`` lines up with ``sigmoid(x @ theta)``
            (the notebook passes an (m, 1) column of 0/1 values).
        theta: initial weights. The array passed in is left untouched.
        alpha: learning rate.
        num_iters: number of gradient steps to take.

    Returns:
        tuple: ``(J, theta)`` where ``theta`` is the updated weight array and
        ``J`` (float) is the cross-entropy cost measured during the last
        iteration, i.e. just before the final update is applied. When
        ``num_iters`` is 0, ``J`` is the cost of the initial weights.
    """
    m = x.shape[0]
    # Work on a float copy: the caller's array is not modified, and integer
    # initial weights no longer fail on the in-place update.
    theta = np.array(theta, dtype=float)
    x_t = np.transpose(x)

    def cost(h):
        # Mean binary cross-entropy of predictions h against labels y.
        return (np.sum(-1 * ((y * np.log(h)) + ((1 - y) * np.log(1 - h))))) / m

    J = None
    for _ in range(num_iters):
        h = sigmoid(np.dot(x, theta))
        J = cost(h)
        theta = theta - (alpha / m) * np.dot(x_t, (h - y))

    if J is None:
        # No iteration ran; report the cost of the starting point instead of
        # failing on an unbound variable.
        J = cost(sigmoid(np.dot(x, theta)))

    return float(J), theta

In [25]:
def extract_features(tweet, freqs, process_tweet=process_tweet):
    """Turn one tweet into a ``(1, 3)`` feature row ``[bias, pos, neg]``.

    Args:
        tweet: the raw tweet text.
        freqs: dict mapping ``(word, label)`` to a count; label 1.0 is looked
            up for the positive feature and 0.0 for the negative one.
        process_tweet: callable that converts the tweet into a list of tokens.

    Returns:
        numpy.ndarray of shape (1, 3): column 0 is the constant bias 1,
        column 1 the summed positive counts and column 2 the summed negative
        counts of the tweet's tokens (repeated tokens are counted each time).
    """
    features = np.zeros((1, 3))
    features[0, 0] = 1  # bias term
    for token in process_tweet(tweet):
        # Tokens absent from the table contribute nothing.
        features[0, 1] += freqs.get((token, 1.0), 0)
        features[0, 2] += freqs.get((token, 0.0), 0)
    return features

In [41]:
# Turn the label sequences into column vectors of shape (m, 1).
train_y = np.reshape(np.array(train_y), (-1, 1))
test_y = np.reshape(np.array(test_y), (-1, 1))

# One feature row per training tweet.
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :] = extract_features(train_x[i], freqs)
Y = train_y

# Start from all-zero weights; learning rate 1e-9, 1500 iterations.
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
print(f"The cost after training is {J:.8f}.")
# Convert to built-in floats before rounding: under NumPy >= 2 the elements
# would otherwise be printed as `np.float64(...)` inside the list.
print(f"The resulting vector of weights is {[round(float(t), 8) for t in np.squeeze(theta)]}")

The cost after training is 0.22521397.
The resulting vector of weights is [6e-08, 0.0005382, -0.0005583]


In [45]:
def predict_tweet(tweet, freqs, theta):
    """Score a tweet with the logistic-regression model.

    Args:
        tweet: the raw tweet text.
        freqs: the ``(word, label) -> count`` table handed to
            ``extract_features``.
        theta: weight array that is dotted with the extracted feature row.

    Returns:
        The sigmoid of ``features . theta``; the prediction for the tweet.
    """
    features = extract_features(tweet, freqs)
    return sigmoid(np.dot(features, theta))

In [46]:
# Try the trained model on a hand-written tweet; the bare last expression is
# what the cell displays.
my_tweet = 'I am learning :)'
predict_tweet(my_tweet, freqs, theta)

array([[0.83110794]])

In [47]:
def test_logistic_regression(test_x, test_y, freqs, theta, predict_tweet=predict_tweet):
    """Compute the classification accuracy on a labelled set of tweets.

    Args:
        test_x: sequence of raw tweet strings.
        test_y: the matching labels; a list, a 1-D array or an (m, 1) column
            vector are all accepted.
        freqs: the ``(word, label) -> count`` table passed to ``predict_tweet``.
        theta: the model weights passed to ``predict_tweet``.
        predict_tweet: callable ``(tweet, freqs, theta)`` returning the
            model's score for one tweet.

    Returns:
        The fraction of tweets whose thresholded prediction equals its label.
    """
    # A score above 0.5 is classified as positive (1.0), anything else as
    # negative (0.0).
    y_hat = [
        1.0 if predict_tweet(tweet, freqs, theta) > 0.5 else 0.0
        for tweet in test_x
    ]
    # Compare flat vectors: an (m, 1) column against an (m,) vector would
    # broadcast to an (m, m) matrix and silently yield a wrong accuracy.
    matches = np.ravel(y_hat) == np.ravel(test_y)
    accuracy = np.sum(matches) / len(test_x)
    return accuracy

In [51]:
# Measure the accuracy of the trained weights on the test tweets.
acc = test_logistic_regression(test_x, test_y,freqs, theta)
print("test set accuracy : "+ str(acc))

test set accuracy : 0.995
