In [77]:
# importing all the libraries
import nltk
from os import getcwd
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import re
import string

In [78]:
# sets of positive and negative tweets
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')

In [79]:
# train-test spilt of 80% train set and 20% test set
test_pos = pos_tweets[4000:]
train_pos = pos_tweets[:4000]
test_neg = neg_tweets[4000:]
train_neg = neg_tweets[:4000]

In [80]:
# combining positive and negative tweets for train and test sets
X_train = train_pos + train_neg 
X_test = test_pos + test_neg

In [81]:
# creating labels
y_train = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
y_test = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [82]:
print(y_train.shape)
print(y_test.shape)

(8000, 1)
(2000, 1)


In [83]:
def process_tweet(tweet):

    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags by only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    processed_tweets = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            stem_word = stemmer.stem(word)  # stemming word
            processed_tweets.append(stem_word)

    return processed_tweets

In [85]:
y_train_list = np.squeeze(y_train).tolist() # numpy array to list

# create a dictionary of frequenies mapping each (word, sentiment) pair to itsfrequency
freqs = {}
for y, tweet in zip(y_train_list, X_train):
    for word in process_tweet(tweet):
        pair = (word, y)
        if pair in freqs:
            freqs[pair] += 1
        else:
            freqs[pair] = 1


In [86]:
def sigmoid(z): 
    h = 1 / (1 + np.exp(-z))    
    return h

In [87]:
def gradientDescent(x, y, theta, alpha, num_iters):
    
    m = x.shape[0]
    
    for i in range(0, num_iters):
        
        # dot product of x and theta
        z = np.dot(x,theta)
        
        # sigmoid of z
        h = sigmoid(z)
        
        # cost function
        J = -1./m * (np.dot(y.transpose(), np.log(h)) + np.dot((1-y).transpose(),np.log(1-h)))    

        # update the weights of theta
        theta = theta = theta - (alpha/m) * np.dot(x.transpose(),(h-y))
        
    J = float(J)
    return J, theta

In [88]:
def extract_features(tweet, freqs):

    # process tweets
    word_l = process_tweet(tweet)
    
    x = np.zeros((1, 3)) 
    
    #set bias term to 1
    x[0,0] = 1 
        
    for word in word_l:
        
        # increment the word count for the positive label 1
        x[0,1] += freqs.get((word, 1.0),0)
        
        # increment the word count for the negative label 0
        x[0,2] += freqs.get((word, 0.0),0)
        
        #print(x)
    return x

In [89]:
X = np.zeros((len(X_train), 3))
for i in range(len(X_train)):
    X[i, :]= extract_features(X_train[i], freqs)

# Applying gradient descent
J, theta = gradientDescent(X, y_train, np.zeros((3, 1)), 1e-9, 2000)
print(f"Cost after training: {J:.8f}.")
print(f"Resulting vector of weights: {[round(t, 8) for t in np.squeeze(theta)]}")

Cost after training: 0.21085715.
Resulting vector of weights: [1e-07, 0.00062146, -0.00063298]


In [90]:
def predict_tweet(tweet, freqs, theta):
    
    x = extract_features(tweet,freqs)
    
    # prediction using x and theta
    y_pred = sigmoid(np.dot(x,theta))
        
    return y_pred

In [91]:
def test_LR(X_test, y_test, freqs, theta):

    y_hat = []
    
    for tweet in test_x:
        # predict label
        y_pred = predict_tweet(tweet, freqs, theta)
        
        if y_pred > 0.5:
            y_hat.append(1)
        else:
            y_hat.append(0)

    acc = (y_hat==np.squeeze(y_test)).sum()/len(X_test)
    
    return acc

In [92]:
accuracy = test_LR(X_test, y_test, freqs, theta)
print(f"LR model's accuracy = {accuracy:.8f}")

LR model's accuracy = 0.99550000


In [94]:
my_tweet = 'Today is a beautiful day! :)'
print(process_tweet(my_tweet))
y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)
if y_hat > 0.5:
    print('Positive sentiment')
else: 
    print('Negative sentiment')

['today', 'beauti', 'day', ':)']
[[0.86193693]]
Positive sentiment
