In [21]:
import nltk, re, string
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pickle

In [22]:
def process_tweet(tweet):
    stemmer = nltk.PorterStemmer()
    stopwords_english = stopwords.words('english')
    tweet = re.sub(r'\$\w*', '', tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    tokenizer = nltk.TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and word not in string.punctuation):
            stem_word = stemmer.stem(word)
            tweets_clean.append(stem_word)
    return tweets_clean


In [23]:
def build_freqs(tweets, ys):
    yslist = np.squeeze(ys).tolist()

    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    return freqs
    

In [24]:
# Checking how the above code works with an example.

tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
res = build_freqs(tweets, ys)
print(res)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}


In [25]:
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\DIVESH\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DIVESH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [27]:
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

In [28]:
train_x = train_pos + train_neg
test_x = test_pos +  train_pos

In [29]:
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_neg), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [30]:
freqs = build_freqs(train_x, train_y)

In [31]:
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 11339


## Logistic regression

In [32]:
def sigmoid(z):
    zz = np.negative(z)
    h = 1 / (1 + np.exp(zz))
    return h

In [33]:
def gradientDescent(x, y, theta, alpha, num_iters):
    m = x.shape[0]
    for i in range(0, num_iters):
        z = np.dot(x, theta)
        h = sigmoid(z)
        cost = -1. / m*(np.dot(y.transpose(), np.log(h)) + np.dot((1-y).transpose(), np.log(1-h)))
        theta = theta - (alpha/m) * np.dot(x.transpose(), (h-y))
    cost = float(cost)
    return cost, theta

In [34]:
def extract_features(tweet, freqs):
    word_l = process_tweet(tweet)
    x = np.zeros((1, 3))
    x[0, 0] = 1
    
    for word in word_l:
        x[0, 1] += freqs.get((word, 1.0), 0)
        x[0, 2] += freqs.get((word, 0.0), 0)
    assert(x.shape == (1, 3))
    return x


In [35]:
# Try to understand what all these three numbers mean. 
# Usually we get a dataset with a lot of features/columns, here we just have text data.
# Those three numbers are the feature set that we have build using build_freq() and extract_features() function.
# build_freq() builds a dictionary having words as keys and the number of times they have occurred in corpus as values.
# Extract feature takes in sum of these values for positive and negative words, i.e. tmp1[1] and tmp[2]


# How these features will be used to predict in Logistic Regression

# First a hypothesis is build which for our case will be h(x) = b1 + b2*x1 + b3*x2
# here b1 = 1, b2 and b3 are determined by cost and gradient function, x1 and x2 are the positive and negative words feature set.

In [36]:
# Training our Model
x = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    x[i, :] = extract_features(train_x[i], freqs)
y = train_y

j, theta = gradientDescent(x, y, np.zeros((3, 1)), 1e-9, 1500)

In [37]:
def predict_tweet(tweet, freqs, theta):
    x = extract_features(tweet, freqs)
    y_pred = sigmoid(np.dot(x, theta))
    return y_pred

In [38]:
def test_logistic_regression(test_x, test_y, freqs, theta):
    y_hat = []

    for tweet in test_x:
        y_pred = predict_tweet(tweet, freqs, theta)
        if y_pred > 0.5:
            y_hat.append(1)
        else:
            y_hat.append(0)
    accuracy = (y_hat == np.squeeze(test_y)).sum() / len(test_x)
    return accuracy

In [39]:
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

  accuracy = (y_hat == np.squeeze(test_y)).sum() / len(test_x)


AttributeError: 'bool' object has no attribute 'sum'

In [40]:
def pre(sentence):
    yhat = predict_tweet(sentence, freqs, theta)
    if yhat > 0.5:
        return 'Positive sentiment'
    elif yhat == 0:
        return 'Neutral sentiment'
    else:
        return 'negative sentiment'

In [41]:
my_tweet = 'It is so hot today but it is the perfect day for a beach party'

res = pre(my_tweet)
print(res)

Positive sentiment
