In [1]:
import nltk
from os import getcwd
nltk.download('twitter_samples')
nltk.download('stopwords')
filePath = f"{getcwd()}/../tmp2/"
nltk.data.path.append(filePath)
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples


[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\choud\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\choud\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean


def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs

PREPARING THE DATASET

In [4]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]
train_x=train_pos+train_neg
test_x=test_pos+test_neg
train_y=np.append(np.ones((len(train_pos),1)),np.zeros((len(train_neg),1)),axis=0)
test_y=np.append(np.ones((len(test_pos),1)),np.zeros((len(test_neg),1)),axis=0)
freqs=build_freqs(train_x,train_y)
print('type(freqs) = ' + str(type(freqs)))
print('len(freqs) = ' + str(len(freqs.keys())))


type(freqs) = <class 'dict'>
len(freqs) = 11337


In [5]:
#testing the function
print('This is an example of a positive tweet: \n', train_x[0])
print('\nThis is an example of the processed version of the tweet: \n', process_tweet(train_x[0]))

This is an example of a positive tweet: 
 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

This is an example of the processed version of the tweet: 
 ['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


In [6]:
def sigmoid(z):
    h=1/(1+np.exp(-z))
    return h
if sigmoid(0)==0.5:
    print('SUCCESS')
else:
    print('FAIL')

if sigmoid(4.92)==0.9927537604041685:
    print('SUCCESS')
else:
    print('FAIL')

SUCCESS
SUCCESS


In [7]:
def gradient_descent(x,y,theta,alpha,num_iters):
    m=x.shape[0]
    for i in range(num_iters):
        z=np.dot(x,theta) # X has shape (m,3) and theta is (3,1) so the result is (m,1)
        h=sigmoid(z) # h has shape (m,1)
        J=(-1/m)*(np.dot(y.T,np.log(h))+np.dot((1-y).T,np.log(1-h))) # y.T is (1,m) and np.log(h) is (m,1) so the result is a scalar
        theta=theta-alpha*(1/m)*np.dot(x.T,h-y) # x.T is (3,m) and h-y is (m,1) so the result is (3,1)
    J=float(J)
    return J,theta

In [15]:
#Checking the function with a synthetic test case
np.random.seed(1)
tmp_X=np.append(np.ones((10,1)),np.random.rand(10,2)*2000,axis=1)
tmp_Y=(np.random.rand(10,1)>0.35).astype(float)
tmp_theta=np.random.rand(3,1)
tmp_J,tmp_theta=gradient_descent(tmp_X,tmp_Y,np.zeros((3,1)),1e-8,1000)
print(f"The cost after training is {tmp_J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(tmp_theta)]}")

The cost after training is 0.66649173.
The resulting vector of weights is [5.3e-07, 0.00046609, 3.62e-05]


  J=float(J)


EXTRACTING FEATURES

In [20]:
def extract_features(tweet,freqs):
    word_l=process_tweet(tweet)
    x=np.zeros((1,3))
    x[0,0]=1
    for word in word_l:
        x[0,1]+=freqs.get((word,1.0),0)
        x[0,2]+=freqs.get((word,0.0),0)
    assert(x.shape==(1,3))
    return x
tmp1=extract_features(train_x[0],freqs)
print(tmp1)
tmp2=extract_features('blorb bleeeeb bloooob',freqs) #No words in the tweet are in the freqs dictionary
print(tmp2)

[[1.00e+00 3.02e+03 6.10e+01]]
[[1. 0. 0.]]


In [22]:
X=np.zeros((len(train_x),3))
for i in range(len(train_x)):
    X[i,:]=extract_features(train_x[i],freqs)

Y=train_y
J,theta=gradient_descent(X,Y,np.zeros((3,1)),1e-9,1500)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.24215478.
The resulting vector of weights is [7e-08, 0.00052391, -0.00055517]


  J=float(J)


In [23]:
def predict_tweet(tweet,freqs,theta):
    x=extract_features(tweet,freqs)
    y_pred=sigmoid(np.dot(x,theta))
    return y_pred

for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta)))

I am happy -> 0.518581
I am bad -> 0.494339
this movie should have been great. -> 0.515331
great -> 0.515464
great great -> 0.530899
great great great -> 0.546274
great great great great -> 0.561562


  print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta)))


In [25]:
tweet='I am learning :)'
predict_tweet(tweet,freqs,theta)

array([[0.8163691]])

In [27]:
def test_logistic_regression(test_x,test_y,freqs,theta):
    y_hat=[]
    for tweet in test_x:
        y_pred=predict_tweet(tweet,freqs,theta)
        if y_pred>0.5:
            y_hat.append(1)
        else:
            y_hat.append(0)
    accuracy=(y_hat==np.squeeze(test_y)).sum()/len(test_y)
    return accuracy

tmp_accuracy=test_logistic_regression(test_x,test_y,freqs,theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.9950


In [29]:
print('Label Predicted Value')
for x,y in zip(test_x,test_y):
    y_hat=predict_tweet(x,freqs,theta)

    if np.abs(y-y_hat)>0.5:
        print('THE TWEET IS:',x)
        print('THE PROCESSED TWEET IS:',process_tweet(x))
        print('%d\t%0.8f' % (y, y_hat))

Label Predicted Value
THE TWEET IS: @jaredNOTsubway @iluvmariah @Bravotv Then that truly is a LATERAL move! Now, we all know the Queen Bee is UPWARD BOUND : ) #MovingOnUp
THE PROCESSED TWEET IS: ['truli', 'later', 'move', 'know', 'queen', 'bee', 'upward', 'bound', 'movingonup']
1	0.49996920
THE TWEET IS: @MarkBreech Not sure it would be good thing 4 my bottom daring 2 say 2 Miss B but Im gonna be so stubborn on mouth soaping ! #NotHavingit :p
THE PROCESSED TWEET IS: ['sure', 'would', 'good', 'thing', '4', 'bottom', 'dare', '2', 'say', '2', 'miss', 'b', 'im', 'gonna', 'stubborn', 'mouth', 'soap', 'nothavingit', ':p']
1	0.48663815
THE TWEET IS: I'm playing Brain Dots : ) #BrainDots
http://t.co/UGQzOx0huu
THE PROCESSED TWEET IS: ["i'm", 'play', 'brain', 'dot', 'braindot']
1	0.48370697
THE TWEET IS: I'm playing Brain Dots : ) #BrainDots http://t.co/aOKldo3GMj http://t.co/xWCM9qyRG5
THE PROCESSED TWEET IS: ["i'm", 'play', 'brain', 'dot', 'braindot']
1	0.48370697


  print('%d\t%0.8f' % (y, y_hat))


THE TWEET IS: I'm playing Brain Dots : ) #BrainDots http://t.co/R2JBO8iNww http://t.co/ow5BBwdEMY
THE PROCESSED TWEET IS: ["i'm", 'play', 'brain', 'dot', 'braindot']
1	0.48370697
THE TWEET IS: off to the park to get some sunlight : )
THE PROCESSED TWEET IS: ['park', 'get', 'sunlight']
1	0.49578796
THE TWEET IS: @msarosh Uff Itna Miss karhy thy ap :p
THE PROCESSED TWEET IS: ['uff', 'itna', 'miss', 'karhi', 'thi', 'ap', ':p']
1	0.48212905
THE TWEET IS: @phenomyoutube u probs had more fun with david than me : (
THE PROCESSED TWEET IS: ['u', 'prob', 'fun', 'david']
0	0.50020391
THE TWEET IS: pats jay : (
THE PROCESSED TWEET IS: ['pat', 'jay']
0	0.50039295
THE TWEET IS: my beloved grandmother : ( https://t.co/wt4oXq5xCf
THE PROCESSED TWEET IS: ['belov', 'grandmoth']
0	0.50000002


PREDICTING MY OWN TWEET


In [33]:
my_tweet='I am sad because I am not learning'
print(process_tweet(my_tweet))
y_hat=predict_tweet(my_tweet,freqs,theta)
print(y_hat)
if y_hat>0.5:
    print('Positive sentiment')
else:
    print('Negative sentiment')

['sad', 'learn']
[[0.48714786]]
Negative sentiment
