In [11]:
pip install nltk



In [12]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [14]:
import numpy as np

Using NLTK for natural language processing

Text processing

In [16]:
import nltk
from nltk.corpus import twitter_samples
positive_tweets =twitter_samples.strings('positive_tweets.json')
negative_tweets =twitter_samples.strings('negative_tweets.json')
example_postive_tweet=positive_tweets[0]
example_negative_tweet=negative_tweets[0]
test_pos = positive_tweets[4000:]
train_pos = positive_tweets[:4000]
test_neg = negative_tweets[4000:]
train_neg = negative_tweets[:4000]
train_x = train_pos + train_neg 
test_x = test_pos + test_neg
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

tokenization and stemming

In [17]:
import re                                  
import string
from nltk.corpus import stopwords          
from nltk.stem import PorterStemmer        
from nltk.tokenize import TweetTokenizer
def text_process(tweet):
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    tokenizer = TweetTokenizer()
    tweet_tokenized = tokenizer.tokenize(tweet)
    stopwords_english = stopwords.words('english') 
    tweet_processsed=[word for word in tweet_tokenized 
    if word not  in stopwords_english and word not in       
    string.punctuation]
    stemmer = PorterStemmer() 
    tweet_after_stem=[]
    for word in tweet_processsed:
        word=stemmer.stem(word)
        tweet_after_stem.append(word)
    return tweet_after_stem

In [21]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

features extraction

In [22]:
pos_words=[]
for tweet in positive_tweets:
    tweet=text_process(tweet)
    
    for word in tweet:
        
        pos_words.append(word)
freq_pos={}
for word in pos_words:
    if (word,1) not in freq_pos:
        freq_pos[(word,1)]=1
    else:
        freq_pos[(word,1)]=freq_pos[(word,1)]+1
neg_words=[]
for tweet in negative_tweets:
    tweet=text_process(tweet)
    
    for word in tweet:
        
        neg_words.append(word)
freq_neg={}
for word in neg_words:
    if (word,0) not in freq_neg:
        freq_neg[(word,0)]=1
    else:
        freq_neg[(word,0)]=freq_neg[(word,0)]+1
freqs_dict = dict(freq_pos)
freqs_dict.update(freq_neg)

In [24]:
import numpy as np
def features_extraction(tweet, freqs_dict):
    word_l = text_process(tweet)
    x = np.zeros((1, 3))
    x[0,0] = 1 
    for word in word_l:
        try:
            x[0,1] += freqs_dict[(word,1)]
        except:
            x[0,1] += 0
        try: 
            x[0,2] += freqs_dict[(word,0.0)]
        except:
            x[0,2] += 0
    assert(x.shape == (1, 3))
    return x
X = np.zeros((len(train_x), 3))
    
for i in range(len(train_x)):
    
    X[i, :]= features_extraction(train_x[i], freqs_dict)

creating the sentiment classifier

In [25]:
def sigmoid(x): 
    h = 1/(1+np.exp(-x))
    return h

def gradientDescent_algo(x, y, theta, alpha, num_iters):
    m = x.shape[0]
    for i in range(0, num_iters):
        z = np.dot(x,theta)
        h = sigmoid(z)
        J = -1/m*(np.dot(y.T,np.log(h))+np.dot((1-y).T,np.log(1-h)))
        theta = theta-(alpha/m)*np.dot(x.T,h-y)
    J = float(J)
    return J, theta

Training and Evaluating the sentiment classifier

In [26]:
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :]= features_extraction(train_x[i], freqs_dict)
Y = train_y
J, theta = gradientDescent_algo(X, Y, np.zeros((3, 1)), 1e-9, 1500)

In [27]:
def predict(tweet, freqs_dict, theta):
    x = features_extraction(tweet,freqs_dict)
    y_pred = sigmoid(np.dot(x,theta))
    return y_pred
def test_accuracy(test_x, test_y, freqs_dict, theta):
    y_hat = []
    for tweet in test_x:
        
        y_pred = predict(tweet, freqs_dict, theta)
        
        if y_pred > 0.5:
           
            y_hat.append(1)
        else:
            
            y_hat.append(0)
    m=len(y_hat)
    y_hat=np.array(y_hat)
    y_hat=y_hat.reshape(m)
    test_y=test_y.reshape(m)
    
    c=y_hat==test_y
    j=0
    for i in c:
        if i==True:
            j=j+1
    accuracy = j/m
    return accuracy
accuracy = test_accuracy(test_x, test_y, freqs_dict, theta)

In [28]:
print(accuracy)

0.984


we get 98 percent accuracy meaning the model is almost perfect

In [29]:
my_tweet = 'This is a ridiculously bright movie. The plot was terrible and I was sad until the ending!'

In [32]:
print(features_extraction(my_tweet,1))

[[1. 0. 0.]]


testing the model

In [36]:
y_hat = predict(my_tweet, freqs_dict, theta)
print(y_hat)
if y_hat > 0.5:
    print('Positive sentiment')
else: 
    print('Negative sentiment')



[[0.40496694]]
Negative sentiment
