# Logistic Regression for Sentiment Analysis

In [3]:
import pandas as pd
import numpy as np
import re
import math
import collections as ct
import codecs
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression

1. Getting data

In [4]:
data = pd.read_csv('shuffled_movie_data.csv')
data.tail()

Unnamed: 0,review,sentiment
49995,"OK, lets start with the best. the building. al...",0
49996,The British 'heritage film' industry is out of...,0
49997,I don't even know where to begin on this one. ...,0
49998,Richard Tyler is a little boy who is scared of...,0
49999,I waited long to watch this movie. Also becaus...,1


2. Processing data & defining features 
(according to https://web.stanford.edu/~jurafsky/slp3/5.pdf)

Changing "review" to lower case

In [5]:
data['review'] = data['review'].apply(lambda x: " ".join(x.lower() for x in x.split()))
data[['review']].tail()

Unnamed: 0,review
49995,"ok, lets start with the best. the building. al..."
49996,the british 'heritage film' industry is out of...
49997,i don't even know where to begin on this one. ...
49998,richard tyler is a little boy who is scared of...
49999,i waited long to watch this movie. also becaus...


Counting words (log of word_count)

In [6]:
data['word_count'] = data['review'].apply(lambda x: len(str(x).split(" ")))
data[['review','word_count']].tail()

Unnamed: 0,review,word_count
49995,"ok, lets start with the best. the building. al...",232
49996,the british 'heritage film' industry is out of...,275
49997,i don't even know where to begin on this one. ...,123
49998,richard tyler is a little boy who is scared of...,105
49999,i waited long to watch this movie. also becaus...,44


In [7]:
data['log_word_count'] = data['review'].apply(lambda x: math.log(len(str(x).split(" "))))
data[['review','log_word_count']].tail()

Unnamed: 0,review,log_word_count
49995,"ok, lets start with the best. the building. al...",5.446737
49996,the british 'heritage film' industry is out of...,5.616771
49997,i don't even know where to begin on this one. ...,4.812184
49998,richard tyler is a little boy who is scared of...,4.65396
49999,i waited long to watch this movie. also becaus...,3.78419


Looking for "!" (Exclamation marks)

In [8]:
marks = "!"
def getMarksCount(txt):
    cd = {c:val for c, val in ct.Counter(txt).items() if c in marks}
    y = cd.setdefault('!', 0)
    if y > 0:
        return 1
    else:
        return 0    

In [9]:
data['excl_mark_count'] = data['review'].apply(lambda x: getMarksCount(x))
data[['review','excl_mark_count']].tail()

Unnamed: 0,review,excl_mark_count
49995,"ok, lets start with the best. the building. al...",0
49996,the british 'heritage film' industry is out of...,1
49997,i don't even know where to begin on this one. ...,1
49998,richard tyler is a little boy who is scared of...,0
49999,i waited long to watch this movie. also becaus...,0


Removing remaining punctuation

In [10]:
data['review'] = data['review'].str.replace('[^\w\s]','')
data[['review']].tail()

Unnamed: 0,review
49995,ok lets start with the best the building altho...
49996,the british heritage film industry is out of c...
49997,i dont even know where to begin on this one it...
49998,richard tyler is a little boy who is scared of...
49999,i waited long to watch this movie also because...


Lemmatizing (getting root words)

In [11]:
#test
wordnet_lemmatizer = WordNetLemmatizer()

words = []
def lemmatize(txt):
    for word in txt.split():    
        wordnew = wordnet_lemmatizer.lemmatize(word)
        #print(word + " = "+ wordnew)
        words.append(wordnew)
    return words

In [12]:
text = "well not no didn't like dislikes poorest movies actors well-done GG comments worked"
print(lemmatize(text))

['well', 'not', 'no', "didn't", 'like', 'dislike', 'poorest', 'movie', 'actor', 'well-done', 'GG', 'comment', 'worked']


In [13]:
data['review'] = data['review'].apply(lambda x: " ".join(wordnet_lemmatizer.lemmatize(word) for word in x.split()))
data[['review']].tail()

Unnamed: 0,review
49995,ok let start with the best the building althou...
49996,the british heritage film industry is out of c...
49997,i dont even know where to begin on this one it...
49998,richard tyler is a little boy who is scared of...
49999,i waited long to watch this movie also because...


Getting positive and negative words count

In [14]:
def readwords( filename ):
    #f = open(filename)
    with codecs.open(filename, "r",encoding='utf-8', errors='ignore') as fdata:
        fdata = [ line.rstrip("\r\n") for line in fdata]
    return fdata

positive = readwords('OpinionLexicon/positive-words.txt')
#len(positive)

negative = readwords('OpinionLexicon/negative-words.txt')
#len(negative)

In [15]:
for x in range(0,10):
    print(positive[x])

a+
abound
abounds
abundance
abundant
accessable
accessible
acclaim
acclaimed
acclamation


In [16]:
data['positive_count'] = data['review'].apply(lambda x: len([p for p in x.split() if p in positive]))                                            
data[['review','positive_count']].tail()

Unnamed: 0,review,positive_count
49995,ok let start with the best the building althou...,8
49996,the british heritage film industry is out of c...,18
49997,i dont even know where to begin on this one it...,2
49998,richard tyler is a little boy who is scared of...,4
49999,i waited long to watch this movie also because...,4


In [17]:
data['negative_count'] = data['review'].apply(lambda x: len([n for n in x.split() if n in negative]))
data[['review','negative_count']].tail()

Unnamed: 0,review,negative_count
49995,ok let start with the best the building althou...,14
49996,the british heritage film industry is out of c...,17
49997,i dont even know where to begin on this one it...,3
49998,richard tyler is a little boy who is scared of...,3
49999,i waited long to watch this movie also because...,1


In [18]:
data['positive_percent'] = data.positive_count/data.word_count

In [19]:
data['negative_percent'] = data.negative_count/data.word_count

In [20]:
data[['positive_count','negative_count','word_count','positive_percent','negative_percent']].tail()

Unnamed: 0,positive_count,negative_count,word_count,positive_percent,negative_percent
49995,8,14,232,0.034483,0.060345
49996,18,17,275,0.065455,0.061818
49997,2,3,123,0.01626,0.02439
49998,4,3,105,0.038095,0.028571
49999,4,1,44,0.090909,0.022727


Counting 1st and 2nd pronouns 

In [21]:
pronouns = ['i','me','my','mine','myself','you','your','yours','yourself']

In [22]:
data['pronoun_count'] = data['review'].apply(lambda x: len([n for n in x.split() if n in pronouns]))
data[['review','pronoun_count']].tail()

Unnamed: 0,review,pronoun_count
49995,ok let start with the best the building althou...,8
49996,the british heritage film industry is out of c...,1
49997,i dont even know where to begin on this one it...,2
49998,richard tyler is a little boy who is scared of...,0
49999,i waited long to watch this movie also because...,4


In [23]:
data['pronoun_percent'] = data.pronoun_count/data.word_count

Counting NO and variants

In [24]:
no = ["no","not","doesn't","doesnt","don't", "dont","isn't","isnt","hasn't","hasnt","didn't","didnt","won't","wont"]
def getNoCount(txt):
    y = len([n for n in txt.split() if n in no]) 
    #print(y)
    if y > 0:
        return 1
    else:
        return 0  

In [25]:
data['no_count'] = data['review'].apply(lambda x: getNoCount(x))
data[['review','no_count']].tail()

Unnamed: 0,review,no_count
49995,ok let start with the best the building althou...,1
49996,the british heritage film industry is out of c...,1
49997,i dont even know where to begin on this one it...,1
49998,richard tyler is a little boy who is scared of...,1
49999,i waited long to watch this movie also because...,0


In [26]:
data['no_percent'] = data.no_count/data.word_count

Setting final data (train/test)

In [27]:
X = data[['positive_percent','negative_percent','no_percent','pronoun_count','excl_mark_count','log_word_count']]
X.tail()

Unnamed: 0,positive_percent,negative_percent,no_percent,pronoun_count,excl_mark_count,log_word_count
49995,0.034483,0.060345,0.00431,8,0,5.446737
49996,0.065455,0.061818,0.003636,1,1,5.616771
49997,0.01626,0.02439,0.00813,2,1,4.812184
49998,0.038095,0.028571,0.009524,0,0,4.65396
49999,0.090909,0.022727,0.0,4,0,3.78419


In [28]:
X = data[['positive_count','negative_count','no_count','pronoun_count','excl_mark_count','log_word_count']]
X.tail()

Unnamed: 0,positive_count,negative_count,no_count,pronoun_count,excl_mark_count,log_word_count
49995,8,14,1,8,0,5.446737
49996,18,17,1,1,1,5.616771
49997,2,3,1,2,1,4.812184
49998,4,3,1,0,0,4.65396
49999,4,1,0,4,0,3.78419


In [29]:
X.shape

(50000, 6)

In [30]:
y = data['sentiment']
y.shape

(50000,)

In [31]:
X_train, X_test, y_train, y_test = X[:45000], X[45000:], y[:45000], y[45000:]
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(45000, 6)
(5000, 6)
(45000,)
(5000,)


Logistic Regression (sklearn)

In [32]:
logisticRegr = LogisticRegression(solver='lbfgs', multi_class='multinomial')
logisticRegr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [33]:
score = logisticRegr.score(X_test, y_test)
print(score)

0.7246


In [34]:
predictions = logisticRegr.predict(X_test)

In [35]:
cm = metrics.confusion_matrix(y_test, predictions)
print(cm)

[[1810  658]
 [ 719 1813]]


In [36]:
type(X_train), type(y_train)

(pandas.core.frame.DataFrame, pandas.core.series.Series)

In [37]:
Xtrain = np.asmatrix(X_train.values)

In [38]:
ytrain = np.matrix(y_train.tolist())

In [46]:
Xtest = np.asmatrix(X_test.values)

In [39]:
type(Xtrain), type(ytrain)

(numpy.matrixlib.defmatrix.matrix, numpy.matrixlib.defmatrix.matrix)

Logistic Regression (based on sigmoid function)

In [40]:
def sigmoid(z):
    return 1/(1+np.exp(-z))

def train(X,Y,alpha=.1,iter=5000):
    W = np.zeros(shape=(1,X.shape[1]))
    b = 1
    for a in range(iter):
        
        # Feed Forward
        z = np.matmul(X,W.T) + b
        a = Yhat = sigmoid(z.T)
        m = a.shape[1]

        # Loss calculation
        loss = -(np.multiply(Y,np.log(Yhat))+np.multiply((1-Y),np.log(1-Yhat)))
        loss = (1/m)*np.sum(loss)
        #print(loss)
        
        # Gradient Decent (get derivatives)
        dz = a-Y
        dw = (1/m)*(X.T@dz.T).T
        db = (1/m)*np.sum(dz)
        
        # Apply changes
        W = W - alpha*dw
        b = b - alpha*db
    return {'W':W,'b':b}

In [41]:
def predict(X, model):    
    predictions = []
    W = model['W']
    b = model['b']

    z = np.matmul(X,W.T) + b
    a = Yhat = sigmoid(z.T)
    m = a.shape[1]
    #print(a)
    for ai in np.matrix.tolist(a)[0]:
        p = 0
        if ai>.5:
            p = 1
        predictions.append(p)
    return predictions

In [42]:
model = train(Xtrain,ytrain)

  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app


In [44]:
y_pred = predict(X_test, model)

In [45]:
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

[[ 872 1596]
 [ 184 2348]]
