# Sentiment Analysis

In [89]:
import pandas as pd
import numpy as np
import re
import nltk.data
import string
import matplotlib.pyplot as plt
import matplotlib

from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from gensim.models import word2vec

### Getting data

In [90]:
data = pd.read_csv('shuffled_movie_data.csv')
data.tail()

Unnamed: 0,review,sentiment
49995,"OK, lets start with the best. the building. al...",0
49996,The British 'heritage film' industry is out of...,0
49997,I don't even know where to begin on this one. ...,0
49998,Richard Tyler is a little boy who is scared of...,0
49999,I waited long to watch this movie. Also becaus...,1


### Test

In [3]:
stop = stopwords.words('english')
porter = PorterStemmer()

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    text = [w for w in text.split() if w not in stop]
    tokenized = [porter.stem(w) for w in text]
    return text

In [4]:
def stream_docs(path):
    with open(path, 'r') as csv:
        next(csv) # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [5]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    for _ in range(size):
        text, label = next(doc_stream)
        docs.append(text)
        y.append(label)
    return docs, y

In [6]:
from sklearn.feature_extraction.text import HashingVectorizer
vect = HashingVectorizer(decode_error='ignore', 
                         n_features=2**21,
                         preprocessor=None, 
                         tokenizer=tokenizer)

In [7]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='log', random_state=1, max_iter=1)
doc_stream = stream_docs(path='shuffled_movie_data.csv')

In [8]:
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)

In [9]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.867


### Exercise 1

- Define features based on word embeddings (pre-trained word2vec / Glove/Fastext emebddings can be used)
- Define suitable d dimension, and sequence length

### Processing data

Changing "review" to lower case

In [91]:
data['review'] = data['review'].apply(lambda x: " ".join(x.lower() for x in x.split()))
data.at[49999,'review']

'i waited long to watch this movie. also because i like bruce willis. the plot was quite different from what i had expected but still quite good. its a good mix of emotions, humor and drama.<br /><br />left me thinking over and again :)'

Getting rid of tags

In [92]:
def ridof_sc(text):
    text = re.sub('<[^>]*>', '', text) 
    return text

In [93]:
data['review'] = data['review'].apply(lambda x: ridof_sc(x))
data.at[49999,'review']

'i waited long to watch this movie. also because i like bruce willis. the plot was quite different from what i had expected but still quite good. its a good mix of emotions, humor and drama.left me thinking over and again :)'

In [94]:
def text2words(text, remove_stop = False):  
    text = re.sub('[!)(#?,.:*";]', ' ', text)
    words = text.split()
    if remove_stop:
        stops = set(stopwords.words('english'))
        words = [w for w in words if not w in stops]
    #print(words)
    return (words)


Defining a sentence tokenizer

In [95]:
#nltk.download('punkt')
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [96]:
# This function splits a text into sentences
def review_sentences(review, remove_stopwords=False):
    raw_sentences = sent_tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence)>0:
            sentences.append(text2words(raw_sentence,remove_stopwords))

    # sentences is a list of lists
    return sentences

Getting the vocabulary for our model 

In [97]:
sentences = []
for review in data['review']:
    sentences += review_sentences(review)
#len(sentences)

Training the model

In [98]:
num_features = 300  # Word vector dimensionality
min_word_count = 30 # Minimum word count
num_workers = 4     # Number of parallel threads
context = 10        # Context window size
downsampling = 1e-3 # (0.001) Downsample setting for frequent words

# Initializing the train model
from gensim.models import word2vec
model = word2vec.Word2Vec(sentences,\
                          workers=num_workers,\
                          size=num_features,\
                          min_count=min_word_count,\
                          window=context,
                          sample=downsampling)

# To make the model memory efficient
model.init_sims(replace=True)

# Saving the model for later use. Can be loaded using Word2Vec.load()
model_name = "300features_50minwords_10context"
model.save(model_name)

In [99]:
#Test model
model.wv.most_similar('movie')

[('film', 0.8535209894180298),
 ('flick', 0.6908314228057861),
 ('movies', 0.5725680589675903),
 ('it', 0.5693177580833435),
 ('picture', 0.515169620513916),
 ('documentary', 0.5139075517654419),
 ('show', 0.4913640022277832),
 ('sequel', 0.49041104316711426),
 ('turkey', 0.4816403388977051),
 ('episode', 0.45707225799560547)]

In [135]:
movie_vec = model.wv['movie']
print(movie_vec)

[-0.12615186 -0.08537848 -0.03888972  0.03662663 -0.02219864  0.00614061
 -0.12951656  0.05743549  0.00446866  0.03129537 -0.01344721 -0.01058722
  0.10094015  0.09555555  0.03855471 -0.00639918 -0.0470705  -0.08892679
  0.11756346 -0.02543444  0.09074035 -0.05586453  0.07222655  0.10098025
 -0.01262713  0.12632525  0.05631389  0.09006497  0.07177544  0.09999133
  0.07672758  0.02365885 -0.0057018   0.08834656  0.00799534  0.06121251
  0.01004557  0.06768848 -0.14478073  0.03759738  0.0674349   0.05043565
  0.00957144 -0.04228305 -0.07052817 -0.1203943   0.07027616  0.05103814
 -0.00483615  0.00354183  0.01838845  0.01985047 -0.15733038 -0.08981951
  0.05898672 -0.01374556 -0.04111585 -0.02021777  0.08237603 -0.00929134
  0.12484758 -0.09824619  0.05074212 -0.03846441  0.04531216 -0.11685055
 -0.00900909 -0.04001853  0.04014616  0.13427156  0.10882665  0.0567045
  0.0664423   0.03988168  0.06460241  0.06295266 -0.17584759 -0.08051222
  0.1160237   0.1505221   0.01571569 -0.0245156   0.

Getting avergae vector for the dataset

In [100]:
# Function to average all word vectors in a paragraph
def featureVecMethod(words, num_features):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.wv.index2word)
    
    for word in  words:
        if word in index2word_set:
            nwords += 1
            featureVec = np.add(featureVec,model[word])
    
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

In [101]:
# Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
        if counter%1000 == 0:
            print("Review %d of %d"%(counter,len(reviews)))
            
        reviewFeatureVecs[counter] = featureVecMethod(review, num_features)
        counter += 1
        
    return reviewFeatureVecs

In [102]:
reviews = []
#for x in range(49997, 50000):
for review in data['review']:
    # to get the vectors, we remove stopwords
    reviews.append(text2words(review, remove_stop=True))
    
DataVecs = getAvgFeatureVecs(reviews, num_features)

Review 0 of 50000


  del sys.path[0]


Review 1000 of 50000
Review 2000 of 50000
Review 3000 of 50000
Review 4000 of 50000
Review 5000 of 50000
Review 6000 of 50000
Review 7000 of 50000
Review 8000 of 50000
Review 9000 of 50000
Review 10000 of 50000
Review 11000 of 50000
Review 12000 of 50000
Review 13000 of 50000
Review 14000 of 50000
Review 15000 of 50000
Review 16000 of 50000
Review 17000 of 50000
Review 18000 of 50000
Review 19000 of 50000
Review 20000 of 50000
Review 21000 of 50000
Review 22000 of 50000
Review 23000 of 50000
Review 24000 of 50000
Review 25000 of 50000
Review 26000 of 50000
Review 27000 of 50000
Review 28000 of 50000
Review 29000 of 50000
Review 30000 of 50000
Review 31000 of 50000
Review 32000 of 50000
Review 33000 of 50000
Review 34000 of 50000
Review 35000 of 50000
Review 36000 of 50000
Review 37000 of 50000
Review 38000 of 50000
Review 39000 of 50000
Review 40000 of 50000
Review 41000 of 50000
Review 42000 of 50000
Review 43000 of 50000
Review 44000 of 50000
Review 45000 of 50000
Review 46000 of 500

Setting train and test sets

In [103]:
DataVecs.shape

(50000, 300)

In [104]:
X = DataVecs.T
X.shape

(300, 50000)

In [105]:
y = data[['sentiment']]
y = y.T
y = y.values
y.shape

(1, 50000)

In [106]:
print(type(X), type(y))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


Building Neural Network

In [107]:
def initialize_with_zeros(D, K):
    """
    This function creates a vector of zeros of shape (dim, 1) for w and initializes b to 0.
    
    Argument:
    dim -- size of the w vector we want (or number of parameters in this case)
    
    Returns:
    W -- initialized vector of shape (D, K)
    b -- initialized scalar (corresponds to the bias) of size K
    """
    
    W = np.zeros((D,K))
    b = np.zeros((K,1))
    
    return W, b

def initialize_randomly(D, K):
    """
    This function creates a vector of zeros of shape (dim, 1) for w and initializes b to 0.
    
    Argument:
    dim -- size of the w vector we want (or number of parameters in this case)
    
    Returns:
    W -- initialized vector of shape (D, K)
    b -- initialized scalar (corresponds to the bias) of size K
    """
    
    W = np.random.randn(D,K)*0.01
    b = np.random.randn(K,1)*0.01
    
    return W, b

def initialize_he(D, K):
    """
    This function creates a vector of zeros of shape (dim, 1) for w and initializes b to 0.
    
    Argument:
    dim -- size of the w vector we want (or number of parameters in this case)
    
    Returns:
    W -- initialized vector of shape (D, K)
    b -- initialized scalar (corresponds to the bias) of size K
    """
    
    W = np.random.randn(D,K)*np.sqrt(2/D)
    b = np.zeros((K,1))
        
    return W, b

def initialize_params(D, K, init_type='zeros'):
    if init_type == 'zeros':
        print('zero-based init')
        return initialize_with_zeros(D, K)
    elif init_type == 'random':
        print('random-based init')
        return initialize_randomly(D, K)
    elif init_type == 'he':
        print('he-based init')
        return initialize_he(D, K)

In [108]:
def sigmoid(z):
    """
    Compute the sigmoid of z

    Arguments:
    z -- A scalar or numpy array of any size.

    Return:
    s -- sigmoid(z)
    """
    
    s = 1/(1+np.exp(-z))    
    
    return s

In [109]:
def compute_cost(Y, Y_hat):
           
    m = Y.shape[1]
    L = -(np.multiply(Y,np.log(Y_hat))+np.multiply((1-Y),np.log(1-Y_hat)))
    L = (1/m)*np.sum(L)
        
    return L

In [110]:
def propagate(W, b, X, y, use_reg=False, reg_lambda=0.01):
    """
    Implement the cost function and its gradient for the propagation explained above

    Arguments:
    w -- weights, a numpy array of size (num_px * num_px * 3, 1)
    b -- bias, a scalar
    X -- data of size (num_px * num_px * 3, number of examples)
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat) of size (1, number of examples)

    Return:
    cost -- negative log-likelihood cost for logistic regression
    dW -- gradient of the loss with respect to w, thus same shape as w
    db -- gradient of the loss with respect to b, thus same shape as b
    
    Tips:
    - Write your code step by step for the propagation. np.log(), np.dot()
    """
    
    #print("Propagate")
    #print("W, b, X, y")
    #print(W.shape, b.shape, X.shape, y.shape)
    
    m = X.shape[1]
    z = np.dot(W.T,X) + b  
    
    #print("m, z = W.T * X + b")
    #print(m, z.shape)
    
    a = sigmoid(z)
    
    #print("a")
    #print(a.shape)
    
    # FORWARD PROPAGATION (FROM X TO COST)
    ### START CODE HERE ### (≈ 2 lines of code)
    cost = compute_cost(y, a)
    
    #print("cost")
    #print(cost)
    ### END CODE HERE ###
    
    if use_reg:
        ### START CODE HERE ### (≈ 2 lines of code)                
        cost = cost + ((reg_lambda/2)*(np.sum(np.power(W,2))))        
        ### END CODE HERE ###
    
    
    # BACKWARD PROPAGATION (TO FIND GRAD)
    ### START CODE HERE ### (≈ 2 lines of code)
    dz = a - y
    
    #print("dz = a - y")
    #print(dz.shape)
    
    dW = (1/m)* np.dot(X,dz.T)
    db = (1/m)* np.sum(dz)
    
    ### END CODE HERE ###
    
    if use_reg:
        ### START CODE HERE ### (≈ 2 lines of code)
        #dW = dW + ((reg_lambda/2)*(np.sum(np.power(dW,2))))
        dW += reg_lambda * W
        ### END CODE HERE ###

    assert(dW.shape == W.shape)
    assert(db.dtype == float)
    cost = np.squeeze(cost)
    assert(cost.shape == ())
    
    grads = {"dW": dW,
             "db": db}
    
    return grads, cost

In [111]:
def optimize(W, b, X, y, num_iterations, learning_rate, use_reg = False, reg_lambda = 0.01, print_cost = False):
    """
    This function optimizes w and b by running a gradient descent algorithm
    
    Arguments:
    w -- weights, a numpy array of size (num_px * num_px * 3, 1)
    b -- bias, a scalar
    X -- data of shape (num_px * num_px * 3, number of examples)
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat), of shape (1, number of examples)
    num_iterations -- number of iterations of the optimization loop
    learning_rate -- learning rate of the gradient descent update rule
    use_reg -- use regularization
    reg_lambda -- regularization weight
    print_cost -- True to print the loss every 100 steps
    
    Returns:
    params -- dictionary containing the weights w and bias b
    grads -- dictionary containing the gradients of the weights and bias with respect to the cost function
    costs -- list of all the costs computed during the optimization, this will be used to plot the learning curve.
    
    Tips:
    You basically need to write down two steps and iterate through them:
        1) Calculate the cost and the gradient for the current parameters. Use propagate().
        2) Update the parameters using gradient descent rule for w and b.
    """
    
    costs = []
    
    for ii in range(num_iterations):
        
        
        # Cost and gradient calculation (≈ 1-4 lines of code)
        ### START CODE HERE ### 
        if use_reg:
            #print("Using Reg")
            grads, cost = propagate(W, b, X, y,True, reg_lambda)
        else:
            grads, cost = propagate(W, b, X, y,False)
        ### END CODE HERE ###
        
        # Retrieve derivatives from grads
        dW = grads["dW"]
        db = grads["db"]
        
        #print("dW, db")
        #print(dW.shape, db.shape)
        
        # update rule (≈ 2 lines of code)
        ### START CODE HERE ###
        W = W - learning_rate*dW
        b = b - learning_rate*db 
        
        ### END CODE HERE ###
        
            
        # Record the costs
        if ii % 100 == 0:
            costs.append(cost)
            
        # Print the cost every 200 training iterations
        if print_cost and ii % 200 == 0:
            print ("Cost after iteration %i: %f" %(ii, cost))
    
    params = {"W": W,
              "b": b}
    
    grads = {"dW": dW,
             "db": db}
    
    return params, grads, costs

In [112]:
def predict(W, b, X):
    '''
    Predict whether the label is 0 or 1 using learned logistic regression parameters (w, b)
    
    Arguments:
    w -- weights, a numpy array of size (num_px * num_px * 3, 1)
    b -- bias, a scalar
    X -- data of size (num_px * num_px * 3, number of examples)
    
    Returns:
    Y_prediction -- a numpy array (vector) containing all predictions (0/1) for the examples in X
    '''
    
    m = X.shape[1]
    Y_prediction = np.zeros((1,m))
    
    # Compute vector "A" predicting the probabilities of a cat being present in the picture
    ### START CODE HERE ### (≈ 1 line of code)
    z = np.dot(W.T,X) + b    
    A = sigmoid(z)
    
    #print("Predict")
    #print("A")
    #print(A.shape)
    ### END CODE HERE ###
        
    for ii in range(A.shape[1]):
        
        # Convert probabilities A[0,i] to actual predictions p[0,i]
        ### START CODE HERE ### (≈ 4 lines of code)
        if A[0,ii] >= 0.5:
            Y_prediction[0,ii] = 1
        else:
            Y_prediction[0,ii] = 0
        
        ### END CODE HERE ###
    
    assert(Y_prediction.shape == (1, m))
    
    return Y_prediction

In [123]:
def train(X_train, y_train, D=2,K=2, num_iterations=2000, learning_rate=0.5, use_reg=True, reg_lambda=0.01, init_type='zeros', print_cost=False):
    """
    Builds the logistic regression model by calling the function you've implemented previously
    
    Arguments:
    X_train -- training set represented by a numpy array of shape (num_px * num_px * 3, m_train)
    Y_train -- training labels represented by a numpy array (vector) of shape (1, m_train)
    X_test -- test set represented by a numpy array of shape (num_px * num_px * 3, m_test)
    Y_test -- test labels represented by a numpy array (vector) of shape (1, m_test)
    num_iterations -- hyperparameter representing the number of iterations to optimize the parameters
    learning_rate -- hyperparameter representing the learning rate used in the update rule of optimize()
    print_cost -- Set to true to print the cost every 100 iterations
    
    Returns:
    d -- dictionary containing information about the model.
    """
    
    
    # initialize parameters with zeros (≈ 1 line of code)
    ### START CODE HERE ### (≈ 1 line of code)
    W,b = initialize_params(D, K, init_type)
    #print(W.shape)
    #print(b.shape)
    #print(W)
    #print(b)
    
    ### END CODE HERE ###

    # Gradient descent (≈ 1 line of code)
    ### START CODE HERE ### (≈ 1 line of code)
    #print(use_reg)
    parameters, grads, costs = optimize(W, b, X_train, y_train, num_iterations, learning_rate,use_reg, reg_lambda,print_cost)
    
    ### END CODE HERE ###
        
    # Retrieve parameters w and b from dictionary "parameters"
    W = parameters["W"]
    b = parameters["b"]
    
    Y_prediction_train = predict(W, b, X_train)
    
    # Print train/test Errors
    print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - y_train)) * 100))
    
    d = {"costs": costs,
         "W" : W, 
         "b" : b,
         "learning_rate" : learning_rate,
         "num_iterations": num_iterations}
    
    return d

In [124]:
# train model here (≈ 1 line of code), e.g. with learning_rate = 0.05
### START CODE HERE ### (≈ 1 line of code)
d =  train(X, y,num_features, 2, 3000, 0.8, False, 0.01, 'he', True)
### END CODE HERE ###
W = d['W']
b = d['b']

he-based init
Cost after iteration 0: 1.387265
Cost after iteration 200: 1.296872
Cost after iteration 400: 1.233559
Cost after iteration 600: 1.186051
Cost after iteration 800: 1.148387
Cost after iteration 1000: 1.117296
Cost after iteration 1200: 1.090877
Cost after iteration 1400: 1.067954
Cost after iteration 1600: 1.047754
Cost after iteration 1800: 1.029741
Cost after iteration 2000: 1.013528
Cost after iteration 2200: 0.998822
Cost after iteration 2400: 0.985399
Cost after iteration 2600: 0.973079
Cost after iteration 2800: 0.961717
train accuracy: 80.684 %


In [None]:
# train model here (≈ 1 line of code), e.g. with learning_rate = 0.05
### START CODE HERE ### (≈ 1 line of code)
d =  train(X, y,num_features, 2, 4000, 0.8, False, 0.01, 'he', True)
### END CODE HERE ###
W = d['W']
b = d['b']

he-based init
Cost after iteration 0: 1.387855
Cost after iteration 200: 1.297979


In [114]:
print(X.shape, y.shape)

(300, 50000) (1, 50000)


In [115]:
XX = X.T
yy = y.T
print(XX.shape,yy.shape)

(50000, 300) (50000, 1)


In [116]:
X_train_, X_valid_, y_train_, y_valid_ = XX[:45000], XX[:5000], yy[:45000], yy[45000:]
print(X_train.shape)
print(X_valid.shape)
print(y_train.shape)
print(y_valid.shape)

(300, 45000)
(300, 5000)
(1, 45000)
(1, 5000)


In [117]:
X_train = X_train_.T
X_valid = X_valid_.T
y_train = y_train_.T
y_valid = y_valid_.T
print(X_train.shape)
print(X_valid.shape)
print(y_train.shape)
print(y_valid.shape)


(300, 45000)
(300, 5000)
(1, 45000)
(1, 5000)


### References

http://nbviewer.jupyter.org/github/rasbt/pattern_classification/blob/master/machine_learning/scikit-learn/outofcore_modelpersistence.ipynb

https://taylorwhitten.github.io/blog/word2vec