In [1]:
# Preprocessing data or cleaning!
# Autor: Erick Tornero
# Topic: Sentiment prediction, Word Embedding, Back-propagation 

In [2]:
import pandas as pd
import pyprind

# Definición de función de procesado de texto:

Esta funciona ayuda en la limpieza de cada **review**, eliminando los siguientes caracteres del texto y dejando solo las palabras en minúscula: 

* [., :, ;, ', ", (, ), [, ]]

Reemplaza los siguientes caracteres por espacios:
* <br ../> <br\ .../> , -, /

In [4]:
# Return a lower case proccesed text
def processtext(texto):
    import re
    REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\')|(\?)|(\,)|(\")|(\!)|(\()|(\))|(\[)|(\])|(\n)")
    REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
    texto = REPLACE_NO_SPACE.sub('', texto.lower())
    texto = REPLACE_WITH_SPACE.sub(' ', texto)
    return texto

In [5]:
# Delete Unnecessary characters!, this is pronouns and
# Other irrelevand words see more bellow
def deleteUnusefull(texto):
    # Remove html
    from bs4 import BeautifulSoup 
    texto = review_text = BeautifulSoup(texto).get_text()
    import re
    varss = [r'\bi\b',r'\ba\b',r'\bor\b',r'\bthe\b',r'\bme\b',r'\bthey\b', r'\bmy\b',r'\bis\b',r'\bto\b',r'\bof\b',r'\bby\b',r'\bin\b',r'\bon\b',r'\band\b',r'\bwith\b',r'\bhis\b',r'\bher\b',r'\*',r'\$']
    for patt in varss:
        texto = re.sub(patt, '',texto)
    # Removing non alphabetic letters
    texto = re.sub("[^a-zA-Z]"," ",texto)
    
    texto = re.sub('   ',' ', texto)
    texto = re.sub('  ', ' ', texto)
    if texto[0] == ' ':
        texto = texto[1:]
    if texto[-1] == ' ':
        texto = texto[:-1]
    return texto

## Clean data

In [7]:
df = pd.read_csv('shuffled_movie_data.csv')
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [8]:
# Get a dataframe called *newdf*
# Here is extracted all features of all Reviews
# Aproximately five minutes to proccess
# All the review is considered as a just one sentence
pbar = pyprind.ProgBar(df.shape[0])
newdf = pd.DataFrame(columns=['review', 'sentiment'])
for _, row in df.iterrows():
    texto = row['review']
    sent = row['sentiment']
    texto = processtext(texto)
    texto = deleteUnusefull(texto)
    newdf = newdf.append({'review':texto, 'sentiment':sent}, ignore_index=True)
    pbar.update()
newdf.index.name = 'Id'



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:04:13


In [38]:
# Here you can see that all the reviews are cleanned
# All in lower case, and unseful words was removed
newdf.head()

Unnamed: 0_level_0,review,sentiment
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,teenager martha moxley maggie grace moves hig...,1
1,ok so really like kris kristofferson usual eas...,0
2,spoiler do not read this if you think about wa...,0
3,hi for all people who have seen this wonderful...,1
4,recently bought dvd forgetting just how much h...,0


In [50]:
newdf.to_csv('textcleaned.csv')

In [3]:
newdf = pd.read_csv('textcleaned.csv', index_col = 0)

In [4]:
# Get all the sentences, consider that a sentence is a complete review
sentences = []
for text in newdf['review']:
    sentences.append(text.split())

In [5]:
len(sentences)

50000

# Train own Word2vect

Train Word2Vect, that rely on the words of our dataset

In [6]:
SZ_EMB_WORD = 100

In [7]:
# workers: Threads, depends of the pc in this case 4.
# Size: Size of the vector: we'll test with 100
from gensim.models import word2vec
modelW2V = word2vec.Word2Vec(sentences, workers= 4,size=SZ_EMB_WORD,min_count=20,window=20)
modelW2V.init_sims(replace=True)
modelW2V.save('modelreviewfilms')

In [8]:
# Test some similar word2word
modelW2V.wv.most_similar('excellent')

  if np.issubdtype(vec.dtype, np.int):


[('outstanding', 0.8578959107398987),
 ('superb', 0.8096444010734558),
 ('exceptional', 0.7709767818450928),
 ('terrific', 0.7370333075523376),
 ('fantastic', 0.7324416041374207),
 ('great', 0.6926354169845581),
 ('wonderful', 0.6718538999557495),
 ('fine', 0.6699702739715576),
 ('amazing', 0.6678991913795471),
 ('brilliant', 0.6601436138153076)]

We can se the the words more similar to *excellent* are sinonims to this word

In [9]:
# Each vector of each word correspond to a vector of 100x1
modelW2V.wv.get_vector('excellent').shape

(100,)

# Split into train & test data

In [10]:
X = newdf['review'].values
Y = newdf['sentiment'].values
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 20)

In [11]:
import numpy as np

In [12]:
count = 0
for xx in X_train:
    count += 1
print(count)

35000


# Getting the matrix of training

In [19]:
# Get all the reviews in a single list
reviews = []
for review in X_train:
    reviews.append(review.split())

# Calculate the vector of the reviews
i = 0
X_trainVect = np.zeros((1, SZ_EMB_WORD), dtype='float32')
#print(X_trainVect.shape)
pbar = pyprind.ProgBar(len(reviews))
for review in reviews:
    feat = np.zeros(SZ_EMB_WORD, dtype='float32')
    ind = set(modelW2V.wv.index2word)
    n = 0
    for word in review:
        if word in ind:
            n += 1
            feat = np.add(feat, modelW2V[word])
    feat = np.divide(feat, n)
    feat = feat.reshape(1,feat.shape[0])
    #print(feat.shape)
    #print(X_trainVect[i,:].shape)
    X_trainVect = np.append(X_trainVect,feat, axis = 0)
    i+=1
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:40


In [23]:
X_trainVect = X_trainVect[1:,:]

In [25]:
X_trainVect.shape

(35000, 100)

# Getting the matrix of testing

In [27]:
# Get all the reviews in a single list
reviews = []
for review in X_test:
    reviews.append(review.split())

# Calculate the vector of the reviews
i = 0
X_testVect = np.zeros((1, SZ_EMB_WORD), dtype='float32')
#print(X_trainVect.shape)
pbar = pyprind.ProgBar(len(reviews))
for review in reviews:
    feat = np.zeros(SZ_EMB_WORD, dtype='float32')
    ind = set(modelW2V.wv.index2word)
    n = 0
    for word in review:
        if word in ind:
            n += 1
            feat = np.add(feat, modelW2V[word])
    feat = np.divide(feat, n)
    feat = feat.reshape(1,feat.shape[0])
    #print(feat.shape)
    #print(X_trainVect[i,:].shape)
    X_testVect = np.append(X_testVect,feat, axis = 0)
    i+=1
    pbar.update()
X_testVect = X_testVect[1:,:]

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:40


In [28]:
X_testVect.shape

(15000, 100)

# Training

In [29]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100)

forest = forest.fit(X_trainVect, Y_train)

In [31]:
predictions = forest.predict(X_testVect)

In [32]:
def score(y1, y2):
    le = y1.shape[0]
    err = y1-y2
    err = err*err
    ter = np.sum(err)
    return (le - ter)/le

In [37]:
score(predictions, Y_test)

0.8521333333333333

In [75]:
np.save('X_trainV', X_trainVect)
np.save('Y_trainV', Y_train)
np.save('X_testV', X_testVect)
np.save('Y_testV', Y_test)

# Training with own back propagation algorithm

In [84]:
import numpy as np
np.random.seed(20)
class Neuron:
    def __init__(self, NumberInputs, alpha, isInput):
        # self.W = np.zeros(NumberInputs).reshape(NumberInputs, 1)
        if isInput == False:
            self.W = np.random.rand(NumberInputs).reshape(NumberInputs, 1)
            self.bias = np.random.random(1)
            self.NumberInputs = NumberInputs
            self.alpha = alpha
    

    def sigmoid(self, z):
        return 1/(1 + np.exp(-z))

    def GetOutput(self, X):
        #print(X.shape, self.W.shape)
        r = np.matmul(self.W.T, X) + self.bias
        return float(self.sigmoid(r))

    def UpdateWeights(self):
        pass



class Layer:
    def __init__(self, NumberNeurons, NumberInputs, alpha, isInput = False):
        self.Neurons = [Neuron(NumberInputs, alpha, isInput) for i in range(NumberNeurons)]
        self.NumberNeurons = NumberNeurons
        self.isInput = isInput

    def LayerOutput(self, X_in):
        Outs = []
        if self.isInput == False:
            for neuron in self.Neurons:
                Outs.append(neuron.GetOutput(X_in))

            return np.array(Outs).reshape(len(Outs), 1)
        else:
            # Ws = np.array([neuron.W for neuron in self.Neurons])
            # print(Ws)
            return X_in


class NeuralNetwork:
    def __init__(self, d, NumIn, alpha = 0.001):
        self.Layers = [0]*len(d)
        self.alpha = alpha
        for key in d:
            if(key == 0):
                self.Layers[key] = Layer(NumberNeurons = d[key], NumberInputs = NumIn, alpha = alpha, isInput=True)
            elif(key > 0):
                self.Layers[key] = Layer(NumberNeurons = d[key], NumberInputs = d[key -1], alpha = alpha, isInput=False)

    def propagate(self, X_in, justAnswer = False):
        X_t = np.array(0)
        L = []
        for index in range(len(self.Layers)):
            if index == 0:
                X_t = self.Layers[index].LayerOutput(X_in.T)
            else:
                X_t = self.Layers[index].LayerOutput(X_t)
            
            L.append(X_t)
        if justAnswer:
            return X_t
        else:
            return (X_t, L)
    
    def backward(self, Outs, y):
        y_hat = Outs[-1]
        deltaB = np.array(y_hat - y).reshape(1,1)
        delta = np.array(y_hat - y).reshape(1,1)*Outs[-2]
        for i in range(len(Outs) - 1):
            index = -1 - i
            X_in = Outs[index -1].copy()
            
            #curr_Wlen = self.Layers[index].Neurons[0].W.shape[0]
            #currW = np.ones(curr_Wlen).reshape(curr_Wlen, 1)
            #for k in range(self.Layers[index+1].NumberNeurons):
            #    currW = np.append(currW, self.Layers[index+1].Neurons[k].W, axis = 1)
            #currW = np.delete(currW, 0, axis = 1)

            #delta = (y-y_hat)*y_hat*(np.ones(y_hat.shape[0]).reshape(y_hat.shape[0],1) - y_hat)
            #deltaW = X_in*delta.T
            #deltaW.shape
            #for j in range(deltaW.shape[1]):
            #    self.Layers[index].Neurons[j].W = self.Layers[index].Neurons[j].W - self.alpha*deltaW[j]
            
            #print(deltaW, deltaW.shape)
            #delta = (y_hat-y)
            if index != -1:
                wlength = self.Layers[index+1].Neurons[0].W.shape[0]
                Ws = np.ones(wlength).reshape(wlength, 1)
                for k in range(self.Layers[index+1].NumberNeurons):
                    Ws = np.append(Ws, self.Layers[index+1].Neurons[k].W, axis = 1)
                Ws = np.delete(Ws, 0, axis = 1)
                #print(Ws, Ws.shape)
                deltaB = np.sum(deltaB.T * Ws, axis = 1).reshape(Ws.shape[0], 1)
                deltaB = deltaB * Outs[index]
                #print(deltaB, deltaB.shape)
                #print(delta.shape, Ws.shape, Outs[index].shape)
                delta = np.sum(delta*Ws*(1 -Outs[index]), axis = 1).reshape(Ws.shape[0],1)
                delta = delta.T*X_in
                #print('DeltaW>\n',delta, delta.shape)
            for j in range(i):
                pass
                #delta = delta*Outs[-2-j]*(1 - Outs[-2-j])
                
                #print('W info>\n',Ws, Ws.shape)
                #deltaB = deltaB*Outs[index]
                #deltaB = delta*Outs[-2-j]*(1 - Outs[-2-j])*W0
            #delta = X_in*delta.T
            #print(delta.shape)
            for j in range(self.Layers[index].NumberNeurons):
                self.Layers[index].Neurons[j].W = self.Layers[index].Neurons[j].W - delta[:,j].reshape(delta.shape[0],1)*self.alpha
                self.Layers[index].Neurons[j].bias = self.Layers[index].Neurons[j].bias - self.alpha*deltaB[j]

    def train(self, X, Y, ephocs = 10):
        rows, cols = X.shape
        for e in range(ephocs):
            error = 0
            for i in range(rows):
                x = X[i,:].reshape(1, cols)
                y = Y[i].reshape(1,1)
                #print(x.shape)
                y_hat, Outs = self.propagate(x)
                if y_hat >= 0.5:
                    ac = 1
                else:
                    ac = 0
                self.backward(Outs, y)
                error = error + (ac - y)*(ac - y)
            print(e+1, error/rows)
    
    def predict_one(self, X):
        a = self.propagate(X, justAnswer=True)
        if(a > 0.5):
            a = 1
        else:
            a = 0
        return a
    def predictSet(self, X):
        L = []
        for i in range(X.shape[0]):
            L.append(self.predict_one(X[i,:]))
        return np.array(L).reshape(len(L),1)

#dd = {0:4, 1:4, 2:2,3:1}
#NN = NeuralNetwork(dd, 4, 0.05)
#inp = np.ones(4).reshape(1,4)
#ans = NN.propagate(inp)
#print('Answer:>',ans[0], type(ans[0]), ans[0].shape )
#print(ans[1])
#Y = np.array([0])

#NN.backward(ans[1],Y.reshape(1,1))
#xx = 1


In [135]:
# Define the dictionary to initialize the neural network
# Input layer: 100 Neurons - 
# 1 Hidden Layer: 20 Neurons
# Output Layer: 1 Neuron
d = {0: X_trainVect.shape[1], 1: 20, 2: 1}

In [136]:
d

{0: 100, 1: 20, 2: 1}

In [137]:
NN = NeuralNetwork(d, X_trainVect.shape[1], 0.0005)

In [138]:
X_trainVect.shape

(35000, 100)

In [139]:
Y_train = Y_train.reshape(Y_train.shape[0],1)

In [140]:
Y_train.shape

(35000, 1)

In [141]:
NN.train(X_trainVect, Y_train, 100)

1 [[0.4154]]
2 [[0.38874286]]
3 [[0.37648571]]
4 [[0.36431429]]
5 [[0.35548571]]
6 [[0.34514286]]
7 [[0.3362]]
8 [[0.32948571]]
9 [[0.32251429]]
10 [[0.31662857]]
11 [[0.31188571]]
12 [[0.30711429]]
13 [[0.30348571]]
14 [[0.3]]
15 [[0.29737143]]
16 [[0.29537143]]
17 [[0.293]]
18 [[0.29117143]]
19 [[0.2898]]
20 [[0.28811429]]
21 [[0.2864]]
22 [[0.28508571]]
23 [[0.28397143]]
24 [[0.28328571]]
25 [[0.2826]]
26 [[0.28217143]]
27 [[0.28162857]]
28 [[0.28108571]]
29 [[0.28114286]]
30 [[0.28122857]]
31 [[0.28125714]]
32 [[0.28128571]]
33 [[0.28128571]]
34 [[0.28145714]]
35 [[0.28142857]]
36 [[0.28171429]]
37 [[0.28171429]]
38 [[0.28185714]]
39 [[0.28188571]]
40 [[0.2818]]
41 [[0.28171429]]
42 [[0.28162857]]
43 [[0.28177143]]
44 [[0.282]]
45 [[0.28174286]]
46 [[0.28145714]]
47 [[0.28105714]]
48 [[0.28088571]]
49 [[0.28068571]]
50 [[0.28037143]]
51 [[0.28028571]]
52 [[0.28034286]]
53 [[0.28002857]]
54 [[0.28]]
55 [[0.27997143]]
56 [[0.27951429]]
57 [[0.27905714]]
58 [[0.27871429]]
59 [[0.2784]

In [142]:
pred = NN.predictSet(X_testVect)

In [143]:
Y_test.shape

(15000,)

In [144]:
pred.shape

(15000, 1)

In [145]:
score(Y_test.reshape(15000,1), pred)

0.6423333333333333

The $\alpha$ coeficient must be reduce and the ephocs augmented in order to improve the accuracy

* Using RandomForest: Accuraccy> 0.852
* Using my backpropagation algorithm: Acuraccy> 0.6423