In [1]:
# Preprocessing data or cleaning!
# Autor: Erick Tornero
# Topic: Sentiment prediction, Word Embedding, Back-propagation 

In [2]:
import pandas as pd
import pyprind

# Definición de función de procesado de texto:

Esta funciona ayuda en la limpieza de cada **review**, eliminando los siguientes caracteres del texto y dejando solo las palabras en minúscula: 

* [., :, ;, ', ", (, ), [, ]]

Reemplaza los siguientes caracteres por espacios:
* <br ../> <br\ .../> , -, /

In [3]:
# Return a lower case proccesed text
def processtext(texto):
    import re
    REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\')|(\?)|(\,)|(\")|(\!)|(\()|(\))|(\[)|(\])|(\n)")
    REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
    texto = REPLACE_NO_SPACE.sub('', texto.lower())
    texto = REPLACE_WITH_SPACE.sub(' ', texto)
    return texto

In [4]:
# Delete Unnecessary characters!, this is pronouns and
# Other irrelevand words see more bellow
def deleteUnusefull(texto):
    # Remove html
    from bs4 import BeautifulSoup 
    texto = review_text = BeautifulSoup(texto).get_text()
    import re
    varss = [r'\bi\b',r'\ba\b',r'\bor\b',r'\bthe\b',r'\bme\b',r'\bthey\b', r'\bmy\b',r'\bis\b',r'\bto\b',r'\bof\b',r'\bby\b',r'\bin\b',r'\bon\b',r'\band\b',r'\bwith\b',r'\bhis\b',r'\bher\b',r'\*',r'\$']
    for patt in varss:
        texto = re.sub(patt, '',texto)
    # Removing non alphabetic letters
    texto = re.sub("[^a-zA-Z]"," ",texto)
    
    texto = re.sub('   ',' ', texto)
    texto = re.sub('  ', ' ', texto)
    if texto[0] == ' ':
        texto = texto[1:]
    if texto[-1] == ' ':
        texto = texto[:-1]
    return texto

## Clean data

In [5]:
df = pd.read_csv('shuffled_movie_data.csv')
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [36]:
# Get a dataframe called *newdf*
# Here is extracted all features of all Reviews
# Aproximately five minutes to proccess
# All the review is considered as a just one sentence
pbar = pyprind.ProgBar(df.shape[0])
newdf = pd.DataFrame(columns=['review', 'sentiment'])
for _, row in df.iterrows():
    texto = row['review']
    sent = row['sentiment']
    texto = processtext(texto)
    texto = deleteUnusefull(texto)
    newdf = newdf.append({'review':texto, 'sentiment':sent}, ignore_index=True)
    pbar.update()
newdf.index.name = 'Id'



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:04:59


In [38]:
# Here you can see that all the reviews are cleanned
# All in lower case, and unseful words was removed
newdf.head()

Unnamed: 0_level_0,review,sentiment
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,teenager martha moxley maggie grace moves hig...,1
1,ok so really like kris kristofferson usual eas...,0
2,spoiler do not read this if you think about wa...,0
3,hi for all people who have seen this wonderful...,1
4,recently bought dvd forgetting just how much h...,0


In [50]:
newdf.to_csv('textcleaned.csv')

In [6]:
#newdf = pd.read_csv('textcleaned.csv', index_col = 0)

In [8]:
# Get all the sentences, consider that a sentence is a complete review
sentences = []
for text in newdf['review']:
    sentences.append(text.split())

In [9]:
len(sentences)

50000

# Train own Word2vect

Train Word2Vect, that rely on the words of our dataset

In [10]:
SZ_EMB_WORD = 100

In [11]:
# workers: Threads, depends of the pc in this case 4.
# Size: Size of the vector: we'll test with 100
from gensim.models import word2vec
modelW2V = word2vec.Word2Vec(sentences, workers= 4,size=SZ_EMB_WORD,min_count=20,window=20)
modelW2V.init_sims(replace=True)
modelW2V.save('modelreviewfilms')

In [12]:
# Test some similar word2word
modelW2V.wv.most_similar('excellent')

  if np.issubdtype(vec.dtype, np.int):


[('outstanding', 0.8567633628845215),
 ('exceptional', 0.8100523352622986),
 ('superb', 0.7611526250839233),
 ('fantastic', 0.7236974239349365),
 ('terrific', 0.7225155830383301),
 ('brilliant', 0.6885412931442261),
 ('great', 0.6859533786773682),
 ('fine', 0.6806014776229858),
 ('ensemble', 0.6802557706832886),
 ('amazing', 0.6654784679412842)]

We can se the the words more similar to *excellent* are sinonims to this word

In [13]:
# Each vector of each word correspond to a vector of 100x1
modelW2V.wv.get_vector('excellent').shape

(100,)

# Split into train & test data

In [14]:
X = newdf['review'].values
Y = newdf['sentiment'].values
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 20)

In [15]:
import numpy as np

In [16]:
count = 0
for xx in X_train:
    count += 1
print(count)

35000


In [17]:
# Get all the reviews in a single list
reviews = []
for review in X_train:
    reviews.append(review.split())

# Calculate the vector of the reviews
i = 0
X_trainVect = np.zeros((1, SZ_EMB_WORD), dtype='float16')
pbar = pyprind.ProgBar(len(reviews))
for review in reviews:
    feat = np.zeros(SZ_EMB_WORD, dtype='float16')
    ind = set(modelW2V.wv.index2word)
    n = 0
    for word in review:
        if word in ind:
            n += 1
            feat = np.add(feat, modelW2V[word])
    feat = np.divide(X_trainVect, n)
    #print(X_trainVect[i,:].shape)
    X_trainVect = np.append(X_trainVect,feat, axis = 0)
    i+=1
    pbar.update()



MemoryError: 

In [None]:
p