In [4]:
import pandas as pd

In [5]:
from nltk.corpus import stopwords

In [6]:
df = pd.read_csv('row_cleaned.csv', index_col = 0)

In [7]:
df.head()

Unnamed: 0_level_0,review,sentiment
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1974 teenager martha moxley maggie grace move...,1
1,ok so really like kris kristofferson usual eas...,0
2,spoiler do not read this if you think about wa...,0
3,hi for all people who have seen this wonderful...,1
4,recently bought dvd forgetting just how much ...,0


In [8]:
# This function converts a text to a sequence of words.
def review_wordlist(review, remove_stopwords=False):
    # 1. Removing html tags
    review_text = BeautifulSoup(review).get_text()
    # 2. Removing non-letter.
    review_text = re.sub("[^a-zA-Z]"," ",review_text)
    # 3. Converting to lower case and splitting
    words = review_text.lower().split()
    # 4. Optionally remove stopwords
    if remove_stopwords:
        stops = set(stopwords.words("english"))     
        words = [w for w in words if not w in stops]
    
    return(words)

In [11]:
import nltk.data
#nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/erick/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [13]:


# This function splits a review into sentences
def review_sentences(review, tokenizer, remove_stopwords=False):
    # 1. Using nltk tokenizer
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    # 2. Loop for each sentence
    for raw_sentence in raw_sentences:
        if len(raw_sentence)>0:
            sentences.append(review_wordlist(raw_sentence,\
                                            remove_stopwords))

    # This returns the list of lists
    return sentences



In [14]:
X = df['review'].values
Y = df['sentiment'].values
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 20)

In [15]:
train = pd.DataFrame({'review': X_train, 'sentiment':Y_train})
test = pd.DataFrame({'review': X_test, 'sentiment':Y_test})

In [16]:
train.head()

Unnamed: 0,review,sentiment
0,im large scarred heterosexual male ex bouncer ...,1
1,watched this movie about six years ago recent...,0
2,obviously it seems many people really enjoyed ...,0
3,whats happening rgv he seems repeat himself ev...,0
4,have seen poor movies time but this really ta...,0


In [17]:
from bs4 import BeautifulSoup 
import re

sentences = []
print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_sentences(review, tokenizer)

Parsing sentences from training set




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [18]:
len(sentences)

35000

In [19]:


# Importing the built-in logging module
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [21]:
# Creating the model and setting values for the various parameters
num_features = 300  # Word vector dimensionality
min_word_count = 40 # Minimum word count
num_workers = 4     # Number of parallel threads
context = 10        # Context window size
downsampling = 1e-3 # (0.001) Downsample setting for frequent words

# Initializing the train model
from gensim.models import word2vec
print("Training model....")
model = word2vec.Word2Vec(sentences,\
                          workers=num_workers,\
                          size=num_features,\
                          min_count=min_word_count,\
                          window=context,
                          sample=downsampling)

# To make the model memory efficient
model.init_sims(replace=True)

# Saving the model for later use. Can be loaded using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

2018-11-08 16:39:41,724 : INFO : 'pattern' package not found; tag filters are not available for English
2018-11-08 16:39:41,739 : INFO : collecting all words and their counts
2018-11-08 16:39:41,742 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training model....


2018-11-08 16:39:42,275 : INFO : PROGRESS: at sentence #10000, processed 1740785 words, keeping 59796 word types
2018-11-08 16:39:42,649 : INFO : PROGRESS: at sentence #20000, processed 3455216 words, keeping 83128 word types
2018-11-08 16:39:43,020 : INFO : PROGRESS: at sentence #30000, processed 5202230 words, keeping 101327 word types
2018-11-08 16:39:43,197 : INFO : collected 109191 word types from a corpus of 6054816 raw words and 35000 sentences
2018-11-08 16:39:43,198 : INFO : Loading a fresh vocabulary
2018-11-08 16:39:43,475 : INFO : effective_min_count=40 retains 10220 unique words (9% of original 109191, drops 98971)
2018-11-08 16:39:43,475 : INFO : effective_min_count=40 leaves 5600211 word corpus (92% of original 6054816, drops 454605)
2018-11-08 16:39:43,510 : INFO : deleting the raw counts dictionary of 109191 items
2018-11-08 16:39:43,514 : INFO : sample=0.001 downsamples 52 most-common words
2018-11-08 16:39:43,514 : INFO : downsampling leaves estimated 4847355 word co

In [22]:
model.wv.most_similar('man')

  if np.issubdtype(vec.dtype, np.int):


[('woman', 0.6021546125411987),
 ('mans', 0.5838137865066528),
 ('boy', 0.5711212158203125),
 ('lady', 0.5195755958557129),
 ('guy', 0.5087496638298035),
 ('himself', 0.5019849538803101),
 ('soldier', 0.48895955085754395),
 ('priest', 0.474894642829895),
 ('person', 0.45540571212768555),
 ('businessman', 0.4490589499473572)]

In [23]:
model.wv.most_similar('awful')

  if np.issubdtype(vec.dtype, np.int):


[('terrible', 0.853543758392334),
 ('dreadful', 0.8195064067840576),
 ('horrible', 0.797717809677124),
 ('lousy', 0.7493232488632202),
 ('atrocious', 0.7393767833709717),
 ('horrendous', 0.7159844636917114),
 ('pathetic', 0.7141166925430298),
 ('bad', 0.6993353962898254),
 ('horrid', 0.6980143785476685),
 ('abysmal', 0.688806414604187)]

In [24]:
model.wv.syn0.shape

  """Entry point for launching an IPython kernel.


(10220, 300)

In [25]:
# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.wv.index2word)
    
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model[word])
    
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

In [26]:
# Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
        if counter%1000 == 0:
            print("Review %d of %d"%(counter,len(reviews)))
            
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
        counter = counter+1
        
    return reviewFeatureVecs

In [28]:
import numpy as np
# Calculating average feature vector for training set
clean_train_reviews = []
for review in train['review']:
    clean_train_reviews.append(review_wordlist(review, remove_stopwords=True))
    
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Review 0 of 35000


  del sys.path[0]


Review 1000 of 35000
Review 2000 of 35000
Review 3000 of 35000
Review 4000 of 35000
Review 5000 of 35000
Review 6000 of 35000
Review 7000 of 35000
Review 8000 of 35000
Review 9000 of 35000
Review 10000 of 35000
Review 11000 of 35000
Review 12000 of 35000
Review 13000 of 35000
Review 14000 of 35000
Review 15000 of 35000
Review 16000 of 35000
Review 17000 of 35000
Review 18000 of 35000
Review 19000 of 35000
Review 20000 of 35000
Review 21000 of 35000
Review 22000 of 35000
Review 23000 of 35000
Review 24000 of 35000
Review 25000 of 35000
Review 26000 of 35000
Review 27000 of 35000
Review 28000 of 35000
Review 29000 of 35000
Review 30000 of 35000
Review 31000 of 35000
Review 32000 of 35000
Review 33000 of 35000
Review 34000 of 35000


In [29]:
# Calculating average feature vector for testing set
clean_test_reviews = []
for review in test['review']:
    clean_test_reviews.append(review_wordlist(review, remove_stopwords=True))
    
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Review 0 of 15000


  del sys.path[0]


Review 1000 of 15000
Review 2000 of 15000
Review 3000 of 15000
Review 4000 of 15000
Review 5000 of 15000
Review 6000 of 15000
Review 7000 of 15000
Review 8000 of 15000
Review 9000 of 15000
Review 10000 of 15000
Review 11000 of 15000
Review 12000 of 15000
Review 13000 of 15000
Review 14000 of 15000


In [30]:
# Fitting a random forest classifier to the training data
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)
    
print("Fitting random forest to training data....")    
forest = forest.fit(trainDataVecs, train["sentiment"])

  from numpy.core.umath_tests import inner1d


Fitting random forest to training data....


In [32]:
result = forest.predict(testDataVecs)

In [33]:
Y_ = test['sentiment'].values

In [34]:
Y_ = Y_.reshape(Y_.shape[0],1)

In [35]:
result = result.reshape(result.shape[0],1)

In [37]:
result

array([[0],
       [0],
       [0],
       ...,
       [1],
       [0],
       [1]])

In [38]:
def score(y1, y2):
    le = y1.shape[0]
    if le == y2.shape[0]:
        er = y1 - y2
        er = er*er
        toter = np.sum(er)
        return (le - toter)/le
    else:
        print('Input must be the same dimenssion')

In [39]:
score(result, Y_)

0.8451333333333333

In [40]:
result_tr = forest.predict(trainDataVecs)

In [41]:
Y_tr = train['sentiment'].values

In [42]:
Y_tr = Y_tr.reshape(Y_tr.shape[0],1)

In [44]:
score(result_tr, Y_tr)

KeyboardInterrupt: 