In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv( "../data/processed/train_1.csv")
test = pd.read_csv("../data/processed/test_1.csv")
validation = pd.read_csv("../data/processed/validation_1.csv")

In [3]:
from sklearn.model_selection import train_test_split
X = train['review']
y = train['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [4]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(train['review'].apply(lambda x: x.split()), workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

2018-07-16 19:58:42,911 : INFO : 'pattern' package not found; tag filters are not available for English
2018-07-16 19:58:43,024 : INFO : collecting all words and their counts
2018-07-16 19:58:43,027 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-07-16 19:58:43,065 : INFO : PROGRESS: at sentence #10000, processed 115672 words, keeping 4475 word types
2018-07-16 19:58:43,084 : INFO : collected 5577 word types from a corpus of 183352 raw words and 15918 sentences
2018-07-16 19:58:43,085 : INFO : Loading a fresh vocabulary
2018-07-16 19:58:43,091 : INFO : min_count=40 retains 493 unique words (8% of original 5577, drops 5084)
2018-07-16 19:58:43,092 : INFO : min_count=40 leaves 160490 word corpus (87% of original 183352, drops 22862)
2018-07-16 19:58:43,096 : INFO : deleting the raw counts dictionary of 5577 items
2018-07-16 19:58:43,098 : INFO : sample=0.001 downsamples 82 most-common words
2018-07-16 19:58:43,101 : INFO : downsampling leaves estimated 983

Training model...


2018-07-16 19:58:43,459 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-07-16 19:58:43,463 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-16 19:58:43,465 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-16 19:58:43,488 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-16 19:58:43,489 : INFO : EPOCH - 1 : training on 183352 raw words (98401 effective words) took 0.3s, 293006 effective words/s
2018-07-16 19:58:43,741 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-07-16 19:58:43,744 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-16 19:58:43,755 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-16 19:58:43,771 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-16 19:58:43,772 : INFO : EPOCH - 2 : training on 183352 raw words (98472 effective words) took 0.3s, 389734 effective words/s
2018

In [5]:
model['lazada']

  """Entry point for launching an IPython kernel.


array([-0.0187094 , -0.02739009, -0.01348962,  0.02048195, -0.00734926,
       -0.04537932,  0.11759152, -0.02603204,  0.08739193, -0.00571861,
       -0.04961926,  0.04553705,  0.06315306, -0.06911712, -0.07582539,
        0.00601572,  0.03720866,  0.00787029,  0.03414796,  0.01396811,
       -0.05495732,  0.01952831,  0.01075636,  0.00448649, -0.02663456,
       -0.02846765, -0.02031692,  0.01705346,  0.06069159, -0.0536174 ,
       -0.08966198,  0.0980629 ,  0.08366669,  0.0241677 , -0.02620028,
        0.09934015,  0.02926669,  0.06896932,  0.11054508,  0.0857825 ,
       -0.04380687, -0.05360619,  0.08893705,  0.03680015,  0.00671445,
       -0.06334242, -0.01029712,  0.01846837,  0.0626487 , -0.13476504,
       -0.03411483, -0.02727547,  0.09465968,  0.02500475, -0.02166574,
        0.01463586,  0.03920843,  0.049907  ,  0.01668175,  0.08251037,
       -0.06618717, -0.07928805, -0.14206052, -0.07374915, -0.08426023,
        0.11673465,  0.03485214,  0.05841888, -0.02573732, -0.03

In [6]:
import numpy as np  # Make sure that numpy is imported

def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((1,num_features))
    #
    nwords = 0
    # 
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.wv.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    # 
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter
    counter = 0.
    print(type(counter))
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features))
    # 
    # Loop through the reviews
    for review in reviews:
        if counter%1000. == 0.:
            print("Review %d of %d" % (counter, len(reviews)))
        reviewFeatureVecs[int(counter)] = makeFeatureVec(review, model,num_features)
        counter = counter + 1.
    return reviewFeatureVecs

In [7]:
# ****************************************************************
# Calculate average feature vectors for training and testing sets,
# using the functions we defined above. Notice that we now use stop word
# removal.

clean_train_reviews = X_train.apply(lambda x: x.split())

train_data_features = getAvgFeatureVecs(clean_train_reviews, model, num_features )

print("Creating average feature vecs for test reviews")
clean_test_reviews = X_test.apply(lambda x:x.split())

test_data_features = getAvgFeatureVecs(clean_test_reviews, model, num_features )



<class 'float'>
Review 0 of 12734
Review 1000 of 12734
Review 2000 of 12734
Review 3000 of 12734
Review 4000 of 12734
Review 5000 of 12734
Review 6000 of 12734
Review 7000 of 12734
Review 8000 of 12734
Review 9000 of 12734
Review 10000 of 12734
Review 11000 of 12734
Review 12000 of 12734
Creating average feature vecs for test reviews
<class 'float'>
Review 0 of 3184
Review 1000 of 3184
Review 2000 of 3184
Review 3000 of 3184
