In [117]:
import pandas as pd
import numpy as np

In [118]:
train = pd.read_csv( "../data/processed/train_1.csv")
test = pd.read_csv("../data/processed/test_1.csv")
validation = pd.read_csv("../data/processed/validation_1.csv")

In [119]:
from sklearn.model_selection import train_test_split
X = train['review']
y = train['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [120]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(train['review'].apply(lambda x: x.split()), workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

2018-07-13 15:51:49,281 : INFO : collecting all words and their counts
2018-07-13 15:51:49,283 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-07-13 15:51:49,331 : INFO : PROGRESS: at sentence #10000, processed 114619 words, keeping 4494 word types
2018-07-13 15:51:49,348 : INFO : collected 5249 word types from a corpus of 160338 raw words and 13928 sentences
2018-07-13 15:51:49,349 : INFO : Loading a fresh vocabulary
2018-07-13 15:51:49,354 : INFO : min_count=40 retains 454 unique words (8% of original 5249, drops 4795)
2018-07-13 15:51:49,355 : INFO : min_count=40 leaves 138864 word corpus (86% of original 160338, drops 21474)
2018-07-13 15:51:49,358 : INFO : deleting the raw counts dictionary of 5249 items
2018-07-13 15:51:49,359 : INFO : sample=0.001 downsamples 82 most-common words
2018-07-13 15:51:49,363 : INFO : downsampling leaves estimated 84434 word corpus (60.8% of prior 138864)
2018-07-13 15:51:49,366 : INFO : estimated required memory for 45

Training model...


2018-07-13 15:51:49,556 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-07-13 15:51:49,564 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-13 15:51:49,568 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-13 15:51:49,575 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-13 15:51:49,578 : INFO : EPOCH - 1 : training on 160338 raw words (84315 effective words) took 0.2s, 452542 effective words/s
2018-07-13 15:51:49,711 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-07-13 15:51:49,717 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-13 15:51:49,727 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-13 15:51:49,741 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-13 15:51:49,742 : INFO : EPOCH - 2 : training on 160338 raw words (84394 effective words) took 0.2s, 553388 effective words/s
2018

In [156]:
model['lazada']

  """Entry point for launching an IPython kernel.


array([-0.06006086,  0.00327906, -0.05253839, -0.01203741, -0.02581332,
        0.05838272,  0.00149239, -0.05405663,  0.03555226, -0.01079278,
       -0.09099109,  0.04783284, -0.03617392,  0.04270801,  0.0406545 ,
        0.09556015,  0.10160053,  0.05793571,  0.05568072, -0.04543582,
       -0.09321297,  0.01676505,  0.01188506,  0.00528025, -0.04830434,
        0.06149045,  0.00230221,  0.07318373, -0.02993194, -0.0151897 ,
       -0.14820875, -0.00793065,  0.0791732 ,  0.0776471 , -0.02176563,
        0.05591822, -0.06597566,  0.09938698, -0.00050606, -0.03552577,
       -0.09486289, -0.01141637, -0.09213318, -0.01021278, -0.0276529 ,
       -0.03310291,  0.00365925, -0.08330203,  0.04226304,  0.08726559,
        0.00925407,  0.1445665 , -0.02600041,  0.04782408, -0.06948743,
       -0.03908796,  0.03350278, -0.08524194, -0.04375795, -0.06435312,
        0.00332008, -0.00211642,  0.0893173 , -0.01244638,  0.00173232,
       -0.00682147,  0.0988653 ,  0.00399418,  0.04593932,  0.00

In [194]:
import numpy as np  # Make sure that numpy is imported

def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((1,num_features))
    #
    nwords = 0
    # 
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.wv.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    # 
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter
    counter = 0.
    print(type(counter))
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features))
    # 
    # Loop through the reviews
    for review in reviews:
        if counter%1000. == 0.:
            print("Review %d of %d" % (counter, len(reviews)))
        reviewFeatureVecs[int(counter)] = makeFeatureVec(review, model,num_features)
        counter = counter + 1.
    return reviewFeatureVecs

In [195]:
# ****************************************************************
# Calculate average feature vectors for training and testing sets,
# using the functions we defined above. Notice that we now use stop word
# removal.

clean_train_reviews = X_train.apply(lambda x: x.split())

train_data_features = getAvgFeatureVecs(clean_train_reviews, model, num_features )

print("Creating average feature vecs for test reviews")
clean_test_reviews = X_test.apply(lambda x:x.split())

test_data_features = getAvgFeatureVecs(clean_test_reviews, model, num_features )

<class 'float'>
Review 0 of 11142
Review 1000 of 11142




Review 2000 of 11142
Review 3000 of 11142
Review 4000 of 11142
Review 5000 of 11142
Review 6000 of 11142
Review 7000 of 11142
Review 8000 of 11142
Review 9000 of 11142
Review 10000 of 11142
Review 11000 of 11142
Creating average feature vecs for test reviews
<class 'float'>
Review 0 of 2786
Review 1000 of 2786
Review 2000 of 2786


In [196]:
print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
%time forest = forest.fit(train_data_features, y_train)

Training the random forest...


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [6]:
test_data_features = vectorizer.transform(X_test)
test_data_features = test_data_features.toarray()
pred = forest.predict(test_data_features)

In [7]:
from sklearn.metrics import accuracy_score, log_loss,confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
acc = accuracy_score(y_test,pred)
cm = confusion_matrix(y_test,pred)
print("Accuracy Score: " + str(acc))
print("Confusion Matrix: "+ str(cm))

Accuracy Score: 0.7710427135678392
Confusion Matrix: [[ 579  497]
 [ 232 1876]]


In [167]:
# 1. import
from sklearn.naive_bayes import MultinomialNB

# 2. instantiate a Multinomial Naive Bayes model
nb = MultinomialNB()
%time nb.fit(train_data_features, y_train)
pred = nb.predict(test_data_features)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [9]:
acc = accuracy_score(y_test, pred)
print("Accuracy Score: " + str(acc))

Accuracy Score: 0.7584798994974874


In [10]:
null_ = []
for i in range(0,len(y_test)):
    null_.append(1)
null_accuracy = accuracy_score(y_test, null_)
print('Null accuracy:', null_accuracy)

Null accuracy: 0.6620603015075377


In [11]:
# 1. import
import lightgbm as lgb

In [12]:
# 2. instantiate a Multinomial Naive Bayes model
lgbm = lgb.LGBMClassifier()
%time lgbm.fit(train_data_features, y_train)
pred = lgbm.predict(test_data_features)
acc = accuracy_score(y_test, pred)
print("Accuracy Score: " + str(acc))

CPU times: user 5.88 s, sys: 27.8 ms, total: 5.91 s
Wall time: 1.6 s
Accuracy Score: 0.7751256281407035


  if diff:


In [13]:
import xgboost as xgb

In [14]:
xgbo = xgb.XGBClassifier()
%time xgbo.fit(train_data_features, y_train)
predic = xgbo.predict(test_data_features)
acc = accuracy_score(y_test, predic)
print("Accuracy Score: " + str(acc))

CPU times: user 1min 29s, sys: 296 ms, total: 1min 29s
Wall time: 1min 29s
Accuracy Score: 0.7550251256281407


  if diff:


In [15]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
%time logistic.fit(train_data_features, y_train)
predic = logistic.predict(test_data_features)
acc = accuracy_score(y_test, predic)
print("Accuracy Score: " + str(acc))

CPU times: user 157 ms, sys: 5 µs, total: 157 ms
Wall time: 157 ms
Accuracy Score: 0.7713567839195979
