In [23]:
import pandas as pd
import numpy as np

In [24]:
train = pd.read_csv( "../data/processed/train_1.csv")
test = pd.read_csv("../data/processed/test_1.csv")
validation = pd.read_csv("../data/processed/validation_1.csv")

In [25]:
from sklearn.model_selection import train_test_split
X = train['review']
y = train['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [26]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(train['review'].apply(lambda x: x.split()), workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

2018-07-11 21:45:11,998 : INFO : collecting all words and their counts
2018-07-11 21:45:12,000 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-07-11 21:45:12,031 : INFO : PROGRESS: at sentence #10000, processed 115672 words, keeping 4475 word types
2018-07-11 21:45:12,043 : INFO : collected 5577 word types from a corpus of 183352 raw words and 15918 sentences
2018-07-11 21:45:12,044 : INFO : Loading a fresh vocabulary
2018-07-11 21:45:12,048 : INFO : min_count=40 retains 493 unique words (8% of original 5577, drops 5084)
2018-07-11 21:45:12,049 : INFO : min_count=40 leaves 160490 word corpus (87% of original 183352, drops 22862)
2018-07-11 21:45:12,052 : INFO : deleting the raw counts dictionary of 5577 items
2018-07-11 21:45:12,053 : INFO : sample=0.001 downsamples 82 most-common words
2018-07-11 21:45:12,054 : INFO : downsampling leaves estimated 98389 word corpus (61.3% of prior 160490)
2018-07-11 21:45:12,055 : INFO : estimated required memory for 49

Training model...


2018-07-11 21:45:12,231 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-07-11 21:45:12,235 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-11 21:45:12,236 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-11 21:45:12,238 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-11 21:45:12,240 : INFO : EPOCH - 1 : training on 183352 raw words (98417 effective words) took 0.2s, 649513 effective words/s
2018-07-11 21:45:12,386 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-07-11 21:45:12,392 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-11 21:45:12,393 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-11 21:45:12,399 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-11 21:45:12,400 : INFO : EPOCH - 2 : training on 183352 raw words (98735 effective words) took 0.1s, 678057 effective words/s
2018

In [28]:
model.wv.most_similar("baik")

[('kondisi', 0.9171394109725952),
 ('mulus', 0.8731058835983276),
 ('diterima', 0.8697145581245422),
 ('dan', 0.8541863560676575),
 ('keadaan', 0.8541389107704163),
 ('kemasan', 0.8493998050689697),
 ('aman', 0.8320677876472473),
 ('tanpa', 0.821256160736084),
 ('dalam', 0.811029851436615),
 ('telah', 0.806278645992279)]

In [29]:
import numpy as np  # Make sure that numpy is imported

def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    # 
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    # 
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter
    counter = 0.
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    # 
    # Loop through the reviews
    for review in reviews:
       #
       # Print a status message every 1000th review
       if counter%1000. == 0.:
           print("Review %d of %d".format(counter, len(reviews)))
       # 
       # Call the function (defined above) that makes average feature vectors
       reviewFeatureVecs[counter] = makeFeatureVec(review, model, \
           num_features)
       #
       # Increment the counter
       counter = counter + 1.
    return reviewFeatureVecs

In [30]:
# ****************************************************************
# Calculate average feature vectors for training and testing sets,
# using the functions we defined above. Notice that we now use stop word
# removal.

clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append( review_to_wordlist( review, \
        remove_stopwords=True ))

trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features )

print("Creating average feature vecs for test reviews")
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append( review_to_wordlist( review, \
        remove_stopwords=True ))

testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features )

NameError: name 'review_to_wordlist' is not defined

In [4]:
# print("Creating tfidf..")
# from sklearn.feature_extraction.text import TfidfVectorizer

# # Initialize the "CountVectorizer" object, which is scikit-learn's
# # bag of words tool.  
# vectorizer = TfidfVectorizer(analyzer = "word",
#                              tokenizer = None, 
#                              preprocessor = None,
#                              stop_words = None, 
#                              max_features = 5000) 

# # fit_transform() does two functions: First, it fits the model
# # and learns the vocabulary; second, it transforms our training data
# # into feature vectors. The input to fit_transform should be a list of 
# # strings.
# %time train_data_features = vectorizer.fit_transform(X_train)

# # Numpy arrays are easy to work with, so convert the result to an 
# # array
# train_data_features = train_data_features.toarray()

Creating tfidf..
CPU times: user 158 ms, sys: 0 ns, total: 158 ms
Wall time: 157 ms


In [5]:
print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
%time forest = forest.fit(train_data_features, y_train)

Training the random forest...
CPU times: user 1min 1s, sys: 133 ms, total: 1min 1s
Wall time: 1min 2s


In [6]:
test_data_features = vectorizer.transform(X_test)
test_data_features = test_data_features.toarray()
pred = forest.predict(test_data_features)

In [7]:
from sklearn.metrics import accuracy_score, log_loss,confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
acc = accuracy_score(y_test,pred)
cm = confusion_matrix(y_test,pred)
print("Accuracy Score: " + str(acc))
print("Confusion Matrix: "+ str(cm))

Accuracy Score: 0.7710427135678392
Confusion Matrix: [[ 579  497]
 [ 232 1876]]


In [8]:
# 1. import
from sklearn.naive_bayes import MultinomialNB

# 2. instantiate a Multinomial Naive Bayes model
nb = MultinomialNB()
%time nb.fit(train_data_features, y_train)
pred = nb.predict(test_data_features)

CPU times: user 145 ms, sys: 3.97 ms, total: 149 ms
Wall time: 148 ms


In [9]:
acc = accuracy_score(y_test, pred)
print("Accuracy Score: " + str(acc))

Accuracy Score: 0.7584798994974874


In [10]:
null_ = []
for i in range(0,len(y_test)):
    null_.append(1)
null_accuracy = accuracy_score(y_test, null_)
print('Null accuracy:', null_accuracy)

Null accuracy: 0.6620603015075377


In [11]:
# 1. import
import lightgbm as lgb

In [12]:
# 2. instantiate a Multinomial Naive Bayes model
lgbm = lgb.LGBMClassifier()
%time lgbm.fit(train_data_features, y_train)
pred = lgbm.predict(test_data_features)
acc = accuracy_score(y_test, pred)
print("Accuracy Score: " + str(acc))

CPU times: user 5.88 s, sys: 27.8 ms, total: 5.91 s
Wall time: 1.6 s
Accuracy Score: 0.7751256281407035


  if diff:


In [13]:
import xgboost as xgb

In [14]:
xgbo = xgb.XGBClassifier()
%time xgbo.fit(train_data_features, y_train)
predic = xgbo.predict(test_data_features)
acc = accuracy_score(y_test, predic)
print("Accuracy Score: " + str(acc))

CPU times: user 1min 29s, sys: 296 ms, total: 1min 29s
Wall time: 1min 29s
Accuracy Score: 0.7550251256281407


  if diff:


In [15]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
%time logistic.fit(train_data_features, y_train)
predic = logistic.predict(test_data_features)
acc = accuracy_score(y_test, predic)
print("Accuracy Score: " + str(acc))

CPU times: user 157 ms, sys: 5 µs, total: 157 ms
Wall time: 157 ms
Accuracy Score: 0.7713567839195979
