### <font color='green'> 1. Description</font>

Sentiment classification using Rotten Tomatoes (Movie) review dataset (binary classification).
Dataset can be downloaded from https://drive.google.com/file/d/1w1TsJB-gmIkZ28d1j7sf1sqcPmHXw352/view
Please download the data manually with your browser and store it in the `datasets` directory.
    
The Rotten Tomatoes movie review dataset is a corpus of movie reviews used for sentiment analysis. We will classify a review to be positive ('fresh') or negative ('rotten') on the basis of review text.
Using this dataset, we train a classifier to predict movie rating based on the review text.

### <font color='green'> 2. Data Preprocessing</font>

For RT review classification we will perform some data preparation and data cleaning steps. We will generate feature vectors for the review text using Frovedis Word2Vec embeddings (each review is represented by the mean of its word vectors).

In [1]:
import os
import re
import time
import pandas as pd
import numpy as np
from collections import OrderedDict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [2]:
def clean_review(review):
    """Return ``review`` with every character other than ASCII letters,
    digits and the space character replaced by a single space."""
    return re.sub(r'[^a-zA-Z0-9 ]', ' ', review)

def document_vector_frovedis(doc, frov_w2v_model, frov_vocab):
    """Average the word vectors of ``doc`` into one document vector.

    Words not contained in ``frov_vocab`` are ignored. The remaining words are
    looked up by label in ``frov_w2v_model`` (through ``.loc``) and averaged
    along axis 0. If no word of ``doc`` is in the vocabulary, a zero vector of
    length ``frov_w2v_model.shape[1]`` is returned. The result is a plain list.
    """
    embedding_width = frov_w2v_model.shape[1]
    known_words = [token for token in doc if token in frov_vocab]
    if not known_words:
        # Nothing to average: fall back to the all-zero embedding.
        return list(np.zeros(embedding_width))
    return list(np.mean(frov_w2v_model.loc[known_words], axis=0))

In [3]:
def create_w2v_embed(df):
    """Train a Frovedis Word2Vec model on ``df["Review"]`` and embed every review.

    A single-process Frovedis server is started (``mpirun -np 1``) with
    ``VE_OMP_NUM_THREADS`` set to 8. The server is always shut down and the
    thread setting is always put back to 1, even when training or the
    transform raises, so a failure here cannot leave a stale server behind
    for the cells that follow.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Review" column. The caller in this notebook passes one
        list of tokens per row.

    Returns
    -------
    pandas.DataFrame
        The output of ``Word2Vec.transform(..., func=np.mean)`` wrapped in a
        DataFrame (one row per review).

    Raises
    ------
    KeyError
        If ``df`` has no "Review" column or the ``FROVEDIS_SERVER`` environment
        variable is not set.
    """
    from frovedis.exrpc.server import FrovedisServer
    from frovedis.mllib.feature.w2v import Word2Vec as Frovedis_Word2Vec

    # Materialise the corpus once; it is used for both training and transform.
    reviews = list(df["Review"])

    os.environ["VE_OMP_NUM_THREADS"] = '8'
    FrovedisServer.initialize("mpirun -np 1 " + os.environ["FROVEDIS_SERVER"])
    try:
        frovedis_w2v = Frovedis_Word2Vec(sentences=reviews, hiddenSize=512, minCount=2, n_iter=100)
        X_emb = frovedis_w2v.transform(reviews, func=np.mean)
    finally:
        # Release the server and undo the thread override no matter what happened.
        os.environ["VE_OMP_NUM_THREADS"] = '1'
        FrovedisServer.shut_down()
    return pd.DataFrame(X_emb)

In [4]:
def preprocess_data(fname):
    """Load the review CSV, clean it, embed the reviews and split train/test.

    Steps:
      1. Read ``fname`` (ISO-8859-1), drop rows with missing values and
         duplicates, and shuffle with a fixed seed.
      2. Lower-case each review and replace non-alphanumeric characters
         with spaces (``clean_review``).
      3. Encode the "Freshness" label: 'fresh' -> 1, 'rotten' -> 0.
      4. Tokenize each review and drop English stop words.
      5. Build Word2Vec document vectors (``create_w2v_embed``) and split
         them with ``train_test_split(random_state=42)``.

    Parameters
    ----------
    fname : str
        Path of a CSV file with "Review" and "Freshness" columns.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.
    """
    df = pd.read_csv(fname, encoding="ISO-8859-1")
    df = df.dropna().drop_duplicates().sample(frac=1, random_state=42)
    df['Review'] = df['Review'].str.lower().apply(clean_review)

    mapping = {'fresh': 1, 'rotten': 0}
    # Replace on the column itself. Assigning the result of DataFrame.replace
    # (a whole frame) to a single column is an error on current pandas and
    # only worked on older versions because "Freshness" was the first column.
    df['Freshness'] = df['Freshness'].replace(mapping)
    print("Dataset contains {} reviews".format(df.shape[0]))

    # A set gives O(1) membership tests; the NLTK list would be scanned per token.
    stop = set(stopwords.words('english'))
    df['Review'] = df['Review'].apply(lambda x: [item for item in word_tokenize(x) if item not in stop])
    X = create_w2v_embed(df)
    X_train, X_test, y_train, y_test = train_test_split(X, df["Freshness"], random_state=42)
    return X_train, X_test, y_train, y_test

In [5]:
#---- Data Preparation ----
# Path of the manually downloaded Rotten Tomatoes review CSV.
DATA_FILE = "datasets/rt_reviews.csv"
X_train, X_test, y_train, y_test = preprocess_data(DATA_FILE)

# Report the shape of the generated train and test feature matrices.
print("Generated vector for train data are of shape {}".format(X_train.shape),
      "Generated vector for test data are of shape {}".format(X_test.shape),
      sep="\n")

Dataset contains 339716 reviews
Generated vector for train data are of shape (254787, 512)
Generated vector for test data are of shape (84929, 512)


### <font color='green'> 3. Algorithm Evaluation</font>

In [6]:
# Module-level accumulators, one entry per evaluated estimator.
# They are later combined column-wise into the performance summary table.
train_time, test_time = [], []
accuracy, precision, recall, f1 = [], [], [], []
estimator_name = []

In [7]:
def evaluate(estimator, estimator_nm,
             X_train, y_train,
             X_test, y_test):
    """Fit ``estimator``, predict on the test set and record timing and scores.

    On success one entry is appended to each of the module-level lists
    ``estimator_name``, ``train_time``, ``test_time``, ``accuracy``,
    ``precision``, ``recall`` and ``f1``. Precision, recall and F1 are
    macro-averaged. Nothing is appended if fitting, prediction or scoring
    raises, so the lists always keep the same length and can be combined
    into a summary table.

    Parameters
    ----------
    estimator : object
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    estimator_nm : str
        Label stored in ``estimator_name`` for this run.
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.

    Returns
    -------
    str
        The text produced by ``metrics.classification_report``.
    """
    start_time = time.time()
    estimator.fit(X_train, list(y_train))
    fit_seconds = round(time.time() - start_time, 4)

    start_time = time.time()
    pred_y = estimator.predict(X_test)
    predict_seconds = round(time.time() - start_time, 4)

    # Convert once instead of once per metric.
    y_true = list(y_test)
    y_pred = list(pred_y)

    acc = metrics.accuracy_score(y_true, y_pred)
    prec = metrics.precision_score(y_true, y_pred, average='macro')
    rec = metrics.recall_score(y_true, y_pred, average='macro')
    f1_macro = metrics.f1_score(y_true, y_pred, average='macro')
    report = metrics.classification_report(y_true, y_pred)

    # Record results only after every step succeeded; a partial append would
    # leave the accumulator lists with different lengths.
    estimator_name.append(estimator_nm)
    train_time.append(fit_seconds)
    test_time.append(predict_seconds)
    accuracy.append(acc)
    precision.append(prec)
    recall.append(rec)
    f1.append(f1_macro)

    return report

#### 3.1 Kernel SVC

In [8]:
# Demo: SVC
# Trains a kernel SVC with Frovedis and with scikit-learn using the same
# hyper-parameters, then prints both classification reports.
import frovedis
import sklearn
from frovedis.exrpc.server import FrovedisServer
from frovedis.mllib.svm import SVC as frovSVC
from sklearn.svm import SVC as skSVC

target = "SVC"

# Frovedis run: one MPI process, VE_OMP_NUM_THREADS=8.
os.environ["VE_OMP_NUM_THREADS"] = "8"
FrovedisServer.initialize("mpirun -np 1 " + os.environ["FROVEDIS_SERVER"])
try:
    f_est = frovSVC(cache_size=2048, max_iter=10000, gamma=1.0)
    e_nm = target + "_frovedis_" + frovedis.__version__
    f_report = evaluate(f_est, e_nm,
                        X_train, y_train, X_test, y_test)
    f_est.release()
finally:
    # Always undo the thread override and stop the server, even on failure,
    # so later cells can start their own server.
    os.environ["VE_OMP_NUM_THREADS"] = "1"
    FrovedisServer.shut_down()

# scikit-learn run with identical hyper-parameters.
s_est = skSVC(cache_size=2048, max_iter=10000, gamma=1.0)
e_nm = target + "_sklearn_" + sklearn.__version__
s_report = evaluate(s_est, e_nm,
                    X_train, y_train, X_test, y_test)

# Precision, Recall and F1 score for each class
print("Frovedis SVC metrices: ")
print(f_report)
print("Sklearn SVC metrices: ")
print(s_report)



Frovedis SVC metrices: 
              precision    recall  f1-score   support

           0       0.77      0.75      0.76     38123
           1       0.80      0.82      0.81     46806

    accuracy                           0.79     84929
   macro avg       0.79      0.79      0.79     84929
weighted avg       0.79      0.79      0.79     84929

Sklearn SVC metrices: 
              precision    recall  f1-score   support

           0       0.57      0.78      0.66     38123
           1       0.75      0.53      0.62     46806

    accuracy                           0.64     84929
   macro avg       0.66      0.65      0.64     84929
weighted avg       0.67      0.64      0.64     84929



#### 3.2 Decision Tree

In [9]:
# Demo: DecisionTreeClassifier
# Trains a decision tree with Frovedis and with scikit-learn using the same
# hyper-parameters, then prints both classification reports.
import frovedis
import sklearn
from frovedis.exrpc.server import FrovedisServer
from frovedis.mllib.tree import DecisionTreeClassifier as frovDecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier as skDecisionTreeClassifier

target = "decision_tree"

# Frovedis run on 8 MPI processes.
FrovedisServer.initialize("mpirun -np 8 " + os.environ["FROVEDIS_SERVER"])
try:
    f_est = frovDecisionTreeClassifier(max_leaf_nodes=2, max_depth=8)
    e_nm = target + "_frovedis_" + frovedis.__version__
    f_report = evaluate(f_est, e_nm,
                        X_train, y_train, X_test, y_test)
    f_est.release()
finally:
    # Always stop the server, even on failure, so later cells can start their own.
    FrovedisServer.shut_down()

# scikit-learn run with identical hyper-parameters.
s_est = skDecisionTreeClassifier(max_leaf_nodes=2, max_depth=8)
e_nm = target + "_sklearn_" + sklearn.__version__
s_report = evaluate(s_est, e_nm,
                    X_train, y_train, X_test, y_test)

# Precision, Recall and F1 score for each class
print("Frovedis Decision Tree metrices: ")
print(f_report)
print("Sklearn Decision Tree metrices: ")
print(s_report)

Frovedis Decision Tree metrices: 
              precision    recall  f1-score   support

           0       0.62      0.51      0.56     38123
           1       0.65      0.74      0.69     46806

    accuracy                           0.64     84929
   macro avg       0.64      0.63      0.63     84929
weighted avg       0.64      0.64      0.63     84929

Sklearn Decision Tree metrices: 
              precision    recall  f1-score   support

           0       0.52      0.57      0.54     38123
           1       0.62      0.58      0.60     46806

    accuracy                           0.57     84929
   macro avg       0.57      0.57      0.57     84929
weighted avg       0.58      0.57      0.58     84929



#### 3.3 Random Forest

In [10]:
# Demo: RandomForestClassifier
# Trains a random forest with Frovedis and with scikit-learn using the same
# hyper-parameters, then prints both classification reports.
import frovedis
import sklearn  # imported here so the cell does not rely on an earlier cell for sklearn.__version__
from frovedis.exrpc.server import FrovedisServer
from frovedis.mllib.ensemble import RandomForestClassifier as frovRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier as skRandomForestClassifier

target = "random_forest"

# Frovedis run on 8 MPI processes.
FrovedisServer.initialize("mpirun -np 8 " + os.environ["FROVEDIS_SERVER"])
try:
    f_est = frovRandomForestClassifier(n_estimators=200, criterion='entropy', max_features=0.5, max_depth=10,
                                       min_samples_split=5, min_samples_leaf=2)
    e_nm = target + "_frovedis_" + frovedis.__version__
    f_report = evaluate(f_est, e_nm,
                        X_train, y_train, X_test, y_test)
    f_est.release()
finally:
    # Always stop the server, even on failure, so later cells can start their own.
    FrovedisServer.shut_down()

# scikit-learn run with identical hyper-parameters, parallelised over 12 jobs.
s_est = skRandomForestClassifier(n_estimators=200, criterion='entropy', max_features=0.5, max_depth=10,
                                 min_samples_split=5, min_samples_leaf=2, n_jobs=12)
e_nm = target + "_sklearn_" + sklearn.__version__
s_report = evaluate(s_est, e_nm,
                    X_train, y_train, X_test, y_test)

# Precision, Recall and F1 score for each class
print("Frovedis Random Forest metrices: ")
print(f_report)
print("Sklearn Random Forest metrices: ")
print(s_report)

Frovedis Random Forest metrices: 
              precision    recall  f1-score   support

           0       0.70      0.54      0.61     38123
           1       0.69      0.81      0.74     46806

    accuracy                           0.69     84929
   macro avg       0.69      0.68      0.68     84929
weighted avg       0.69      0.69      0.69     84929

Sklearn Random Forest metrices: 
              precision    recall  f1-score   support

           0       0.70      0.57      0.62     38123
           1       0.69      0.80      0.74     46806

    accuracy                           0.69     84929
   macro avg       0.70      0.68      0.68     84929
weighted avg       0.70      0.69      0.69     84929



#### 3.4 Gradient Boosting Tree

In [11]:
# Demo: GradientBoostingClassifier
# Trains gradient-boosted trees with Frovedis and with scikit-learn using the
# same hyper-parameters, then prints both classification reports.
import frovedis  # imported here so the cell does not rely on an earlier cell
import sklearn   # imported here so the cell does not rely on an earlier cell
from frovedis.exrpc.server import FrovedisServer
from frovedis.mllib.ensemble.gbtree import GradientBoostingClassifier as frovGBC
from sklearn.ensemble import GradientBoostingClassifier as skGBC

target = "gradient_boosting"

# Frovedis run on 8 MPI processes.
FrovedisServer.initialize("mpirun -np 8 " + os.environ["FROVEDIS_SERVER"])
try:
    f_est = frovGBC(n_estimators=100)
    e_nm = target + "_frovedis_" + frovedis.__version__
    f_report = evaluate(f_est, e_nm,
                        X_train, y_train, X_test, y_test)
    f_est.release()
finally:
    # Always stop the server, even on failure, so later cells can start their own.
    FrovedisServer.shut_down()

# scikit-learn run with identical hyper-parameters.
s_est = skGBC(n_estimators=100)
e_nm = target + "_sklearn_" + sklearn.__version__
s_report = evaluate(s_est, e_nm,
                    X_train, y_train, X_test, y_test)

# Precision, Recall and F1 score for each class
print("Frovedis Gradient Boosting metrices: ")
print(f_report)
print("Sklearn Gradient Boosting metrices: ")
print(s_report)

Frovedis Gradient Boosting metrices: 
              precision    recall  f1-score   support

           0       0.72      0.64      0.68     38123
           1       0.73      0.80      0.76     46806

    accuracy                           0.73     84929
   macro avg       0.73      0.72      0.72     84929
weighted avg       0.73      0.73      0.73     84929

Sklearn Gradient Boosting metrices: 
              precision    recall  f1-score   support

           0       0.73      0.62      0.67     38123
           1       0.72      0.81      0.77     46806

    accuracy                           0.73     84929
   macro avg       0.73      0.72      0.72     84929
weighted avg       0.73      0.73      0.72     84929



#### 3.5 Linear SVC

In [12]:
# Demo: Linear SVC
# Trains a linear SVC with Frovedis and with scikit-learn using the same
# hyper-parameters, then prints both classification reports.
import frovedis  # imported here so the cell does not rely on an earlier cell
import sklearn   # imported here so the cell does not rely on an earlier cell
from frovedis.exrpc.server import FrovedisServer
from frovedis.mllib.svm import LinearSVC as frovSVC
from sklearn.svm import LinearSVC as skSVC

target = "Linear_SVC"

# Frovedis run on 8 MPI processes.
FrovedisServer.initialize("mpirun -np 8 " + os.environ["FROVEDIS_SERVER"])
try:
    f_est = frovSVC(loss='hinge', max_iter = 60000)
    e_nm = target + "_frovedis_" + frovedis.__version__
    f_report = evaluate(f_est, e_nm,
                        X_train, y_train, X_test, y_test)
    f_est.release()
finally:
    # Always stop the server, even on failure, so later cells can start their own.
    FrovedisServer.shut_down()

# scikit-learn run with identical hyper-parameters.
s_est = skSVC(loss='hinge', max_iter = 60000)
e_nm = target + "_sklearn_" + sklearn.__version__
s_report = evaluate(s_est, e_nm,
                    X_train, y_train, X_test, y_test)

# Precision, Recall and F1 score for each class
print("Frovedis Linear SVC metrices: ")
print(f_report)
print("Sklearn Linear SVC metrices: ")
print(s_report)

Frovedis Linear SVC metrices: 
              precision    recall  f1-score   support

           0       0.62      0.93      0.74     38123
           1       0.90      0.53      0.67     46806

    accuracy                           0.71     84929
   macro avg       0.76      0.73      0.70     84929
weighted avg       0.77      0.71      0.70     84929

Sklearn Linear SVC metrices: 
              precision    recall  f1-score   support

           0       0.76      0.73      0.74     38123
           1       0.79      0.82      0.80     46806

    accuracy                           0.78     84929
   macro avg       0.77      0.77      0.77     84929
weighted avg       0.78      0.78      0.78     84929



#### 3.6 Bernoulli Naive Bayes

In [13]:
# Demo: Bernoulli Naive Bayes
# Trains Bernoulli naive Bayes with Frovedis and with scikit-learn using the
# same hyper-parameters, then prints both classification reports.
import frovedis  # imported here so the cell does not rely on an earlier cell
import sklearn   # imported here so the cell does not rely on an earlier cell
from frovedis.exrpc.server import FrovedisServer
from frovedis.mllib.naive_bayes import BernoulliNB as frovNB
from sklearn.naive_bayes import BernoulliNB as skNB

target = "bernoulli_naive_bayes"

# Frovedis run on 8 MPI processes.
FrovedisServer.initialize("mpirun -np 8 " + os.environ["FROVEDIS_SERVER"])
try:
    f_est = frovNB(alpha=1.0)
    e_nm = target + "_frovedis_" + frovedis.__version__
    f_report = evaluate(f_est, e_nm,
                        X_train, y_train, X_test, y_test)
    f_est.release()
finally:
    # Always stop the server, even on failure, so later cells can start their own.
    FrovedisServer.shut_down()

# scikit-learn run with identical hyper-parameters.
s_est = skNB(alpha=1.0)
e_nm = target + "_sklearn_" + sklearn.__version__
s_report = evaluate(s_est, e_nm,
                    X_train, y_train, X_test, y_test)

# Precision, Recall and F1 score for each class
print("Frovedis Bernoulli Naive Bayes metrices: ")
print(f_report)
print("Sklearn Bernoulli Naive Bayes metrices: ")
print(s_report)

Frovedis Bernoulli Naive Bayes metrices: 
              precision    recall  f1-score   support

           0       0.69      0.71      0.70     38123
           1       0.76      0.74      0.75     46806

    accuracy                           0.73     84929
   macro avg       0.72      0.72      0.72     84929
weighted avg       0.73      0.73      0.73     84929

Sklearn Bernoulli Naive Bayes metrices: 
              precision    recall  f1-score   support

           0       0.69      0.71      0.70     38123
           1       0.76      0.74      0.75     46806

    accuracy                           0.73     84929
   macro avg       0.72      0.72      0.72     84929
weighted avg       0.73      0.73      0.73     84929



#### 3.7 Nearest Neighbor Classification

In [14]:
# Demo: Nearest Neighbor Classification
# Trains a brute-force k-NN classifier with Frovedis and with scikit-learn
# using the same hyper-parameters, then prints both classification reports.
import frovedis  # imported here so the cell does not rely on an earlier cell
import sklearn   # imported here so the cell does not rely on an earlier cell
from frovedis.exrpc.server import FrovedisServer
from frovedis.mllib.neighbors import KNeighborsClassifier as frovKNC
from sklearn.neighbors import KNeighborsClassifier as skKNC

target = "nearest_neighbor_classification"

# Only the first 50k samples of each split are used, in order to avoid
# memory issues on the Frovedis server. The same subset is given to
# scikit-learn so the two runs stay comparable.
KNN_SAMPLE_LIMIT = 50000

# Frovedis run on 8 MPI processes.
FrovedisServer.initialize("mpirun -np 8 " + os.environ["FROVEDIS_SERVER"])
try:
    f_est = frovKNC(n_neighbors=3, algorithm='brute', metric='euclidean')
    e_nm = target + "_frovedis_" + frovedis.__version__
    f_report = evaluate(f_est, e_nm,
                        X_train[:KNN_SAMPLE_LIMIT], y_train[:KNN_SAMPLE_LIMIT],
                        X_test[:KNN_SAMPLE_LIMIT], y_test[:KNN_SAMPLE_LIMIT])
    f_est.release()
finally:
    # Always stop the server, even on failure, so later cells can start their own.
    FrovedisServer.shut_down()

# scikit-learn run with identical hyper-parameters, parallelised over 12 jobs.
s_est = skKNC(n_neighbors=3, algorithm='brute', metric='euclidean', n_jobs=12)
e_nm = target + "_sklearn_" + sklearn.__version__
s_report = evaluate(s_est, e_nm,
                    X_train[:KNN_SAMPLE_LIMIT], y_train[:KNN_SAMPLE_LIMIT],
                    X_test[:KNN_SAMPLE_LIMIT], y_test[:KNN_SAMPLE_LIMIT])

# Precision, Recall and F1 score for each class
print("Frovedis KNeighborsClassifier metrices: ")
print(f_report)
print("Sklearn KNeighborsClassifier metrices: ")
print(s_report)

Frovedis KNeighborsClassifier metrices: 
              precision    recall  f1-score   support

           0       0.61      0.55      0.58     22312
           1       0.66      0.71      0.69     27688

    accuracy                           0.64     50000
   macro avg       0.63      0.63      0.63     50000
weighted avg       0.64      0.64      0.64     50000

Sklearn KNeighborsClassifier metrices: 
              precision    recall  f1-score   support

           0       0.61      0.55      0.58     22312
           1       0.66      0.71      0.69     27688

    accuracy                           0.64     50000
   macro avg       0.63      0.63      0.63     50000
weighted avg       0.64      0.64      0.64     50000



#### 3.8 SGDClassifier

In [15]:
# Demo: SGDClassifier
# Trains an SGD classifier with Frovedis and with scikit-learn using the same
# hyper-parameters, then prints both classification reports.
import frovedis  # imported here so the cell does not rely on an earlier cell
import sklearn   # imported here so the cell does not rely on an earlier cell
from frovedis.exrpc.server import FrovedisServer
from frovedis.mllib.linear_model import SGDClassifier as frovSGD
from sklearn.linear_model import SGDClassifier as skSGD

target = "SGDClassifier"

# Frovedis run on 8 MPI processes.
FrovedisServer.initialize("mpirun -np 8 " + os.environ["FROVEDIS_SERVER"])
try:
    f_est = frovSGD(learning_rate="invscaling", eta0=1.0)
    e_nm = target + "_frovedis_" + frovedis.__version__
    f_report = evaluate(f_est, e_nm,
                        X_train, y_train, X_test, y_test)
    f_est.release()
finally:
    # Always stop the server, even on failure, so later cells can start their own.
    FrovedisServer.shut_down()

# scikit-learn run with identical hyper-parameters (n_jobs=12).
s_est = skSGD(learning_rate="invscaling", eta0=1.0, n_jobs=12)
e_nm = target + "_sklearn_" + sklearn.__version__
s_report = evaluate(s_est, e_nm,
                    X_train, y_train, X_test, y_test)

# Precision, Recall and F1 score for each class
print("Frovedis SGDClassifier metrices: ")
print(f_report)
print("Sklearn SGDClassifier metrices: ")
print(s_report)

Frovedis SGDClassifier metrices: 
              precision    recall  f1-score   support

           0       0.60      0.94      0.73     38123
           1       0.91      0.48      0.63     46806

    accuracy                           0.69     84929
   macro avg       0.75      0.71      0.68     84929
weighted avg       0.77      0.69      0.68     84929

Sklearn SGDClassifier metrices: 
              precision    recall  f1-score   support

           0       0.77      0.72      0.74     38123
           1       0.78      0.82      0.80     46806

    accuracy                           0.78     84929
   macro avg       0.78      0.77      0.77     84929
weighted avg       0.78      0.78      0.78     84929



### <font color='green'> 4. Performance summary</font>

In [16]:
# ---- evaluation summary ----
# One row per evaluated estimator; columns appear in the order listed here.
summary = pd.DataFrame(OrderedDict([
    ("estimator", estimator_name),
    ("train time", train_time),
    ("test time", test_time),
    ("accuracy", accuracy),
    ("precision", precision),
    ("recall", recall),
    ("f1-score", f1),
]))
summary

Unnamed: 0,estimator,train time,test time,accuracy,precision,recall,f1-score
0,SVC_frovedis_0.9.10,941.4206,315.0987,0.788612,0.786607,0.785453,0.785965
1,SVC_sklearn_0.24.1,2078.4573,947.4081,0.640041,0.659135,0.652947,0.638762
2,decision_tree_frovedis_0.9.10,2.8202,0.2429,0.639499,0.635342,0.627583,0.627328
3,decision_tree_sklearn_0.24.1,34.3698,0.1334,0.574303,0.572875,0.573583,0.572458
4,random_forest_frovedis_0.9.10,774.0633,24.7382,0.692331,0.694913,0.678575,0.678982
5,random_forest_sklearn_0.24.1,1616.4136,0.6313,0.694816,0.695298,0.68288,0.683811
6,gradient_boosting_frovedis_0.9.10,46.0937,2.9796,0.727419,0.726082,0.719701,0.721224
7,gradient_boosting_sklearn_0.24.1,4699.1478,0.6603,0.726313,0.726868,0.716526,0.718268
8,Linear_SVC_frovedis_0.9.10,132.3494,0.1865,0.708392,0.758359,0.728768,0.703778
9,Linear_SVC_sklearn_0.24.1,95.436,0.0725,0.776566,0.774965,0.771943,0.773069
