# **Sources**
* IMDB Dataset: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

# **Data**

In [None]:
# Mount Google Drive inside the Colab runtime so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

# Location of the IMDB reviews CSV on the mounted Drive (a local file path, despite the name `url`).
url = "/content/drive/MyDrive/Temp/Datasets/IMDB Dataset.csv"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Read the reviews and discard exact duplicate rows in a single chained expression.
df = pd.read_csv(url).drop_duplicates()

# Preview the first rows.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Preparing Y

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string sentiment labels of `df` as integer class ids.
le = LabelEncoder()
Y = le.fit_transform(df.sentiment)
# Label strings known to the encoder; later code indexes this with a predicted id to print its name.
CLASSES = le.classes_

## Preparing X

### Preprocessing

In [None]:
!pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import nltk
# Fetch the NLTK 'stopwords' and 'punkt' packages; presumably these back the
# stop-word and tokenizer imports used further down — TODO confirm 'punkt' is still needed.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import emoji
from nltk.tokenize import word_tokenize

import string

# Characters removed by the punctuation-stripping step of preprocess().
punc = string.punctuation

# Chat/forum abbreviations mapped to their expansions.
# Every key is upper-case because preprocess() looks tokens up with word.upper();
# a lower-case key could never be matched by that lookup.
abbv = {
    "AFAIK": "as far as I know",
    "IMO": "in my opinion",
    "IMHO": "in my humble opinion",
    "LGTM": "looks good to me",
    "AKA": "also known as",
    "ASAP": "as soon as possible",
    "BTW": "by the way",
    "FAQ": "frequently asked questions",
    "DIY": "do it yourself",
    "DM": "direct message",
    "FYI": "for your information",
    "IC": "i see",
    "IOW": "in other words",
    "IIRC": "If I Remember Correctly",
    "ICYMI": "In case you missed it",
    "CUZ": "because",
    "COS": "because",
    "NV": "nevermind",
    "PLZ": "please",
}

from nltk.corpus import stopwords
# NOTE(review): the returned stop-word list is neither stored nor used here, and it is not
# the cell's last expression, so nothing is displayed — confirm whether this call is still needed.
stopwords.words('english')

import re

# Anything between '<' and the next '>' (non-greedy).
html_pattern = re.compile('<.*?>')
# http(s):// links and bare www. links, up to the next whitespace.
urls_pattern = re.compile(r'https?://\S+|www\.\S+')

# Unicode ranges that make up the emoji character class.
_EMOJI_RANGES = (
    "\U0001F600-\U0001F64F",  # emoticons
    "\U0001F300-\U0001F5FF",  # symbols & pictographs
    "\U0001F680-\U0001F6FF",  # transport & map symbols
    "\U0001F1E0-\U0001F1FF",  # flags (iOS)
)
# One or more consecutive characters from any of the ranges above.
emoji_pattern = re.compile("[" + "".join(_EMOJI_RANGES) + "]+", flags=re.UNICODE)


from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def preprocess(text):
    """Normalise a raw review into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, strip HTML tags, strip URLs, strip ASCII
    punctuation, convert emojis to text and drop any that remain, expand
    known abbreviations, Porter-stem every token.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.

    Notes
    -----
    NOTE(review): if the cached cleaned reviews were produced by an earlier
    version of this function, regenerate them so that training data and
    realtime inputs are preprocessed identically.
    """
    # Lowercase
    text = text.lower()

    # HTML tags: substitute a space rather than nothing, otherwise text such as
    # "great.<br /><br />the" collapses into the single token "greatthe".
    text = html_pattern.sub(' ', text)

    # urls
    text = urls_pattern.sub(r'', text)

    # punctuations
    text = text.translate(str.maketrans("", "", punc))

    # Emojis: turn emoji characters into their textual form, then remove any
    # emoji in the ranges of emoji_pattern that is still present.
    text = emoji.demojize(text)
    text = emoji_pattern.sub(r'', text)

    new_text = []

    # split() with no argument splits on any whitespace run (spaces, tabs,
    # newlines) and never yields empty tokens.
    for word in text.split():

        # abbreviations: the text is lower-case here, so try the upper-cased
        # token first and fall back to the token itself for lower-case keys.
        expansion = abbv.get(word.upper(), abbv.get(word, word))

        # Stemming: an expansion can be several words; stem each one on its own
        # instead of passing the whole phrase to the stemmer as a single token.
        for part in expansion.lower().split():
            new_text.append(ps.stem(part))

    return " ".join(new_text)

preprocess("This is the best movie I have ever watched")

'thi is the best movi i have ever watch'

In [None]:
# from tqdm import tqdm

# # cleaned = df.review.apply(preprocess)

# cleaned = []
# for i in tqdm(df.review):
#     cleaned.append(preprocess(i))

In [None]:
import json

# WRITING (disabled): dumps the preprocessed reviews so the slow preprocessing pass need not be repeated.
# with open("/content/drive/MyDrive/Temp/dumps/cleaned_reviews1.json", 'w') as f:
#     json.dump(cleaned, f)

# READING: load the cached preprocessed reviews from Drive.
# NOTE(review): presumably one entry per row of `df`, in the same order as `Y` — verify when the cache is regenerated.
with open("/content/drive/MyDrive/Temp/dumps/cleaned_reviews1.json", 'rb') as f:
    cleaned = json.load(f)

### Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing, stratified on the labels, with a fixed seed.
split_kwargs = {"test_size": 0.2, "random_state": 42, "stratify": Y}
cleaned_train, cleaned_test, Y_train, Y_test = train_test_split(cleaned, Y, **split_kwargs)

# **Hyperparameters**

In [None]:
# Vocabulary size cap passed as `max_features` to every vectorizer.
MAX_FEATURES = 5000
# n-gram range used by the two unigram encoders ('TfidfVectorizer' and 'CountVectorizer').
NGRAM_RANGE = (1, 1)

# **Encoders**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def _make_encoder(vectorizer_cls, ngram_range, **extra):
    """Create a vectorizer with the settings shared by every encoder.

    Parameters
    ----------
    vectorizer_cls : class
        Vectorizer class to instantiate (TfidfVectorizer or CountVectorizer).
    ngram_range : tuple of (int, int)
        Passed through as ``ngram_range``.
    **extra
        Additional keyword arguments for ``vectorizer_cls``
        (e.g. ``sublinear_tf=True`` for the TF-IDF encoders).

    Returns
    -------
    An unfitted ``vectorizer_cls`` instance.
    """
    # NOTE(review): the text fed to these encoders is already stemmed by preprocess()
    # (e.g. "this" becomes "thi"), so some words of the built-in English stop list
    # may no longer match — confirm this is intended.
    return vectorizer_cls(
        lowercase=True,
        stop_words='english',
        max_features=MAX_FEATURES,
        binary=False,
        ngram_range=ngram_range,
        **extra,
    )


# Encoder name -> unfitted vectorizer. Insertion order is kept because later
# cells iterate over this dict.
encoders = {
    # TF-IDF weighted features
    'TfidfVectorizer': _make_encoder(TfidfVectorizer, NGRAM_RANGE, sublinear_tf=True),
    'BiGram_tfidf': _make_encoder(TfidfVectorizer, (2, 2), sublinear_tf=True),
    'UniBiGram_tfidf': _make_encoder(TfidfVectorizer, (1, 2), sublinear_tf=True),
    # Raw term counts
    'CountVectorizer': _make_encoder(CountVectorizer, NGRAM_RANGE),
    'BiGram': _make_encoder(CountVectorizer, (2, 2)),
    'UniBiGram': _make_encoder(CountVectorizer, (1, 2)),
}

In [None]:
# TAKES TOO MUCH TIME
# Disabled Word2Vec experiment: the code below sits inside a string literal, so none of it runs.
# NOTE(review): `size=` and `model.wv.index2word` look like gensim 3.x names — confirm against
# the installed gensim before re-enabling.
"""

    import gensim
    from gensim.utils import simple_preprocess
    from nltk import sent_tokenize
    from nltk.corpus import stopwords
    from tqdm import tqdm

    sw_list = stopwords.words('english')

    story = []
    for doc in tqdm(cleaned_train):
        raw_sent = sent_tokenize(doc)
        for sent in raw_sent:
            sent = " ".join([i for i in sent.split() if i not in sw_list])
            story.append(simple_preprocess(sent))

    model = gensim.models.Word2Vec(
        window=10,
        min_count=2,
        size=MAX_FEATURES,
    )
    model.build_vocab(story)
    model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

    import numpy as np
    def document_vector(doc):
        return np.mean(
            [model.wv[i] for i in doc.split() if i in model.wv.index2word],
            axis=0
        )

    X_train = []
    for doc in tqdm(cleaned_train):
        X_train.append(document_vector(doc))

    X_test = []
    for doc in tqdm(cleaned_test):
        X_test.append(document_vector(doc))

"""
# An empty string as the last expression, so the cell displays '' instead of the block above.
""

''

# **Models**

In [None]:
from functools import partial

from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Model name -> zero-argument factory. A fresh estimator is created from each
# factory for every encoder, so no fitted state is shared between encoders.
models = {
    'GaussianNB': GaussianNB,
    'BernoulliNB': BernoulliNB,
    'MultinomialNB': MultinomialNB,
    # Seeded so the forest is reproducible across runs; 42 is the same seed
    # the train/test split uses.
    'RandomForestClassifier': partial(RandomForestClassifier, random_state=42),
}

# **Pipeline**

In [None]:
# Show the names of all registered encoders and models.
encoders.keys(), models.keys()

(dict_keys(['TfidfVectorizer', 'BiGram_tfidf', 'UniBiGram_tfidf', 'CountVectorizer', 'BiGram', 'UniBiGram']),
 dict_keys(['GaussianNB', 'BernoulliNB', 'MultinomialNB', 'RandomForestClassifier']))

In [None]:
# MODELS TO SKIP
# Encoder name -> model names that are not trained on that encoder's features.
_RF_EXCLUDED_ENCODERS = ('BiGram_tfidf', 'UniBiGram_tfidf', 'BiGram', 'UniBiGram')
to_skip = {
    encoder_name: ['RandomForestClassifier']
    for encoder_name in _RF_EXCLUDED_ENCODERS
}

In [None]:
# Encoder name -> {"encoder": vectorizer, "models": {model name: new estimator}}.
pipeline = {}
for encoder_name, encoder in encoders.items():
    # Model names excluded for this encoder (none when the encoder is not listed).
    skipped = to_skip.get(encoder_name, [])

    selected_models = {}
    for model_name, model in models.items():
        if model_name not in skipped:
            # Calling the entry creates a fresh estimator for this encoder.
            selected_models[model_name] = model()

    pipeline[encoder_name] = {
        "encoder": encoder,
        "models": selected_models
    }

In [None]:
# Display the assembled encoder/model pipeline.
pipeline

{'TfidfVectorizer': {'encoder': TfidfVectorizer(max_features=5000, stop_words='english', sublinear_tf=True),
  'models': {'GaussianNB': GaussianNB(),
   'BernoulliNB': BernoulliNB(),
   'MultinomialNB': MultinomialNB(),
   'RandomForestClassifier': RandomForestClassifier()}},
 'BiGram_tfidf': {'encoder': TfidfVectorizer(max_features=5000, ngram_range=(2, 2), stop_words='english',
                  sublinear_tf=True),
  'models': {'GaussianNB': GaussianNB(),
   'BernoulliNB': BernoulliNB(),
   'MultinomialNB': MultinomialNB()}},
 'UniBiGram_tfidf': {'encoder': TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english',
                  sublinear_tf=True),
  'models': {'GaussianNB': GaussianNB(),
   'BernoulliNB': BernoulliNB(),
   'MultinomialNB': MultinomialNB()}},
 'CountVectorizer': {'encoder': CountVectorizer(max_features=5000, stop_words='english'),
  'models': {'GaussianNB': GaussianNB(),
   'BernoulliNB': BernoulliNB(),
   'MultinomialNB': MultinomialNB(),
   'R

# **Training On Data**

In [None]:
import os
import joblib
import time

# Absolute path, like the dataset and cache paths used earlier in the notebook,
# so saving does not depend on the current working directory.
save_path = "/content/drive/MyDrive/Temp/dumps/"

# The fitting cells dump their pickles into "<save_path>encoders/"; create it
# up front so the dumps do not fail on a missing directory.
os.makedirs(f"{save_path}encoders", exist_ok=True)

pipeline

{'TfidfVectorizer': {'encoder': TfidfVectorizer(max_features=5000, stop_words='english', sublinear_tf=True),
  'models': {'GaussianNB': GaussianNB(),
   'BernoulliNB': BernoulliNB(),
   'MultinomialNB': MultinomialNB(),
   'RandomForestClassifier': RandomForestClassifier()}},
 'BiGram_tfidf': {'encoder': TfidfVectorizer(max_features=5000, ngram_range=(2, 2), stop_words='english',
                  sublinear_tf=True),
  'models': {'GaussianNB': GaussianNB(),
   'BernoulliNB': BernoulliNB(),
   'MultinomialNB': MultinomialNB()}},
 'UniBiGram_tfidf': {'encoder': TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english',
                  sublinear_tf=True),
  'models': {'GaussianNB': GaussianNB(),
   'BernoulliNB': BernoulliNB(),
   'MultinomialNB': MultinomialNB()}},
 'CountVectorizer': {'encoder': CountVectorizer(max_features=5000, stop_words='english'),
  'models': {'GaussianNB': GaussianNB(),
   'BernoulliNB': BernoulliNB(),
   'MultinomialNB': MultinomialNB(),
   'R

## Fitting the Encoders

In [None]:
# Fit every encoder on the training reviews and persist it next to the other dumps.
for encoder_name, encoder_data in pipeline.items():
    print(f"Fitting on Encoder: '{encoder_name}' ......... ", end="")
    started = time.perf_counter()

    encoder_data['encoder'].fit(cleaned_train)
    joblib.dump(encoder_data['encoder'], f"{save_path}encoders/{encoder_name}.pkl")

    # perf_counter() differences are in seconds, so the elapsed time is labelled "s".
    print(f"Done ({round(time.perf_counter() - started, 3)}s)")

Fitting on Encoder: 'TfidfVectorizer' ......... Done (7.979ms)
Fitting on Encoder: 'BiGram_tfidf' ......... Done (38.287ms)
Fitting on Encoder: 'UniBiGram_tfidf' ......... Done (44.096ms)
Fitting on Encoder: 'CountVectorizer' ......... Done (7.675ms)
Fitting on Encoder: 'BiGram' ......... Done (37.17ms)
Fitting on Encoder: 'UniBiGram' ......... Done (43.285ms)


## Fitting the Models

In [None]:
# Fitting the Models
for encoder_name, encoder_data in pipeline.items():
    print(encoder_name)
    # Encode the training reviews once per encoder; the dense array is shared
    # by every model trained on this encoder.
    X = encoder_data['encoder'].transform(cleaned_train).toarray()
    for model_name, model in encoder_data['models'].items():
        print(f"\t Fitting on '{model_name}' ......... ", end="")
        started = time.perf_counter()

        model.fit(X, Y_train)
        # NOTE(review): fitted models are written into the "encoders/" directory,
        # alongside the encoder pickles — confirm this location is intended.
        joblib.dump(model, f"{save_path}encoders/{encoder_name}_{model_name}.pkl")

        # perf_counter() differences are in seconds, so the elapsed time is labelled "s".
        print(f"Done ({round(time.perf_counter() - started, 3)}s)")


TfidfVectorizer
	 Fitting on 'GaussianNB' ......... Done (3.624ms)
	 Fitting on 'BernoulliNB' ......... Done (2.371ms)
	 Fitting on 'MultinomialNB' ......... Done (0.866ms)
	 Fitting on 'RandomForestClassifier' ......... Done (151.17ms)
BiGram_tfidf
	 Fitting on 'GaussianNB' ......... Done (3.193ms)
	 Fitting on 'BernoulliNB' ......... Done (2.72ms)
	 Fitting on 'MultinomialNB' ......... Done (0.878ms)
UniBiGram_tfidf
	 Fitting on 'GaussianNB' ......... Done (2.567ms)
	 Fitting on 'BernoulliNB' ......... Done (2.273ms)
	 Fitting on 'MultinomialNB' ......... Done (0.84ms)
CountVectorizer
	 Fitting on 'GaussianNB' ......... Done (3.609ms)
	 Fitting on 'BernoulliNB' ......... Done (12.908ms)
	 Fitting on 'MultinomialNB' ......... Done (6.98ms)
	 Fitting on 'RandomForestClassifier' ......... Done (122.544ms)
BiGram
	 Fitting on 'GaussianNB' ......... Done (3.565ms)
	 Fitting on 'BernoulliNB' ......... Done (7.952ms)
	 Fitting on 'MultinomialNB' ......... Done (8.754ms)
UniBiGram
	 Fitting 

# **Testing The Data**

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# "<encoder> - <model>" -> accuracy, confusion matrix and text report for both splits.
metrics = {}
for encoder_name, encoder_data in pipeline.items():
    print(encoder_name)
    encoder = encoder_data['encoder']
    # Encode both splits once per encoder; all models of this encoder reuse them.
    X_train, X_test = (
        encoder.transform(docs).toarray() for docs in (cleaned_train, cleaned_test)
    )
    for model_name, model in encoder_data['models'].items():
        print(f"\t Predicting on '{model_name}' ......... ", end="")

        pred_train, pred_test = model.predict(X_train), model.predict(X_test)

        scores = {}
        scores["train_accuracy"] = accuracy_score(Y_train, pred_train)
        scores["train_confusion_matrix"] = confusion_matrix(Y_train, pred_train)
        scores["train_classification_report"] = classification_report(Y_train, pred_train, zero_division=0)
        scores["test_accuracy"] = accuracy_score(Y_test, pred_test)
        scores["test_confusion_matrix"] = confusion_matrix(Y_test, pred_test)
        scores["test_classification_report"] = classification_report(Y_test, pred_test, zero_division=0)
        metrics[f"{encoder_name} - {model_name}"] = scores

        print("Done")

TfidfVectorizer
	 Predicting on 'GaussianNB' ......... Done
	 Predicting on 'BernoulliNB' ......... Done
	 Predicting on 'MultinomialNB' ......... Done
	 Predicting on 'RandomForestClassifier' ......... Done
BiGram_tfidf
	 Predicting on 'GaussianNB' ......... Done
	 Predicting on 'BernoulliNB' ......... Done
	 Predicting on 'MultinomialNB' ......... Done
UniBiGram_tfidf
	 Predicting on 'GaussianNB' ......... Done
	 Predicting on 'BernoulliNB' ......... Done
	 Predicting on 'MultinomialNB' ......... Done
CountVectorizer
	 Predicting on 'GaussianNB' ......... Done
	 Predicting on 'BernoulliNB' ......... Done
	 Predicting on 'MultinomialNB' ......... Done
	 Predicting on 'RandomForestClassifier' ......... Done
BiGram
	 Predicting on 'GaussianNB' ......... Done
	 Predicting on 'BernoulliNB' ......... Done
	 Predicting on 'MultinomialNB' ......... Done
UniBiGram
	 Predicting on 'GaussianNB' ......... Done
	 Predicting on 'BernoulliNB' ......... Done
	 Predicting on 'MultinomialNB' .........

# **Preview**

## Accuracy

In [None]:
# Track the highest train and test accuracy seen so far and which entry produced it.
best_train, best_train_model = 0, None
best_test, best_test_model = 0, None
for key, val in metrics.items():
    train_accuracy, test_accuracy = val['train_accuracy'], val['test_accuracy']

    # Strict comparison: on a tie the entry seen first is kept.
    if train_accuracy > best_train:
        best_train, best_train_model = train_accuracy, key
    if test_accuracy > best_test:
        best_test, best_test_model = test_accuracy, key

print("Best Train Model :", best_train_model, best_train)
print("Best Test Model  :", best_test_model, best_test)

Best Train Model : TfidfVectorizer - RandomForestClassifier 1.0
Best Test Model  : UniBiGram_tfidf - MultinomialNB 0.8573157204799838


In [None]:
# Train and test accuracy of every encoder/model combination.
for key, val in metrics.items():
    print(key)
    print(f"\t Train Accuracy:  {val['train_accuracy']!s}")
    print(f"\t Test Accuracy:   {val['test_accuracy']!s}")

TfidfVectorizer - GaussianNB
	 Train Accuracy:  0.8204462372368587
	 Test Accuracy:   0.7990319653120903
TfidfVectorizer - BernoulliNB
	 Train Accuracy:  0.8509517206605319
	 Test Accuracy:   0.845921145507714
TfidfVectorizer - MultinomialNB
	 Train Accuracy:  0.8591201310979453
	 Test Accuracy:   0.8544922859735807
TfidfVectorizer - RandomForestClassifier
	 Train Accuracy:  1.0
	 Test Accuracy:   0.8425935262680246
BiGram_tfidf - GaussianNB
	 Train Accuracy:  0.8315139291566872
	 Test Accuracy:   0.8002420086719774
BiGram_tfidf - BernoulliNB
	 Train Accuracy:  0.8226143955628388
	 Test Accuracy:   0.8085106382978723
BiGram_tfidf - MultinomialNB
	 Train Accuracy:  0.8423295096432624
	 Test Accuracy:   0.8249470606030049
UniBiGram_tfidf - GaussianNB
	 Train Accuracy:  0.8504979200806757
	 Test Accuracy:   0.8353332661087022
UniBiGram_tfidf - BernoulliNB
	 Train Accuracy:  0.8567502836253624
	 Test Accuracy:   0.8483412322274881
UniBiGram_tfidf - MultinomialNB
	 Train Accuracy:  0.863481

## Confusion Matrix

In [None]:
# Train and test confusion matrix of every encoder/model combination.
for key, val in metrics.items():
    print(f" ------ {key} ------ ")
    print(f"Train:\n {val['train_confusion_matrix']!s}")
    print(f"Test: \n {val['test_confusion_matrix']!s}")

 ------ TfidfVectorizer - GaussianNB ------ 
Train:
 [[16463  3295]
 [ 3827 16080]]
Test: 
 [[4025  915]
 [1078 3899]]
 ------ TfidfVectorizer - BernoulliNB ------ 
Train:
 [[16954  2804]
 [ 3108 16799]]
Test: 
 [[4202  738]
 [ 790 4187]]
 ------ TfidfVectorizer - MultinomialNB ------ 
Train:
 [[16838  2920]
 [ 2668 17239]]
Test: 
 [[4197  743]
 [ 700 4277]]
 ------ TfidfVectorizer - RandomForestClassifier ------ 
Train:
 [[19758     0]
 [    0 19907]]
Test: 
 [[4148  792]
 [ 769 4208]]
 ------ BiGram_tfidf - GaussianNB ------ 
Train:
 [[15999  3759]
 [ 2924 16983]]
Test: 
 [[3844 1096]
 [ 885 4092]]
 ------ BiGram_tfidf - BernoulliNB ------ 
Train:
 [[14923  4835]
 [ 2201 17706]]
Test: 
 [[3635 1305]
 [ 594 4383]]
 ------ BiGram_tfidf - MultinomialNB ------ 
Train:
 [[16202  3556]
 [ 2698 17209]]
Test: 
 [[3964  976]
 [ 760 4217]]
 ------ UniBiGram_tfidf - GaussianNB ------ 
Train:
 [[16622  3136]
 [ 2794 17113]]
Test: 
 [[4102  838]
 [ 795 4182]]
 ------ UniBiGram_tfidf - BernoulliNB

## Classification Report

In [None]:
# Train and test classification report of every encoder/model combination.
for key, val in metrics.items():
    print(f" ------ {key} ------ ")
    for title, report_key in (
        (" -- Train -- ", 'train_classification_report'),
        (" -- Test -- ", 'test_classification_report'),
    ):
        print(title)
        print(val[report_key])

 ------ TfidfVectorizer - GaussianNB ------ 
 -- Train -- 
              precision    recall  f1-score   support

           0       0.81      0.83      0.82     19758
           1       0.83      0.81      0.82     19907

    accuracy                           0.82     39665
   macro avg       0.82      0.82      0.82     39665
weighted avg       0.82      0.82      0.82     39665

 -- Test -- 
              precision    recall  f1-score   support

           0       0.79      0.81      0.80      4940
           1       0.81      0.78      0.80      4977

    accuracy                           0.80      9917
   macro avg       0.80      0.80      0.80      9917
weighted avg       0.80      0.80      0.80      9917

 ------ TfidfVectorizer - BernoulliNB ------ 
 -- Train -- 
              precision    recall  f1-score   support

           0       0.85      0.86      0.85     19758
           1       0.86      0.84      0.85     19907

    accuracy                           0.85     39

## Realtime Testing

In [None]:
def realtime_test(text, orig, show=True, correct_only=False):
    """Run one sentence through every encoder/model pair of `pipeline`.

    Parameters
    ----------
    text : str
        Raw sentence; it is passed through `preprocess` before encoding.
    orig : int
        Expected encoded label, compared against each prediction.
    show : bool, default True
        Print "<predicted class name> - <encoder> - <model>" for each pair.
    correct_only : bool, default False
        When printing, leave out the pairs whose prediction differs from `orig`.

    Returns
    -------
    list of str or None
        With ``show=False``, the "<encoder> - <model>" labels that predicted
        `orig`; with ``show=True`` nothing is returned.
    """
    sample = [preprocess(text)]
    matched = []
    for encoder_name, encoder_data in pipeline.items():
        features = encoder_data['encoder'].transform(sample).toarray()
        for model_name, model in encoder_data['models'].items():
            pred = model.predict(features)[0]
            model_label = f"{encoder_name} - {model_name}"
            is_correct = pred == orig
            if is_correct:
                matched.append(model_label)
            # Correct predictions are always shown; wrong ones only when not filtered out.
            if show and (is_correct or not correct_only):
                print(f"{CLASSES[pred]} - {model_label}")
    return None if show else matched

def test_many(texts, origs):
    corrects = []
    for text, orig in zip(texts, origs):
        correct = realtime_test(text, orig, show=False)
        corrects.append(correct)
    common = set(corrects[0])
    for correct in corrects:
        common = set(correct) & common
    
    print("All sentences correctly predicted by these models:")
    for i in common:
        print(".", i, round(metrics[i]['test_accuracy'], 2))

In [None]:
# Print the prediction of every encoder/model pair for one sentence.
text = "Greatest Movie of all time"
orig = 1  # expected encoded label

realtime_test(text, orig)

positive - TfidfVectorizer - GaussianNB
positive - TfidfVectorizer - BernoulliNB
positive - TfidfVectorizer - MultinomialNB
positive - TfidfVectorizer - RandomForestClassifier
positive - BiGram_tfidf - GaussianNB
positive - BiGram_tfidf - BernoulliNB
positive - BiGram_tfidf - MultinomialNB
positive - UniBiGram_tfidf - GaussianNB
positive - UniBiGram_tfidf - BernoulliNB
positive - UniBiGram_tfidf - MultinomialNB
negative - CountVectorizer - GaussianNB
positive - CountVectorizer - BernoulliNB
positive - CountVectorizer - MultinomialNB
positive - CountVectorizer - RandomForestClassifier
positive - BiGram - GaussianNB
positive - BiGram - BernoulliNB
positive - BiGram - MultinomialNB
negative - UniBiGram - GaussianNB
positive - UniBiGram - BernoulliNB
positive - UniBiGram - MultinomialNB


In [None]:
# Same sentence, but print only the pairs whose prediction equals the expected label.
text = "Greatest Movie of all time"
orig = 1  # expected encoded label

realtime_test(text, orig, correct_only=True)

positive - TfidfVectorizer - GaussianNB
positive - TfidfVectorizer - BernoulliNB
positive - TfidfVectorizer - MultinomialNB
positive - TfidfVectorizer - RandomForestClassifier
positive - BiGram_tfidf - GaussianNB
positive - BiGram_tfidf - BernoulliNB
positive - BiGram_tfidf - MultinomialNB
positive - UniBiGram_tfidf - GaussianNB
positive - UniBiGram_tfidf - BernoulliNB
positive - UniBiGram_tfidf - MultinomialNB
positive - CountVectorizer - BernoulliNB
positive - CountVectorizer - MultinomialNB
positive - CountVectorizer - RandomForestClassifier
positive - BiGram - GaussianNB
positive - BiGram - BernoulliNB
positive - BiGram - MultinomialNB
positive - UniBiGram - BernoulliNB
positive - UniBiGram - MultinomialNB


In [None]:
# A sentence with expected label 0; print only the pairs that predict it.
text = "Worst Movie, really hate it"
orig = 0  # expected encoded label

realtime_test(text, orig, correct_only=True)

negative - TfidfVectorizer - GaussianNB
negative - TfidfVectorizer - BernoulliNB
negative - TfidfVectorizer - MultinomialNB
negative - TfidfVectorizer - RandomForestClassifier
negative - BiGram_tfidf - GaussianNB
negative - BiGram_tfidf - BernoulliNB
negative - BiGram_tfidf - MultinomialNB
negative - UniBiGram_tfidf - GaussianNB
negative - UniBiGram_tfidf - BernoulliNB
negative - UniBiGram_tfidf - MultinomialNB
negative - CountVectorizer - GaussianNB
negative - CountVectorizer - BernoulliNB
negative - CountVectorizer - MultinomialNB
negative - CountVectorizer - RandomForestClassifier
negative - BiGram - GaussianNB
negative - BiGram - BernoulliNB
negative - BiGram - MultinomialNB
negative - UniBiGram - GaussianNB
negative - UniBiGram - BernoulliNB
negative - UniBiGram - MultinomialNB


In [None]:
# List the models that predict all four sentences correctly (expected labels given in order).
test_many(
    [
        "amazing movie, really love it",
        "this is going to be my favourite movie of all the time",
        "this movie sucks, I completely hate it",
        "boring movie",
    ],
    [1, 1, 0, 0]
)

All sentences correctly predicted by these models:
. CountVectorizer - MultinomialNB 0.84
. CountVectorizer - RandomForestClassifier 0.84
. BiGram_tfidf - BernoulliNB 0.81
. BiGram_tfidf - GaussianNB 0.8
. UniBiGram_tfidf - GaussianNB 0.84
. BiGram - GaussianNB 0.79
. UniBiGram - MultinomialNB 0.84
. CountVectorizer - BernoulliNB 0.85
. UniBiGram_tfidf - MultinomialNB 0.86
. BiGram - MultinomialNB 0.82
. BiGram_tfidf - MultinomialNB 0.82
. TfidfVectorizer - GaussianNB 0.8
. BiGram - BernoulliNB 0.81
. TfidfVectorizer - RandomForestClassifier 0.84
. TfidfVectorizer - MultinomialNB 0.85
. TfidfVectorizer - BernoulliNB 0.85


In [None]:
# Same check with an added negated phrase ('not bad', expected label 1).
test_many(
    [
        "amazing movie, really love it",
        "this is going to be my favourite movie of all the time",
        "this movie sucks, I completely hate it",
        "boring movie",
        'not bad',
    ],
    [1, 1, 0, 0, 1]
)

All sentences correctly predicted by these models:
. BiGram - GaussianNB 0.79
. BiGram_tfidf - BernoulliNB 0.81
. BiGram_tfidf - GaussianNB 0.8
. BiGram - MultinomialNB 0.82
. BiGram_tfidf - MultinomialNB 0.82
. BiGram - BernoulliNB 0.81
