<a href="https://colab.research.google.com/github/danvitoriano/text-classifiers/blob/main/NBSVW_Classifier_for_IMDB_Review_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset

Review on IMDB Dataset (https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews), already labeled.
Columns:

1.   review
2.   sentiment

Sentiments:

1. positive
2. negative


In [13]:
import pandas as pd
dataset = pd.read_csv('/content/drive/My Drive/IMDB_Dataset.csv')
dataset = dataset.sample(n=20000)
dataset.head()

Unnamed: 0,review,sentiment
417,I like Ghost stories. Good ghost stories of bu...,positive
9836,I was intrigued by the nasty boss character as...,negative
528,"""Night of the Living Homeless"" was a fairly st...",positive
9190,I believe the reason this movie did not get th...,positive
26292,While filming an 80's horror movie called 'Hot...,negative


In [14]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

texts=dataset['review'].to_list()

cv = CountVectorizer()   
cv_fit=cv.fit_transform(texts)    
word_list = cv.get_feature_names()

count_list = cv_fit.toarray().sum(axis=0)  
word_freq = dict(zip(word_list,count_list))
word_freq_sorted = {k: v for k, v in sorted(word_freq.items(), key=lambda item: item[1],reverse=True)}
word_freq_sorted

{'the': 266561,
 'and': 129708,
 'of': 115583,
 'to': 107235,
 'is': 84547,
 'br': 79557,
 'it': 75646,
 'in': 74793,
 'this': 60093,
 'that': 57479,
 'was': 38287,
 'as': 36650,
 'movie': 35157,
 'with': 34780,
 'for': 34771,
 'but': 33292,
 'film': 31424,
 'on': 27147,
 'you': 27116,
 'not': 24227,
 'he': 23496,
 'are': 23449,
 'his': 22775,
 'have': 22305,
 'one': 21559,
 'be': 21490,
 'all': 18734,
 'at': 18535,
 'they': 18199,
 'by': 17764,
 'an': 17164,
 'who': 17093,
 'so': 16223,
 'from': 16126,
 'like': 15791,
 'there': 15041,
 'or': 14175,
 'just': 14028,
 'her': 14003,
 'out': 13733,
 'about': 13655,
 'if': 13623,
 'has': 13268,
 'what': 13000,
 'some': 12387,
 'good': 11910,
 'can': 11397,
 'when': 11238,
 'more': 11197,
 'very': 10881,
 'she': 10739,
 'up': 10456,
 'no': 10097,
 'even': 9971,
 'time': 9953,
 'would': 9920,
 'my': 9844,
 'which': 9370,
 'only': 9295,
 'really': 9214,
 'see': 9159,
 'story': 9082,
 'their': 9064,
 'had': 8823,
 'me': 8533,
 'were': 8499,
 'w

# Stopwords
Refine filter

In [15]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
word_freq_final = [word for word in list(word_freq_sorted.items()) if word[0] not in stop_words]
word_freq_final

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[('br', 79557),
 ('movie', 35157),
 ('film', 31424),
 ('one', 21559),
 ('like', 15791),
 ('good', 11910),
 ('even', 9971),
 ('time', 9953),
 ('would', 9920),
 ('really', 9214),
 ('see', 9159),
 ('story', 9082),
 ('well', 8476),
 ('much', 7782),
 ('bad', 7407),
 ('great', 7312),
 ('people', 7301),
 ('get', 7126),
 ('also', 7057),
 ('first', 6898),
 ('made', 6357),
 ('make', 6351),
 ('could', 6288),
 ('way', 6178),
 ('movies', 6020),
 ('think', 5645),
 ('characters', 5639),
 ('watch', 5617),
 ('films', 5511),
 ('character', 5504),
 ('two', 5397),
 ('many', 5359),
 ('seen', 5296),
 ('plot', 5284),
 ('love', 5243),
 ('life', 5192),
 ('never', 5192),
 ('show', 5137),
 ('acting', 5097),
 ('best', 5009),
 ('know', 4983),
 ('little', 4836),
 ('ever', 4780),
 ('man', 4716),
 ('better', 4460),
 ('end', 4386),
 ('scene', 4386),
 ('still', 4341),
 ('scenes', 4239),
 ('say', 4189),
 ('something', 4058),
 ('go', 3969),
 ('back', 3887),
 ('thing', 3760),
 ('real', 3709),
 ('watching', 3688),
 ('actor

# Word Frequency
List 

In [16]:
zipf_dist = pd.DataFrame(word_freq_final,columns=['word','freq'])
zipf_dist.head(20)

Unnamed: 0,word,freq
0,br,79557
1,movie,35157
2,film,31424
3,one,21559
4,like,15791
5,good,11910
6,even,9971
7,time,9953
8,would,9920
9,really,9214


# NBVSM Classifier

In [17]:
#import ktrain and keras
!pip install ktrain
import keras
import ktrain
from ktrain import text



## Pre-processing

In [18]:
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(dataset, 
                                                                   'review',
                                                                   label_columns='sentiment',
                                                                   maxlen=64, 
                                                                   max_features=10000,
                                                                   preprocess_mode='standard',
                                                                   lang="English",
                                                                   ngram_range=1,
                                                                   val_pct = 0.1,
                                                                   )

language: English
Word Counts: 77666
Nrows: 18000
18000 train sequences
train sequence lengths:
	mean : 220
	95percentile : 558
	99percentile : 861
x_train shape: (18000,64)
y_train shape: (18000, 2)
Is Multi-Label? False
2000 test sequences
test sequence lengths:
	mean : 223
	95percentile : 569
	99percentile : 897
x_test shape: (2000,64)
y_test shape: (2000, 2)


## Training

In [19]:
model = text.text_classifier('nbsvm', (x_train, y_train) , preproc=preproc)
classifier = ktrain.get_learner(model, 
                             train_data=(x_train, y_train), 
                             val_data=(x_test, y_test)
                             )

Is Multi-Label? False
compiling word ID features...
maxlen is 64
building document-term matrix... this may take a few moments...
rows: 1-10000
rows: 10001-18000
computing log-count ratios...
done.


## Fit One Cycle

In [20]:
classifier.fit_onecycle(0.01,3)



begin training using onecycle policy with max lr of 0.01...
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fdef77e07f0>

## Validate

In [21]:
classifier.validate()

              precision    recall  f1-score   support

           0       0.83      0.82      0.83      1040
           1       0.81      0.81      0.81       960

    accuracy                           0.82      2000
   macro avg       0.82      0.82      0.82      2000
weighted avg       0.82      0.82      0.82      2000



array([[856, 184],
       [178, 782]])

# Predictor

In [22]:
predictor = ktrain.get_predictor(classifier.model, preproc)
predictor.get_classes()

['negative', 'positive']

## Testing with new text

In [25]:
predictor.predict('A movie for dumb people')

'negative'

In [26]:
predictor.predict('A movie for smart people')

'positive'