# IMDB sentiment analysis with scikit-learn

## Fetch data

In [1]:
import tensorflow as tf
import numpy as np

# Vocabulary cut-off handed to Keras as num_words: ids outside this range are
# replaced by the out-of-vocabulary id (2) when the data is loaded.
NUM_WORDS = 4000
# get_word_index() ids are shifted by this offset in the loaded data, which
# leaves ids 0, 1 and 2 free for the special tokens assigned below.
INDEX_OFFSET = 3

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=NUM_WORDS)

word_index = tf.keras.datasets.imdb.get_word_index()
index2word = {i + INDEX_OFFSET: word for word, i in word_index.items()}
index2word[0] = '[pad]'
index2word[1] = '[bos]'
index2word[2] = '[oov]'


def decode_reviews(encoded_reviews):
    """Convert integer-encoded reviews into space-separated strings.

    Each id is looked up in ``index2word``; the result is a NumPy array with
    one decoded string per review, in the same order as the input.
    """
    return np.array([' '.join(index2word[idx] for idx in review) for review in encoded_reviews])


# Train and test are decoded with the same helper so they cannot drift apart.
x_train = decode_reviews(x_train)
x_test = decode_reviews(x_test)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [2]:
# Number of entries in the complete IMDB word index.
full_vocab_size = len(word_index)
full_vocab_size

88584

In [8]:
# Peek at the first three decoded training reviews.
x_train[:3]

array(["[bos] this film was just brilliant casting location scenery story direction [oov] really suited the part they played and you could just imagine being there robert [oov] is an amazing actor and now the same being director [oov] father came from the same [oov] island as myself so i loved the fact there was a real connection with this film the witty [oov] throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for [oov] and would recommend it to everyone to watch and the fly [oov] was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also [oov] to the two little [oov] that played the [oov] of norman and paul they were just brilliant children are often left out of the [oov] list i think because the stars that play them all grown up are such a big [oov] for the whole film but these children are amazing and should be [oov] for what they hav

## Alternative: download the raw IMDB reviews archive

In [4]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xvf  'aclImdb_v1.tar.gz'

--2022-01-26 17:56:43--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: 'aclImdb_v1.tar.gz'

     0K .......... .......... .......... .......... ..........  0% 85,6K 15m59s
    50K .......... .......... .......... .......... ..........  0%  217K 11m8s
   100K .......... .......... .......... .......... ..........  0%  266K 9m8s
   150K .......... .......... .......... .......... ..........  0%  268K 8m7s
   200K .......... .......... .......... .......... ..........  0% 2,50M 6m36s
   250K .......... .......... .......... .......... ..........  0%  277K 6m19s
   300K .......... .......... .......... .......... ..........  0% 3,73M 5m28s
   350K .......... .......... .......... .......... ..........  0%  300K 5m21s
   400K .........

 12650K .......... .......... .......... .......... .......... 15% 2,93M 43s
 12700K .......... .......... .......... .......... .......... 15% 10,2M 42s
 12750K .......... .......... .......... .......... .......... 15% 2,89M 42s
 12800K .......... .......... .......... .......... .......... 15% 3,51M 42s
 12850K .......... .......... .......... .......... .......... 15% 4,99M 42s
 12900K .......... .......... .......... .......... .......... 15% 4,31M 42s
 12950K .......... .......... .......... .......... .......... 15% 5,28M 42s
 13000K .......... .......... .......... .......... .......... 15% 4,06M 42s
 13050K .......... .......... .......... .......... .......... 15% 6,15M 41s
 13100K .......... .......... .......... .......... .......... 16% 2,09M 41s
 13150K .......... .......... .......... .......... .......... 16% 2,77M 41s
 13200K .......... .......... .......... .......... .......... 16% 5,76M 41s
 13250K .......... .......... .......... .......... .......... 16% 2,15M 41s

## Create the vocabulary

In [5]:
# Collect the distinct whitespace-separated tokens of the training reviews
# straight into a set, instead of first storing every token occurrence of the
# whole corpus in a list and de-duplicating afterwards.
vocabulary = {token for text in x_train for token in text.split()}
print(len(vocabulary))

3998



 74650K .......... .......... .......... .......... .......... 90%  428M 3s
 74700K .......... .......... .......... .......... .......... 90% 7,27M 3s
 74750K .......... .......... .......... .......... .......... 91% 3,60M 3s
 74800K .......... .......... .......... .......... .......... 91% 1,22M 3s
 74850K .......... .......... .......... .......... .......... 91% 45,1M 3s
 74900K .......... .......... .......... .......... .......... 91% 3,07M 3s
 74950K .......... .......... .......... .......... .......... 91%  360M 3s
 75000K .......... .......... .......... .......... .......... 91% 4,12M 3s
 75050K .......... .......... .......... .......... .......... 91% 12,5M 3s
 75100K .......... .......... .......... .......... .......... 91% 5,42M 3s
 75150K .......... .......... .......... .......... .......... 91% 2,86M 3s
 75200K .......... .......... .......... .......... .......... 91% 1,71M 3s
 75250K .......... .......... .......... .......... .......... 91% 1,59M 3s
 75300K ...

## Create binary vectors 

In [6]:
from tqdm import tqdm


def binarize(texts, feature_tokens):
    """Encode each text as a 0/1 vector marking which feature tokens it contains.

    Parameters
    ----------
    texts : sized iterable of str
        Documents whose tokens are separated by whitespace.
    feature_tokens : sequence of str
        Column ``j`` of the result corresponds to ``feature_tokens[j]``.

    Returns
    -------
    numpy.ndarray of shape (len(texts), len(feature_tokens))
        Entry ``[i, j]`` is 1 if ``feature_tokens[j]`` occurs in ``texts[i]``,
        otherwise 0.
    """
    # Token -> column lookup: each document costs O(its own tokens) instead of
    # scanning its token list once per vocabulary entry.
    column_of = {token: col for col, token in enumerate(feature_tokens)}
    matrix = np.zeros((len(texts), len(feature_tokens)), dtype=int)
    for row, text in enumerate(tqdm(texts)):
        for token in set(text.split()):
            col = column_of.get(token)
            # Tokens that are not features (possible in the test split) are ignored.
            if col is not None:
                matrix[row, col] = 1
    return matrix


# Sorting fixes the column order, so the feature layout is the same on every
# run rather than following the arbitrary iteration order of a set.
feature_tokens = sorted(vocabulary)

# One row per review, one column per vocabulary token.
x_train_binary = binarize(x_train, feature_tokens)
x_test_binary = binarize(x_test, feature_tokens)

100%|████████████████████████████████████████████████████████████████████████████| 25000/25000 [06:10<00:00, 67.39it/s]
100%|████████████████████████████████████████████████████████████████████████████| 25000/25000 [06:00<00:00, 69.42it/s]


In [26]:
# Show columns 20-499 of the first training review's binary feature vector.
print(x_train_binary[0, 20:500])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [22]:
# Label of the first training review.
first_label = y_train[0]
print(first_label)

1


## Naive Bayes Classifier


In [40]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes treats every feature as a binary present/absent
# variable, which matches the 0/1 vectors in x_train_binary.
# `nb` is the fitted model that the evaluation cell below relies on.
nb = BernoulliNB()
nb.fit(x_train_binary, y_train)

BernoulliNB()

In [43]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of Naive Bayes on the test reviews.
nb_test_pred = nb.predict(x_test_binary)
print(classification_report(y_test, nb_test_pred))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84     12500
           1       0.84      0.84      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000



## Decision Tree classifier 

In [44]:
from sklearn.tree import DecisionTreeClassifier

# Tree construction involves randomness (e.g. the order in which features are
# tried at a split), so random_state is fixed to make the fitted tree and the
# reports below reproducible across kernel restarts.
dt = DecisionTreeClassifier(criterion='entropy', random_state=0)
dt.fit(x_train_binary, y_train)

DecisionTreeClassifier(criterion='entropy')

In [62]:
# Score the decision tree on the same reviews it was trained on.
dt_train_pred = dt.predict(x_train_binary)
print(classification_report(y_train, dt_train_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12500
           1       1.00      1.00      1.00     12500

    accuracy                           1.00     25000
   macro avg       1.00      1.00      1.00     25000
weighted avg       1.00      1.00      1.00     25000



In [45]:
# Score the decision tree on the test reviews.
dt_test_pred = dt.predict(x_test_binary)
print(classification_report(y_test, dt_test_pred))

              precision    recall  f1-score   support

           0       0.71      0.71      0.71     12500
           1       0.71      0.71      0.71     12500

    accuracy                           0.71     25000
   macro avg       0.71      0.71      0.71     25000
weighted avg       0.71      0.71      0.71     25000



## Random Forest classifier

In [46]:
from sklearn.ensemble import RandomForestClassifier

# The forest bootstraps samples and sub-samples features at random; fixing
# random_state makes the fitted model and its report reproducible.
rf = RandomForestClassifier(criterion='entropy', random_state=0)
rf.fit(x_train_binary, y_train)

RandomForestClassifier(criterion='entropy')

In [47]:
# Score the random forest on the test reviews.
rf_test_pred = rf.predict(x_test_binary)
print(classification_report(y_test, rf_test_pred))

              precision    recall  f1-score   support

           0       0.84      0.85      0.84     12500
           1       0.84      0.84      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000



## AdaBoost classifier

In [48]:
from sklearn.ensemble import AdaBoostClassifier

# random_state is fixed, as for the other tree-based models, so that the
# boosted ensemble is reproducible from run to run.
ab = AdaBoostClassifier(random_state=0)
ab.fit(x_train_binary, y_train)

AdaBoostClassifier()

In [50]:
# Score AdaBoost on the test reviews.
ab_test_pred = ab.predict(x_test_binary)
print(classification_report(y_test, ab_test_pred))

              precision    recall  f1-score   support

           0       0.83      0.77      0.80     12500
           1       0.79      0.84      0.81     12500

    accuracy                           0.81     25000
   macro avg       0.81      0.81      0.81     25000
weighted avg       0.81      0.81      0.81     25000

