<a href="https://colab.research.google.com/github/gorzanskik-ai/intro-to-ann/blob/main/07_rnn/02_text_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import os

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

In [2]:
# Dataset download, left commented out so a re-run does not fetch it again.
# Uncomment both lines on a fresh machine/Colab session; the archive is
# expected to unpack into the ./reviews directory read by the cells below.
#!wget https://storage.googleapis.com/esmartdata-courses-files/ann-course/reviews.zip
#!unzip -q reviews.zip

In [3]:
# Load the training reviews from <data_dir>/train/{neg,pos}/*.txt.
# Labels follow the folder name: 'neg' -> 0, 'pos' -> 1.
data_dir = './reviews'
train_dir = os.path.join(data_dir, 'train')

train_texts = []
train_labels = []

for label, label_type in enumerate(['neg', 'pos']):
    dir_name = os.path.join(train_dir, label_type)
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order the same on every machine and every run.
    for fname in sorted(os.listdir(dir_name)):
        if not fname.endswith('.txt'):
            continue
        # `with` closes the file even if read() raises, and the explicit
        # encoding keeps decoding independent of the platform's locale.
        with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
            train_texts.append(f.read())
        train_labels.append(label)

In [4]:
# Load the test reviews from <data_dir>/test/{neg,pos}/*.txt, using the same
# labelling as the training set: 'neg' -> 0, 'pos' -> 1.
test_dir = os.path.join(data_dir, 'test')

test_texts = []
test_labels = []

for label, label_type in enumerate(['neg', 'pos']):
    dir_name = os.path.join(test_dir, label_type)
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order the same on every machine and every run.
    for fname in sorted(os.listdir(dir_name)):
        if not fname.endswith('.txt'):
            continue
        # `with` closes the file even if read() raises, and the explicit
        # encoding keeps decoding independent of the platform's locale.
        with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
            test_texts.append(f.read())
        test_labels.append(label)

In [5]:
# Peek at the first five raw training reviews.
train_texts[:5]

["Oh God,what an idiotic movie!Incredibly cheap with fake special effects(the creature is played by one guy in lame costume)and stupid plot.All dialogues are unbelievably bad and these actors(HA!HA!HA!)...they're simply ludicrous.For example I have never seen so annoying characters like in this junk(these dumb kids or pregnant woman with his husband and many more).All in all,this is a great entertainment if you're drunk.Avoid it like the plague.Am I drunk?I don't think so...",
 'I am a student of film, and have been for several years. And the concept of a cyber, kung-fu, satirical chimpanzee had me wondering, "Is this the film that\'s going to break the mold?" Let\'s face it, America has never been let down by any piece of cinema that features a simian costar. After such great classics as "Monkey Trouble" and "Dunston Checks In", I thought that the best ideas were already taken. But then comes "Funky Monkey". I laughed, I cried, I contemplated suicide.<br /><br />Now I\'ve read about d

In [6]:
# Labels of the first five training reviews (0 = negative, 1 = positive).
train_labels[:5]

[0, 0, 0, 0, 0]

In [7]:
maxlen = 100   # every review is cut/padded to 100 tokens
num_words = 10000    # keep only the 10,000 most frequent words
embedding_dim = 100  # length of each word-embedding vector (used by the dense model)

# Build the vocabulary from the training reviews only.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_texts)

In [8]:
# First 20 (index, word) pairs of the tokenizer's index -> word mapping.
list(tokenizer.index_word.items())[:20]

[(1, 'the'),
 (2, 'and'),
 (3, 'a'),
 (4, 'of'),
 (5, 'to'),
 (6, 'is'),
 (7, 'br'),
 (8, 'in'),
 (9, 'it'),
 (10, 'i'),
 (11, 'this'),
 (12, 'that'),
 (13, 'was'),
 (14, 'as'),
 (15, 'for'),
 (16, 'with'),
 (17, 'movie'),
 (18, 'but'),
 (19, 'film'),
 (20, 'on')]

In [9]:
# Encode each training review as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(train_texts)
print(sequences[:3])    # reviews written as word indices

[[446, 555, 48, 32, 3184, 17, 962, 702, 16, 1211, 315, 299, 1, 1656, 6, 253, 31, 28, 229, 8, 832, 2289, 2, 375, 111, 29, 3950, 23, 3789, 75, 2, 131, 153, 2665, 2665, 2665, 503, 328, 2753, 15, 459, 10, 25, 112, 107, 35, 613, 102, 37, 8, 11, 2576, 131, 989, 359, 39, 2735, 252, 16, 24, 655, 2, 108, 50, 29, 8, 29, 11, 6, 3, 84, 718, 44, 332, 1816, 794, 9, 37, 1, 3185, 241, 10, 1816, 10, 89, 101, 35], [10, 241, 3, 1467, 4, 19, 2, 25, 74, 15, 447, 150, 2, 1, 1117, 4, 3, 2130, 1876, 5924, 66, 69, 1525, 6, 11, 1, 19, 195, 167, 5, 985, 1, 9259, 900, 390, 9, 931, 45, 112, 74, 384, 177, 31, 98, 415, 4, 434, 12, 941, 3, 100, 138, 84, 2232, 14, 3584, 1110, 2, 8, 10, 194, 12, 1, 115, 1005, 68, 457, 620, 18, 92, 263, 8422, 3584, 10, 1495, 10, 3765, 10, 1714, 7, 7, 147, 204, 329, 41, 2718, 6887, 8, 1, 3396, 18, 12, 128, 149, 1257, 135, 291, 59, 984, 138, 3, 2211, 4, 442, 83, 122, 257, 30, 219, 3, 7693, 4, 1657, 10, 1702, 12, 3, 13, 8, 189, 32, 4174, 21, 3, 3584, 30, 29, 187, 10, 13, 249, 12, 1, 1053, 

In [10]:
# word -> index mapping for every word seen while fitting; its size is the
# full vocabulary, not the num_words cap.
word_index = tokenizer.word_index
print(f'{len(word_index)} unikatowych słów.')

88582 unikatowych słów.


In [11]:
# Bring every review to exactly `maxlen` tokens. With pad_sequences' defaults
# (padding='pre', truncating='pre') shorter reviews are zero-padded on the left
# and longer ones keep their LAST `maxlen` tokens, not the first ones;
# pass truncating='post' to keep the first `maxlen` tokens instead.
train_data = pad_sequences(sequences, maxlen=maxlen)
train_data.shape

(25000, 100)

In [12]:
# First five padded rows; leading zeros are padding.
train_data[:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,  446,  555,   48,   32, 3184,   17,  962,  702,   16,
        1211,  315,  299,    1, 1656,    6,  253,   31,   28,  229,    8,
         832, 2289,    2,  375,  111,   29, 3950,   23, 3789,   75,    2,
         131,  153, 2665, 2665, 2665,  503,  328, 2753,   15,  459,   10,
          25,  112,  107,   35,  613,  102,   37,    8,   11, 2576,  131,
         989,  359,   39, 2735,  252,   16,   24,  655,    2,  108,   50,
          29,    8,   29,   11,    6,    3,   84,  718,   44,  332, 1816,
         794,    9,   37,    1, 3185,  241,   10, 1816,   10,   89,  101,
          35],
       [  12,  788,  687,   12,   59,   57,   27,   31, 1874,  906,    2,
          22,   76, 8422, 3584,    1, 1326,    5,    1,   17,    3, 2316,
         497,  253,   31, 3951,    3,    2,    1, 5120,  427,   54,   28,
         183,    5,  456,   41,  138,   30,    3,  309,  392, 2316,  497,
           7,    7, 842

In [13]:
# Turn the label list into a NumPy array so it can be reordered with an
# index array in the shuffle step.
train_labels = np.asarray(train_labels)
train_labels

array([0, 0, 0, ..., 1, 1, 1])

In [14]:
# Shuffle the samples: as loaded, all negative reviews come first and all
# positive ones second, so a head/tail split of the unshuffled data would be
# heavily skewed. A fixed seed makes the permutation, and therefore the
# train/validation split below, repeatable for a given input order.
rng = np.random.default_rng(42)
indices = rng.permutation(train_data.shape[0])
train_data = train_data[indices]
train_labels = train_labels[indices]

train_data.shape

(25000, 100)

In [15]:
# Split the shuffled data into a training part and a validation part.
training_samples = 15000
validation_samples = 10000

# The validation rows start right after the training rows.
train_slice = slice(None, training_samples)
val_slice = slice(training_samples, training_samples + validation_samples)

X_train, y_train = train_data[train_slice], train_labels[train_slice]
X_val, y_val = train_data[val_slice], train_labels[val_slice]

In [16]:
# Baseline model: learned word embeddings, flattened and passed to a small
# dense classifier with a sigmoid output.
# Embedding(input_dim, output_dim)

model = Sequential([
    Embedding(num_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          1000000   
                                                                 
 flatten (Flatten)           (None, 10000)             0         
                                                                 
 dense (Dense)               (None, 16)                160016    
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 1,160,033
Trainable params: 1,160,033
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Binary classification: train with binary cross-entropy and report accuracy.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [18]:
# Train for 5 epochs, evaluating on the validation split during training.
history = model.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_val, y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
def plot_hist(history):
    """Plot training vs. validation curves, first for accuracy, then for loss.

    Two plotly figures are drawn and shown. Each has one trace for the
    training metric and one for its ``val_`` counterpart, plotted against
    the epoch index on a log-scaled y axis.

    Args:
        history: object with a ``history`` attribute (dict of per-epoch lists
            under the keys 'accuracy', 'val_accuracy', 'loss', 'val_loss')
            and an ``epoch`` attribute (list of epoch indices), such as the
            value returned by ``model.fit``.
    """
    import pandas as pd
    import plotly.graph_objects as go

    frame = pd.DataFrame(history.history).assign(epoch=history.epoch)
    epochs = frame['epoch']

    for metric in ('accuracy', 'loss'):
        fig = go.Figure()
        # Training curve first, then the matching validation curve.
        for column in (metric, f'val_{metric}'):
            fig.add_trace(go.Scatter(x=epochs, y=frame[column], name=column, mode='markers+lines'))
        fig.update_layout(width=1000, height=500, title=f'{metric} vs. val {metric}',
                          xaxis_title='Epoki', yaxis_title=metric, yaxis_type='log')
        fig.show()

# Accuracy and loss curves for the most recent training run.
plot_hist(history)

SimpleRNN

In [20]:
from tensorflow.keras.layers import SimpleRNN, LSTM

In [21]:
# SimpleRNN classifier: embedding -> SimpleRNN(16) -> sigmoid output.
# The embedding's input_dim has to cover every index the tokenizer can emit,
# so it is tied to `num_words` rather than repeating the literal 10000.
model = Sequential()
model.add(Embedding(num_words, 32))
model.add(SimpleRNN(16))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          320000    
                                                                 
 simple_rnn (SimpleRNN)      (None, 16)                784       
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,801
Trainable params: 320,801
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Binary classification: train with binary cross-entropy and report accuracy.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [23]:
# Train for 10 epochs, evaluating on the validation split during training.
history = model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
# Accuracy and loss curves for the most recent training run.
plot_hist(history)

LSTM

In [25]:
# LSTM classifier: embedding -> LSTM(16) -> sigmoid output.
# The embedding's input_dim has to cover every index the tokenizer can emit,
# so it is tied to `num_words` rather than repeating the literal 10000.
model = Sequential()
model.add(Embedding(num_words, 32))
model.add(LSTM(16))
model.add(Dense(1, activation='sigmoid'))
model.summary()

# Binary classification: train with binary cross-entropy and report accuracy.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train for 3 epochs, evaluating on the validation split during training.
history = model.fit(X_train, y_train, batch_size=32, epochs=3, validation_data=(X_val, y_val))

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 32)          320000    
                                                                 
 lstm (LSTM)                 (None, 16)                3136      
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 323,153
Trainable params: 323,153
Non-trainable params: 0
_________________________________________________________________
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [26]:
# Evaluate on the held-out test reviews, preprocessed like the training data
# (same fitted tokenizer, same maxlen).
# The test sequences get their own name so the training `sequences` are not
# overwritten; otherwise re-running the padding cell after this one would
# silently build train_data from the test reviews.
test_sequences = tokenizer.texts_to_sequences(test_texts)
X_test = pad_sequences(test_sequences, maxlen=maxlen)
y_test = np.asarray(test_labels)

# Returns [loss, accuracy] for whatever `model` currently refers to
# (the LSTM model when the cells are run in order).
model.evaluate(X_test, y_test, verbose=0)

[0.3506182134151459, 0.8493599891662598]