<a href="https://colab.research.google.com/github/dmitriy-iliyov/data-science/blob/main/neural_network/lstm/notebook/lab_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')
datasets_dir = '/content/drive/MyDrive/data/reviews'

Mounted at /content/drive


In [None]:
import os

os.makedirs('/root/.kaggle', exist_ok=True)
!cp /content/drive/MyDrive/kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

path = "yelp-dataset/yelp-dataset"

dir_name = path.split('/')[1]
current_dir_path = os.path.join(datasets_dir, dir_name)

os.makedirs(current_dir_path, exist_ok=True)

!kaggle datasets download -d {path} -p {current_dir_path}

zip_file_path = os.path.join(current_dir_path, f"{dir_name}.zip")
!unzip -q {zip_file_path} -d {current_dir_path}

os.remove(zip_file_path)

cp: cannot stat '/content/drive/MyDrive/kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
Dataset URL: https://www.kaggle.com/datasets/yelp-dataset/yelp-dataset
License(s): other
Downloading yelp-dataset.zip to /content/drive/MyDrive/data/reviews/yelp-dataset
100% 4.07G/4.07G [00:53<00:00, 82.8MB/s]
100% 4.07G/4.07G [00:53<00:00, 81.2MB/s]


In [2]:
import json

def read_data(d):
  reviews = []
  stars = []
  with open(datasets_dir + '/yelp-dataset/yelp_academic_dataset_review.json', encoding='utf-8') as file:
    count = 0
    for line in file:
      jsoned_line = json.loads(line)
      reviews.append(jsoned_line['text'])
      stars.append(jsoned_line['stars'])
      count += 1
      if count >= d:
        break
  return reviews, stars


In [3]:
import re
import numpy as np
import pandas as pd
import nltk
from nltk import *
from nltk.corpus import stopwords


nltk.download('stopwords')


def _start_pre_processing(doc):
    doc = re.sub(r'http[s]?://\S+|www\.\S+', '', doc)
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I | re.A)
    doc = doc.lower()
    doc = doc.strip()
    wpt = WordPunctTokenizer()
    tokens = wpt.tokenize(doc)
    custom_stopwords = set(stopwords.words('english')) - {
    'not', 'very', 'never', 'no', 'nothing', 'more', 'less', 'good', 'great', 'happy',
    'excellent', 'amazing', 'bad', 'horrible', 'sad', 'angry', 'worse', 'could', 'should',
    'would', 'might', 'may', 'absolutely', 'completely', 'totally', 'think', 'opinion'
    }
    filtered_tokens = [token for token in tokens if token not in custom_stopwords]
    doc = ' '.join(filtered_tokens)
    return doc


def _str_pre_processing(_str):
    sentences = _str.split('.')
    prepared_corpus = [_start_pre_processing(sentence) for sentence in sentences]
    prepared_corpus = ' '.join(list(filter(None, prepared_corpus)))
    return prepared_corpus


def do_pre_processing(doc):
    if isinstance(doc, str):
        return _str_pre_processing(doc)
    else:
        print("ERROR:   TextPreProcessor can't prepare this type of data.")
        return None

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Input
from keras.initializers import Constant
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.utils import resample
from sklearn.utils import shuffle



def prepare_reviews(reviews, max_text_len, pre_processing):
    if(pre_processing):
        reviews = [do_pre_processing(review) for review in reviews]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(reviews)
    reviews = tokenizer.texts_to_sequences(reviews)
    print(f'rewiev example: {reviews[0]}')

    reviews = pad_sequences(reviews, maxlen=max_text_len)

    return reviews, len(tokenizer.word_index)


def prepare_stars(stars):
    stars = np.array(stars)
    stars = stars.reshape(-1, 1)
    encoder = OneHotEncoder()
    stars = encoder.fit_transform(stars).toarray()
    stars = np.array(stars).astype(int)
    print(f'star example: {stars[0]}')
    return stars


def downsampling(reviews, stars):
    reviews = np.array(reviews)
    stars = np.array(stars).astype(int)

    class_counts = np.bincount(stars)[1:]
    min_count = np.min(class_counts)
    print(f'class counts before downsampling: {class_counts}')

    balanced_reviews = []
    balanced_stars = []

    for star in np.unique(stars):
      class_reviews = reviews[stars == star]
      class_stars = stars[stars == star]

      if len(class_reviews) > min_count:
          class_reviews_resampled, class_stars_resampled = resample(class_reviews,
                                                                    class_stars,
                                                                    replace=False,
                                                                    n_samples=min_count,
                                                                    random_state=42)
          balanced_reviews.extend(class_reviews_resampled)
          balanced_stars.extend(class_stars_resampled)
      else:
          balanced_reviews.extend(class_reviews)
          balanced_stars.extend(class_stars)

    balanced_reviews = np.array(balanced_reviews)
    balanced_stars = np.array(balanced_stars)

    class_counts = np.bincount(balanced_stars)[1:]
    print(f'class counts after downsampling: {class_counts}')

    balanced_reviews, balanced_stars = shuffle(balanced_reviews, balanced_stars, random_state=42)

    return balanced_reviews, balanced_stars


def prepare_data(d, k, max_text_len, pre_processing = True):

    reviews, stars = read_data(d)

    reviews, stars = downsampling(reviews, stars)

    reviews, word_count = prepare_reviews(reviews, max_text_len, pre_processing)
    print(f'reviews count: {len(reviews)}')

    stars = prepare_stars(stars)


    index = int(k * len(reviews))
    train_data = reviews[:index]
    train_answers = stars[:index]
    test_data = reviews[index:]
    test_answers = stars[index:]

    return train_data, train_answers, test_data, test_answers, word_count


In [5]:
import tensorflow as tf
from tensorflow.keras.callbacks import Callback


class DivergenceEarlyStopping(Callback):
    def __init__(self, patience=3, restore_best_weights=True):
        super().__init__()
        self.monitor = 'loss'
        self.patience = patience
        self.restore_best_weights = restore_best_weights
        self.best_weights = None
        self.wait = 0
        self.stopped_epoch = 0
        self.previous_train_loss = None
        self.previous_val_loss = None

    def on_epoch_end(self, epoch, logs=None):
        train_loss = logs.get(f'{self.monitor}')
        val_loss = logs.get(f'val_{self.monitor}')

        if self.previous_train_loss is None:
            self.previous_train_loss = train_loss
            self.previous_val_loss = val_loss
            return

        if val_loss > train_loss:
            self.stopped_epoch = epoch
            self.model.stop_training = True

        if self.wait >= self.patience:
            self.stopped_epoch = epoch
            self.model.stop_training = True

        self.previous_train_loss = train_loss
        self.previous_val_loss = val_loss

    def on_train_end(self, logs=None):
        if self.stopped_epoch > 0 and self.restore_best_weights:
            self.model.set_weights(self.best_weights)



In [15]:
import time
import tensorflow as tf
from keras.callbacks import EarlyStopping
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Input
from matplotlib import pyplot as plt


class KerasLSTM:

    def __init__(self, word_count, max_length):
        self._model = Sequential([
            Input(shape=(max_length,)),
            Embedding(input_dim=word_count + 1, output_dim=128, input_length=max_length),
            LSTM(128, activation='tanh', return_sequences=False),
            Dense(5, activation='softmax')
        ])
        self._model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    def summary(self):
        self._model.summary()

    def fit(self, train_data, train_answers, validation_split=0.2, epochs=100, batch_size=128):
        start_time = time.time()
        early_stopping = DivergenceEarlyStopping()
        history = self._model.fit(
            train_data, train_answers,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=validation_split,
            callbacks=[early_stopping]
        )
        execution_time = time.time() - start_time
        print(f"Training completed in {execution_time:.2f} seconds.")
        self.plot_history(history, execution_time)
        self.save()
        return history, execution_time

    def evaluate(self, test_data, test_answers):
        test_loss, test_accuracy = self._model.evaluate(test_data, test_answers, verbose=1)
        print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")
        return test_loss, test_accuracy

    def predict(self, sequence):
        return self._model.predict(sequence)

    def save(self, path='/content/drive/MyDrive/main/languages/Python/neural_network/labs/lab_7/model/lstm_model.keras'):
        self._model.save(path)

    @staticmethod
    def plot_history(history, execution_time):
        epochs = len(history.history['loss'])
        plt.figure(figsize=(12, 4))

        plt.subplot(1, 2, 1)
        plt.plot(range(1, epochs + 1), history.history['accuracy'], label='Training Accuracy')
        if 'val_accuracy' in history.history:
            plt.plot(range(1, epochs + 1), history.history['val_accuracy'], label='Validation Accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.title(f'Accuracy (Execution Time: {execution_time:.2f} seconds)')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(range(1, epochs + 1), history.history['loss'], label='Training Loss')
        if 'val_loss' in history.history:
            plt.plot(range(1, epochs + 1), history.history['val_loss'], label='Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.title('Loss')
        plt.legend()

        plt.tight_layout()
        plt.show()



In [16]:
with tf.device('/GPU:0'):
  reviews_count = 100000
  max_length = 100
  train_data, train_answers, test_data, test_answers, word_count = prepare_data(reviews_count, 0.8, max_length)

  print(f'Vocabulary length: {word_count}')

  lstm = KerasLSTM(word_count, max_length)
  lstm.summary()

  lstm.fit(train_data, train_answers)

  count = 20

  predicting_data = test_data[:count].copy()
  predicting_answers = test_answers[:count].copy().tolist()
  test_data = test_data[count:]
  test_answers = test_answers[count:]

  lstm.evaluate(test_data, test_answers)

  for i, data in enumerate(predicting_data):
      data = data.reshape(1, -1)
      predicted_vec = lstm.predict(data)[0].tolist()
      max_val_in_vec = max(predicted_vec)
      predicted_val_index = predicted_vec.index(max_val_in_vec)
      predicterd_val = predicted_val_index+1
      print(f"real stars: {(predicting_answers[i].index(1)) + 1}; predicted: {predicterd_val}")

class counts before downsampling: [10921  7988 11362 25337 44392]
class counts after downsampling: [7988 7988 7988 7988 7988]
rewiev example: [4, 6, 352, 1807, 1682, 37, 609, 56, 838, 856, 1095, 714, 1155, 7235, 873, 22, 2, 363, 491, 13164, 2003, 181, 417, 250, 231, 991, 37, 3769, 182, 2457, 1150, 505, 155, 1246, 100, 2150, 782, 471, 275, 575, 4, 5431, 8, 403, 16, 7, 145, 10154, 2815, 587, 38, 18, 5256, 822, 1682, 37, 242, 3, 5432, 105, 884, 798, 285, 92, 499, 119, 16, 187, 1690, 10155, 1081, 471, 2]
reviews count: 39940
star example: [0 0 0 1 0]
Vocabulary length: 53723




Epoch 1/100
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m248s[0m 307ms/step - accuracy: 0.4355 - loss: 1.2650 - val_accuracy: 0.5578 - val_loss: 1.0072
Epoch 2/100
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 287ms/step - accuracy: 0.6637 - loss: 0.8061 - val_accuracy: 0.5636 - val_loss: 1.0279


TypeError: object of type 'NoneType' has no len()