### Задание 1

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random

In [40]:
# Функция для шифрования текста
def encrypt(text, key):
    indexes = [vocab.index(char) for char in text]
    encrypted_indexes = [(idx + key) % len(vocab) for idx in indexes]
    encrypted_chars = [vocab[idx] for idx in encrypted_indexes]
    encrypted = ''.join(encrypted_chars)
    return encrypted

In [43]:
vocab = [char for char in ' -ABCDEFGHIJKLMNOPQRSTUVWXYZ']
key = 2

message = "RNN IS NOT AI"
encrypted_message = encrypt(message, key)
print(f"Original: {message}")
print(f"Encrypted: {encrypted_message}")

Original: RNN IS NOT AI
Encrypted: TPPAKUAPQVACK


In [46]:
num_examples = 256
seq_len = 18

Определяем функцию для генерации случайнх сообщений, а также определяем архитектуру модели.

In [65]:
def encrypted_dataset(dataset_len, k):
    dataset = []
    for _ in range(dataset_len):
        random_message = ''.join([random.choice(vocab) for _ in range(len(message))])
        encrypt_random_message = encrypt(random_message, k)
        src = [vocab.index(x) for x in random_message]
        tgt = [vocab.index(x) for x in encrypt_random_message]
        dataset.append([torch.tensor(tgt), torch.tensor(src)])
    return dataset

class Decipher(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 rnn_type='simple'):
        super(Decipher, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        if rnn_type == 'simple':
            self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers = 2)

        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.initial_hidden = torch.zeros(2, 1, hidden_dim)


    def forward(self, cipher):
        embd_x = self.embed(cipher).unsqueeze(1)
        out_rnn, hidden = self.rnn(embd_x, self.initial_hidden)
        return self.fc(out_rnn).transpose(1, 2)


In [57]:
embedding_dim = 5
hidden_dim = 10
vocab_size = len(vocab)
lr = 1e-3

model = Decipher(vocab_size, embedding_dim, hidden_dim)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)

num_epochs = 10
k = 10
for epoch in range(num_epochs):
    for encrypted, original in encrypted_dataset(num_examples, k):

        scores = model(encrypted)
        original = original.unsqueeze(1)
        loss = criterion(scores, original)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    with torch.no_grad():
        matches, total = 0, 0
        for encrypted, original in encrypted_dataset(num_examples, k):
            predictions = F.softmax(model(encrypted), 1)
            _, batch_out = predictions.max(dim=1)
            batch_out = batch_out.squeeze(1)
            matches += torch.eq(batch_out, original).sum().item()
            total += torch.numel(batch_out)
        accuracy = matches / total
        print(f"Epoch: {epoch+1}, Loss: {loss.item():.4f}, Accuracy: {accuracy * 100:.2f}%")

Epoch: 1, Loss: 2.7739, Accuracy: 18.93%
Epoch: 2, Loss: 2.2886, Accuracy: 32.87%
Epoch: 3, Loss: 1.6931, Accuracy: 70.55%
Epoch: 4, Loss: 1.1841, Accuracy: 81.01%
Epoch: 5, Loss: 0.8830, Accuracy: 82.00%
Epoch: 6, Loss: 0.8641, Accuracy: 89.18%
Epoch: 7, Loss: 0.5512, Accuracy: 95.04%
Epoch: 8, Loss: 0.3005, Accuracy: 100.00%
Epoch: 9, Loss: 0.2580, Accuracy: 100.00%
Epoch: 10, Loss: 0.2239, Accuracy: 100.00%


### Задание 2

Код для парсинга и создания датасета взят из Лаб. 1

In [5]:
import re
import time
import html
from selenium import webdriver
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from wordcloud import WordCloud
import seaborn as sns
import pandas as pd
import nltk
from nltk.corpus import stopwords
import warnings
from PIL import Image
import numpy as np

In [6]:
# БЫСТРЫЙ ПАРСЕР
start = time.time()
url = "https://www.imdb.com/title/tt8579674/reviews"

# Создаем экземпляр браузера
driver = webdriver.Chrome()

# Открываем страницу
driver.get(url)

# Ждем некоторое время, чтобы страница успела загрузиться
time.sleep(2)

# Проскроллим страницу до тех пор, пока кнопка "Load More" не исчезнет
while True:
    try:
        load_more_button = driver.find_element(By.ID, "load-more-trigger")
        if load_more_button.is_displayed():
            load_more_button.click()
            # Ждем некоторое время, чтобы загрузились дополнительные отзывы
            time.sleep(2)
        else:
            break
    except:
        break

# Парсим полную страницу
soup = BeautifulSoup(driver.page_source, "html.parser")
review_containers = soup.find_all("div", class_="review-container")

reviews = {}  # Словарь для хранения уникальных отзывов

for review_container in review_containers:
    review_id = review_container.find("a", class_="title")["href"]  # Идентификатор отзыва

    # Проверка на уникальность отзыва по его идентификатору
    if review_id not in reviews:
        title = review_container.find("a", class_="title").text.strip()
        display_name = review_container.find("span", class_="display-name-link").text.strip()
        review_date = review_container.find("span", class_="review-date").text.strip()
        
        # Попробуйте извлечь текст отзыва внутри контейнера отзыва
        try:
            review_text = review_container.find("div", class_="text show-more__control").get_text(strip=True)
        except AttributeError:
            review_text = review_container.find("div", class_="text show-more__control clickable").get_text(strip=True)
        # Найти элемент с классом "rating-other-user-rating"
        rating_element = review_container.find("span", class_="rating-other-user-rating")

        if rating_element:
            # Извлечь текст из элемента
            rating_text = rating_element.get_text(strip=True)

            # Разбить текст на части, используя "/"
            parts = rating_text.split('/')

            # Проверить, что получилось две части
            if len(parts) == 2:
                rating_value = parts[0].strip()
                # Преобразовать значение оценки в число
                rating = int(rating_value)
                reviews[review_id] = {
                    "Заголовок отзыва": title,
                    "Имя пользователя": display_name,
                    "Дата отзыва": review_date,
                    "Оценка": rating,
                    "Текст отзыва": review_text
                }
            else:
                rating = None
        else:
            rating = None

# Закрываем браузер
driver.quit()

# Преобразование словаря в список
unique_reviews = list(reviews.values())
print(f"Количество отзывов: {len(unique_reviews)}")
print(f"Время выполнения: {time.time() - start} секунд")

Количество отзывов: 3415
Время выполнения: 349.03616881370544 секунд


In [7]:
unique_reviews

[{'Заголовок отзыва': '"He travels the fastest who travels alone"',
  'Имя пользователя': 'TheLittleSongbird',
  'Дата отзыва': '13 April 2021',
  'Оценка': 10,
  'Текст отзыва': "There are some great WW1 films out there, the granddaddy of them all in my view being 1930's 'All Quiet on the Western Front'. Also appreciate many of the actors here, though they are in cameo roles, and Sam Mendes as a director (of the films of his seen, almost all, 'Spectre' is the only one to not do much for me). Roger Deakins and Thomas Newman are masters in their field, Deakins is one of the best cinematographers in the business and Newman's score for 'Road to Perdition' is a favourite.'1917' was seen for all those reasons. As well as because of the critical acclaim, with it being considered as one of 2019's year's best films. After seeing it, my thoughts are that the acclaim for '1917' is richly deserved in one of the best and most powerful films that year. It did connect a lot with me, due to watching 

In [39]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [18]:
df = pd.DataFrame(unique_reviews)
df['Sentiment'] = pd.cut(df['Оценка'], bins=[0, 4, 7, 10], labels=['Негативный', 'Нейтральный', 'Позитивный'])
# Преобразование текста в числовой формат
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Текст отзыва'])
text_sequences = tokenizer.texts_to_sequences(df['Текст отзыва'])

In [32]:
df

Unnamed: 0,Заголовок отзыва,Имя пользователя,Дата отзыва,Оценка,Текст отзыва,Sentiment
0,"""He travels the fastest who travels alone""",TheLittleSongbird,13 April 2021,10,"There are some great WW1 films out there, the ...",Позитивный
1,One Shot Cinematography?,pawanpunjabithewriter,21 August 2023,8,"Watching 1917, the audience might discern its ...",Позитивный
2,Reasonably accurate in showing the hell that i...,planktonrules,10 April 2020,9,I am not particularly a huge fan of war films....,Позитивный
3,Another Day In One Long Take,boblipton,10 January 2020,8,The command has realized that the Germans have...,Позитивный
4,An achievement,aivilovee,12 December 2019,9,It's a stunning watch from start to finish. Th...,Позитивный
...,...,...,...,...,...,...
3410,Very surprising film,fitriaaoct,29 January 2020,10,"Very unpredictable, the film is arranged neatl...",Позитивный
3411,So beautiful and sad,dannyfqm,25 January 2020,10,I was totally in the film and by the end have ...,Позитивный
3412,Truely remarkable,schillingsf,1 February 2020,9,"Such an amazing movie, the fact that it's all ...",Позитивный
3413,Moving reality of WW1,munashe-11372,14 January 2020,9,"Incredibly moving reality of WW1, no Hollywood...",Позитивный


In [23]:
# Преобразование целевой переменной в формат, пригодный для обучения модели
encoder = OneHotEncoder(sparse=False, categories='auto', dtype=int)
labels = encoder.fit_transform(df['Sentiment'].values.reshape(-1, 1))

In [25]:
X_train, X_test, y_train, y_test = train_test_split(text_sequences, labels, test_size=0.2, random_state=42)

In [26]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 50
max_length = 100

In [27]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(100))
model.add(Dense(3, activation='softmax'))  # 3 класса: Негативный, Нейтральный, Позитивный

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

model.fit(pad_sequences(X_train, maxlen=max_length, padding='post'),
          y_train,
          epochs=10,
          validation_data=(pad_sequences(X_test, maxlen=max_length, padding='post'), y_test),
          callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<keras.src.callbacks.History at 0x24ead598790>

In [28]:
# Оценка модели
y_pred = model.predict(pad_sequences(X_test, maxlen=max_length, padding='post'))
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

print("Classification Report:")
print(classification_report(y_test_classes, y_pred_classes))

Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.01      0.02        82
           1       0.44      0.21      0.29       142
           2       0.72      0.97      0.83       459

    accuracy                           0.69       683
   macro avg       0.50      0.40      0.38       683
weighted avg       0.62      0.69      0.62       683



Балансировка классов

In [33]:
class_weights = compute_class_weight('balanced', classes=np.unique(df['Sentiment']), y=df['Sentiment'])
class_weight_dict = dict(enumerate(class_weights))

In [34]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 50
max_length = 100

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(100))
model.add(Dense(3, activation='softmax'))

In [35]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

model.fit(pad_sequences(X_train, maxlen=max_length, padding='post'),
          y_train,
          epochs=10,
          validation_data=(pad_sequences(X_test, maxlen=max_length, padding='post'), y_test),
          callbacks=[early_stopping],
          class_weight=class_weight_dict)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


<keras.src.callbacks.History at 0x24eadbc2280>

In [36]:
y_pred = model.predict(pad_sequences(X_test, maxlen=max_length, padding='post'))
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

print("Classification Report with Class Balancing:")
print(classification_report(y_test_classes, y_pred_classes))

Classification Report with Class Balancing:
              precision    recall  f1-score   support

           0       0.28      0.65      0.40        82
           1       0.40      0.27      0.32       142
           2       0.91      0.80      0.85       459

    accuracy                           0.67       683
   macro avg       0.53      0.57      0.52       683
weighted avg       0.73      0.67      0.68       683



##### Выводы:
Результаты обучения с применением RNN значительно лучше, чем при обучении с использованием классических алгоритмов машинного обучения. После добваления параметра "вес класса" результат стал незначительно лучше, чем до использования этого параметра. Это говорит о том, что в данных наблюдается сильный дисбаланс классов и модель сосредотачивалась на более представленных классах и плохо обобщалась на менее представленных. Этот параметр позволил взвесить веса классов и присвоить большие веса менее представленным классам и меньшие веса более представленным классам, что в положительную сторону сказалось на результатах работа модели на тестовом наборе данных.

### Задача 3

In [3]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist

In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

nltk.download('punkt')

with open('dostoevsky.txt', 'r', encoding='utf-8') as file:
    text = file.read()

words = word_tokenize(text, language='russian')
sentence_count = text.count('.') + text.count('!') + text.count('?')

words_lower = [word.lower() for word in words]

freq_dist = FreqDist(words_lower)

# Подсчет частоты для каждого слова
word_freq_besy = freq_dist['бесы'] / len(words)
word_freq_semya = freq_dist['семья'] / len(words)
word_freq_brat = freq_dist['брат'] / len(words)

print(f"Частота слова 'бесы': {word_freq_besy:.5f}")
print(f"Частота слова 'семья': {word_freq_semya:.5f}")
print(f"Частота слова 'брат': {word_freq_brat:.5f}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Drama\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Частота слова 'бесы': 0.00001
Частота слова 'семья': 0.00001
Частота слова 'брат': 0.00024


In [10]:
print(f"Количество слов: {len(words)}")
print(f"Количество предложений: {len(sentences)}")

Количество слов: 2303779
Количество предложений: 121743


In [29]:
generated_text = ' '.join([word for word in words if random.uniform(0, 1) >= 0.85 and random.uniform(0, 1) <= 0.95])

final_sentence = ' '.join(generated_text.split()[:10])

print("\nИтоговое предложение из 10 слов:")
print(final_sentence)


Итоговое предложение из 10 слов:
написать приятное подноготную земле вырывают а дребедень и ; Ф.


# Часть 2

In [34]:
from navec import Navec

# Загрузка эмбеддингов
navec = Navec.load('navec_hudlit_v1_12B_500K_300d_100q.tar')

In [40]:
from natasha import NewsEmbedding, NewsMorphTagger, Doc

# Загрузка эмбеддингов fasttext
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)


In [45]:
from natasha import Segmenter

# Инициализация сегментатора
segmenter = Segmenter()

def get_embeddings(text, morph_tagger):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    embeddings_unigrams = [token.vector for token in doc.tokens]
    
    bigrams = list(zip(doc.tokens, doc.tokens[1:]))
    embeddings_bigrams = [token.vector + next_token.vector for token, next_token in bigrams]

    return embeddings_unigrams, embeddings_bigrams

# Загрузите тексты Достоевского и создайте эмбеддинги
with open('dostoevsky.txt', 'r', encoding='utf-8') as file:
    dostoevsky_text = file.read()

embeddings_unigrams, embeddings_bigrams = get_embeddings(dostoevsky_text, morph_tagger)

AttributeError: 'DocToken' object has no attribute 'vector'