In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [None]:
import sqlite3

# Path to the SQLite database (adjust to your environment)
db_path = '/content/drive/MyDrive/sentences.db'

# Open the database, read every row of the `sentences` table, and always
# release the connection afterwards, even if the query fails.
connection = sqlite3.connect(db_path)
try:
    cursor = connection.cursor()
    cursor.execute("SELECT * FROM sentences;")
    sentences = cursor.fetchall()
finally:
    connection.close()

# NOTE(review): `SELECT *` relies on the table having exactly these four
# columns in this order; confirm against the actual schema.
df = pd.DataFrame(sentences, columns=['id', 'Original', 'Altered', 'category'])

In [None]:
# Keep only the sentence-pair columns. Reassigning (instead of inplace=True)
# together with errors='ignore' makes this cell safe to re-run after the
# columns have already been removed.
df = df.drop(columns=['category', 'id'], errors='ignore')

In [None]:
def _count_words(text):
    """Return the number of whitespace-separated tokens in ``str(text)``."""
    return len(str(text).split())


# Length (in words) of every altered sentence
df['word_count'] = df['Altered'].map(_count_words)

# Compute the average number of words (next cell)

In [None]:


# Mean and median sentence length in words, printed space-separated
mean_words = df['word_count'].mean()
median_words = df['word_count'].median()
print(mean_words, median_words)

18.931144826904543 16.0


# Creating a model

In [None]:
import torch
import torch.nn as nn


class Seq2SeqModel(nn.Module):
    """LSTM encoder-decoder that shares one embedding table between both sides.

    The encoder reads the embedded source tokens; its final (hidden, cell)
    state initialises the decoder, whose outputs are projected to vocabulary
    logits by a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table and size of the
            output logits.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of both LSTMs.
        num_layers: Number of stacked layers in both LSTMs.
        dropout: Dropout between stacked LSTM layers. Forced to 0 when
            ``num_layers == 1`` because there is no "between layers" in that
            case and ``nn.LSTM`` would emit a warning.
    """

    def __init__(self, vocab_size, embedding_dim=256, hidden_dim=256, num_layers=2, dropout=0.4):
        super().__init__()
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, num_layers,
                               batch_first=True, dropout=lstm_dropout)
        self.decoder = nn.LSTM(embedding_dim, hidden_dim, num_layers,
                               batch_first=True, dropout=lstm_dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_ids, target_ids, attention_mask=None):
        """Run the decoder over ``target_ids`` conditioned on ``input_ids``.

        The logits at position ``t`` are computed after the decoder has
        consumed ``target_ids[:, t]``. To train next-token prediction the
        caller must therefore pass the decoder inputs shifted one step
        relative to the labels.

        Args:
            input_ids: Source token ids, batch-first.
            target_ids: Decoder input token ids, batch-first.
            attention_mask: Accepted for interface compatibility; not used by
                this implementation.

        Returns:
            Logits with a trailing dimension of ``vocab_size``, one row per
            decoder input position.
        """
        embedded = self.embedding(input_ids)
        _, (hidden, cell) = self.encoder(embedded)
        target_embedded = self.embedding(target_ids)
        decoder_outputs, _ = self.decoder(target_embedded, (hidden, cell))
        return self.fc(decoder_outputs)

    def generate(self, input_ids, max_length=50, start_token_id=None, end_token_id=None, temperature=1.0):
        """Decode one sequence token by token from a single source sentence.

        Args:
            input_ids: Source token ids of shape ``(1, seq_len)``.
            max_length: Maximum number of tokens to generate.
            start_token_id: Token fed to the decoder at the first step
                (required).
            end_token_id: Generation stops once this token is produced; the
                token itself is included in the result.
            temperature: Softmax temperature for sampling. ``None`` or a
                value ``<= 0`` selects greedy (argmax) decoding instead of
                dividing by zero.

        Returns:
            List of generated token ids (Python ints).

        Raises:
            ValueError: If ``start_token_id`` is missing or ``input_ids`` is
                not a single-sentence batch.
        """
        if start_token_id is None:
            raise ValueError("start_token_id is required for generation")
        if input_ids.dim() != 2 or input_ids.size(0) != 1:
            raise ValueError("generate() expects input_ids of shape (1, seq_len)")

        greedy = temperature is None or temperature <= 0
        output_sentence = []

        # Generation never needs gradients; this also keeps the in-place
        # update of `decoder_input` below safe.
        with torch.no_grad():
            embedded = self.embedding(input_ids)
            _, (hidden, cell) = self.encoder(embedded)
            decoder_input = torch.tensor([[start_token_id]], device=input_ids.device)

            for _ in range(max_length):
                decoder_embedded = self.embedding(decoder_input)
                decoder_output, (hidden, cell) = self.decoder(decoder_embedded, (hidden, cell))
                logits = self.fc(decoder_output).squeeze(1)

                if greedy:
                    next_token_id = int(torch.argmax(logits, dim=-1).item())
                else:
                    probabilities = torch.softmax(logits / temperature, dim=-1)
                    next_token_id = torch.multinomial(probabilities, num_samples=1).item()

                output_sentence.append(next_token_id)

                # Loop guard: stop when the last three tokens are identical.
                stuck = (len(output_sentence) > 3
                         and output_sentence[-3:] == [next_token_id] * 3)
                if next_token_id == end_token_id or stuck:
                    break

                # Reuse the same tensor for the next decoder step.
                decoder_input[0, 0] = next_token_id

        return output_sentence


# Preparing data for training

In [None]:
import torch
from torch.utils.data import DataLoader,Dataset
from transformers import RobertaTokenizer

class TextDataset(Dataset):
    """Dataset of (source, target) token-id pairs built from a DataFrame.

    Each item tokenizes the row's ``'Original'`` text as the source and its
    ``'Altered'`` text as the target, padding/truncating both to
    ``max_length``.

    Args:
        df: DataFrame with ``'Original'`` and ``'Altered'`` columns.
        tokinizer: Callable tokenizer; it is called with
            ``padding``, ``max_length``, ``truncation`` and ``return_tensors``
            keyword arguments and must return a mapping with ``'input_ids'``.
            (The parameter name is kept as-is for backward compatibility.)
        max_length: Fixed sequence length of every returned tensor.
    """

    def __init__(self, df, tokinizer, max_length=50):
        self.df = df
        self.tokenizer = tokinizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def _encode(self, text):
        """Tokenize ``text`` and return its ids without the batch axis."""
        encoded = self.tokenizer(
            text,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt',
        )
        # Drop only the leading batch axis. A bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        return encoded['input_ids'].squeeze(0)

    def __getitem__(self, idx):
        """Return ``(input_ids, target_ids)`` for the row at position ``idx``."""
        row = self.df.iloc[idx]
        input_ids = self._encode(row['Original'])
        target_ids = self._encode(row['Altered'])
        return input_ids, target_ids

# Pretrained RoBERTa tokenizer used for both the source and target text
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Wrap the sentence pairs and serve them in shuffled mini-batches of 64
dataset = TextDataset(df, tokenizer)
data_loader = DataLoader(dataset=dataset, shuffle=True, batch_size=64)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



# Training the model

In [None]:
import torch.optim as optim
import torch.nn as nn

import torch  # needed for device detection; this cell otherwise imports only torch.optim / torch.nn

# Vocabulary size determines the embedding table and the output layer
vocab_size = tokenizer.vocab_size

# Prefer the GPU but fall back to CPU so the cell still runs without CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Seq2SeqModel(vocab_size).to(device)

# Optimizer and loss; padding positions do not contribute to the loss
optimizer = optim.Adam(model.parameters(), lr=0.00005)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Number of epochs
num_epochs = 10

# Switch the model to training mode (enables dropout)
model.train()

for epoch in range(num_epochs):
    total_loss = 0
    for input_ids, target_ids in data_loader:
        input_ids = input_ids.to(device)
        target_ids = target_ids.to(device)

        # Teacher forcing with a one-step shift: the decoder reads tokens
        # [0 .. T-2] and is scored on tokens [1 .. T-1]. Feeding and scoring
        # the same unshifted sequence lets the decoder simply copy its input,
        # which drives the loss down without teaching it to generate.
        decoder_inputs = target_ids[:, :-1]
        labels = target_ids[:, 1:]

        optimizer.zero_grad()

        outputs = model(input_ids, decoder_inputs)

        # The sliced labels are not contiguous, so reshape() is used instead of view()
        loss = criterion(outputs.reshape(-1, vocab_size), labels.reshape(-1))

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean batch loss for this epoch
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(data_loader):.4f}")



Epoch 1, Loss: 6.9297
Epoch 2, Loss: 5.2512
Epoch 3, Loss: 4.2629
Epoch 4, Loss: 3.5327
Epoch 5, Loss: 2.9046
Epoch 6, Loss: 2.3461
Epoch 7, Loss: 1.8576
Epoch 8, Loss: 1.4513
Epoch 9, Loss: 1.1316
Epoch 10, Loss: 0.8852
Epoch 11, Loss: 0.6964
Epoch 12, Loss: 0.5514
Epoch 13, Loss: 0.4398
Epoch 14, Loss: 0.3529
Epoch 15, Loss: 0.2851


In [None]:

def correct_text(model, tokenizer, input_text, max_length=50):
    """Generate a corrected version of ``input_text`` with a trained model.

    Args:
        model: Model exposing ``eval()``, ``parameters()`` and
            ``generate(input_ids, max_length=..., start_token_id=...,
            end_token_id=...)``.
        tokenizer: Tokenizer exposing ``encode``, ``decode`` and the
            ``cls_token_id`` / ``bos_token_id`` / ``sep_token_id`` /
            ``eos_token_id`` attributes.
        input_text: Sentence to correct.
        max_length: Maximum number of tokens to generate.

    Returns:
        The decoded text with special tokens removed.

    Raises:
        ValueError: If the tokenizer provides no start or no end token.

    Note:
        The model is left in evaluation mode after the call.
    """
    model.eval()

    # Run on whatever device the model lives on instead of assuming CUDA
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Explicit None checks: token id 0 is a valid id and must not be treated
    # as "missing" the way `a or b` would.
    start_token_id = tokenizer.cls_token_id
    if start_token_id is None:
        start_token_id = tokenizer.bos_token_id
    end_token_id = tokenizer.sep_token_id
    if end_token_id is None:
        end_token_id = tokenizer.eos_token_id

    if start_token_id is None or end_token_id is None:
        raise ValueError("Модель или токенизатор не поддерживают стартовые или конечные токены.")

    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_length=max_length,
            start_token_id=start_token_id,
            end_token_id=end_token_id,
        )

    return tokenizer.decode(output_ids, skip_special_tokens=True)

# Usage example: a sentence containing a deliberate agreement error ("will has")
corrected_text = correct_text(
    model,
    tokenizer,
    input_text="If you are not located in the United States, you will has to check the law of the country where you are located before using this ebook.",
)
print("Исправленный текст:", corrected_text)




Исправленный текст:  interior rivers rivers rivers
