<a href="https://colab.research.google.com/github/davydantoniuk/grammarfix-bot/blob/main/model_gramma_bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder


df =pd.read_excel('/results.xlsx')

In [6]:
df.dropna(inplace=True)

# **Tokenization and prepering data**




In [7]:
import torch
from transformers import RobertaTokenizer


tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize_text(text):
    tokens = tokenizer(text, padding='max_length', max_length=50, truncation=True, return_tensors='pt')
    return tokens.input_ids, tokens.attention_mask

input_tokens = tokenize_text(df['Altered'].tolist())
target_tokens = tokenize_text(df['Original'].tolist())

print("Входные токены с ошибками (Altered):", input_tokens[1])
print("Целевые токены без ошибок (Original):", target_tokens[1])




Входные токены с ошибками (Altered): tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
Целевые токены без ошибок (Original): tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])


# Creating a model

In [8]:
import torch.nn as nn

class Seq2SeqModel(nn.Module):
  def __init__(self,vocab_size,embedding_dim=256,hidden_dim=512,num_layers=2):
    super(Seq2SeqModel,self).__init__()
    self.embedding = nn.Embedding(vocab_size , embedding_dim)
    self.encoder =nn.LSTM(embedding_dim,hidden_dim , num_layers,batch_first = True)
    self.decoder = nn.LSTM(embedding_dim,hidden_dim , num_layers,batch_first = True)
    self.fc = nn.Linear(hidden_dim,vocab_size)
  def forward(self,input_ids,target_ids,attention_mask=None):
    embedded = self.embedding(input_ids)
    encoder_outputs , (hidden,cell) = self.encoder(embedded)
    target_embedded = self.embedding(target_ids)
    decoder_outputs, _ = self.decoder(target_embedded,(hidden,cell))
    output = self.fc(decoder_outputs)
    return output

# Preparing data for learning

In [15]:
import torch
from torch.utils.data import DataLoader,Dataset
from transformers import RobertaTokenizer

class TextDataset(Dataset):
  def __init__(self,df,tokinizer , max_length = 50):
    self.df = df
    self.tokenizer = tokinizer
    self.max_length = max_length

  def __len__(self):
    return len(self.df)

  def __getitem__(self,idx):
    input_text = self.df.iloc[idx]['Original']
    target_text = self.df.iloc[idx]['Altered']

    input_encoded = self.tokenizer(
        input_text,
        padding='max_length',
        max_length = self.max_length,
        truncation=True,
        return_tensors = 'pt'
    )
    target_encoded = self.tokenizer(
        target_text,
        padding = 'max_length',
        max_length = self.max_length,
        truncation = True,
        return_tensors = 'pt'
    )

    input_ids = input_encoded['input_ids'].squeeze()
    target_ids = target_encoded['input_ids'].squeeze()

    return input_ids,target_ids

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
dataset = TextDataset(df,tokenizer)
data_loader =DataLoader(dataset,batch_size=32,shuffle=True)

#Trying to extract one batch

for input_ids,target_ids in data_loader:
  print('Input IDs:',input_ids)
  print('Target IDs:',target_ids)
  break

Input IDs: tensor([[    0,   970,    16,  ...,     1,     1,     1],
        [    0,   243,  1382,  ...,     1,     1,     1],
        [    0,    17,    48,  ...,     1,     1,     1],
        ...,
        [    0,  3908,    10,  ...,     1,     1,     1],
        [    0,  1106,    24,  ...,     1,     1,     1],
        [    0,  2387, 14706,  ...,     1,     1,     1]])
Target IDs: tensor([[   0,  970,   16,  ...,    1,    1,    1],
        [   0,  243, 1382,  ...,    1,    1,    1],
        [   0,   17,   48,  ...,    1,    1,    1],
        ...,
        [   0, 3908,   10,  ...,    1,    1,    1],
        [   0, 1106,   24,  ...,    1,    1,    1],
        [   0, 2387,  910,  ...,    1,    1,    1]])


In [16]:
len(input_ids)

32