In [1]:
import numpy as np
import pandas as pd
import spacy
import re
from collections import Counter
import string
from sklearn .model_selection import train_test_split
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
train_file = "train.csv"
data = pd.read_csv(train_file)
print(data.shape)
data.head()

(2834, 6)


Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [None]:
tok = spacy.load('en_core_web_sm')
# python -m spacy download ru_core_news_sm
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

In [4]:
# Count number of occurences of each word
counts = Counter()
for text in list(data['excerpt']):
    counts.update(tokenize(text))

In [5]:
print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))

num_words before: 26245
num_words after: 15651


In [6]:
# Creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [7]:
def encode_sentence(text, vocab2index, N=200):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return [encoded, length]

In [8]:
data['encoded'] = data['excerpt'].apply(lambda x: encode_sentence(x, vocab2index))
data.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,encoded
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009,"[[2, 3, 4, 5, 6, 7, 3, 1, 8, 9, 10, 11, 12, 13..."
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,"[[104, 105, 106, 107, 8, 108, 8, 109, 20, 110,..."
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676,"[[175, 194, 45, 195, 8, 3, 26, 196, 175, 197, ..."
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007,"[[37, 267, 268, 3, 269, 11, 212, 270, 20, 271,..."
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845,"[[343, 344, 11, 107, 205, 48, 345, 346, 347, 3..."


In [9]:
class CommonLitReadabiltyDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y

    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        # Проверяем, что X[idx] - это np.ndarray
        x_0 = torch.tensor(self.X[idx][0], dtype=torch.int32)  # Преобразуем в тензор, dtype int32
        y = torch.tensor(self.y[idx], dtype=torch.float32)  # Преобразуем метки в float32
        x_1 = torch.tensor(self.X[idx][1], dtype=torch.float32)  # Если x_1 тоже требуется

        return x_0, y, x_1

In [10]:
X = list(data['encoded'])
y = list(data['target'])

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3)

train_ds = CommonLitReadabiltyDataset(X_train, y_train)
valid_ds = CommonLitReadabiltyDataset(X_valid, y_valid)

# fitting

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "C:\Users\User\AppData\Roaming\Python\Python310\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\User\AppData\Roaming\Python\Python310\site-packages\traitlets\config\application.py

In [12]:
batch_size = 32
vocab_size = len(words)
embedding_dim = 300
hidden_dim = 200
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

In [13]:
import torch
import torch.nn.functional as F

# Определение устройства (GPU или CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_model_regr(model, epochs=10, lr=0.001):
    # Перемещаем модель на устройство
    model.to(device)
    
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            # Перенос данных на устройство
            x = x.to(device).long()
            y = y.to(device).float()
            l = l.to(device)  # Если l используется в модели, тоже переносим на device
            
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.mse_loss(y_pred, y.unsqueeze(-1))
            loss.backward()
            optimizer.step()
            
            sum_loss += loss.item() * y.shape[0]
            total += y.shape[0]
        
        val_loss = validation_metrics_regr(model, val_dl)
        print("train mse %.3f val rmse %.3f" % (sum_loss / total, val_loss))

def validation_metrics_regr(model, valid_dl):
    # Переносим модель в режим оценки (eval)
    model.eval()
    sum_loss = 0.0
    total = 0
    for x, y, l in valid_dl:
        # Переносим данные на устройство
        x = x.to(device).long()
        y = y.to(device).float()
        l = l.to(device)  # Если l используется в модели, тоже переносим на device
        
        y_hat = model(x, l)
        loss = torch.sqrt(F.mse_loss(y_hat, y.unsqueeze(-1)))

        total += y.shape[0]
        sum_loss += loss.item() * y.shape[0]
    
    return sum_loss / total


In [14]:
class LSTM_regr(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.4)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [15]:
model = LSTM_regr(vocab_size, embedding_dim, hidden_dim).to(device)

In [16]:
train_model_regr(model, epochs=30, lr=0.005)

train mse 1.168 val rmse 1.061
train mse 0.920 val rmse 0.988
train mse 0.714 val rmse 0.907
train mse 0.479 val rmse 0.900
train mse 0.397 val rmse 0.906
train mse 0.307 val rmse 0.901
train mse 0.261 val rmse 0.896
train mse 0.223 val rmse 0.933
train mse 0.177 val rmse 0.918
train mse 0.171 val rmse 0.886
train mse 0.150 val rmse 0.906
train mse 0.132 val rmse 0.884
train mse 0.125 val rmse 0.910
train mse 0.125 val rmse 0.889
train mse 0.103 val rmse 0.900
train mse 0.096 val rmse 0.887
train mse 0.090 val rmse 0.887
train mse 0.081 val rmse 0.876
train mse 0.077 val rmse 0.888
train mse 0.077 val rmse 0.891
train mse 0.068 val rmse 0.880
train mse 0.065 val rmse 0.866
train mse 0.062 val rmse 0.884
train mse 0.068 val rmse 0.883
train mse 0.062 val rmse 0.883
train mse 0.057 val rmse 0.890
train mse 0.058 val rmse 0.887
train mse 0.053 val rmse 0.884
train mse 0.054 val rmse 0.883
train mse 0.053 val rmse 0.896


# inferring

In [17]:
checkpoint_path = "./LSTM_regr_model_with_glove.pth"
torch.save(model, checkpoint_path)

In [36]:
model = torch.load(checkpoint_path)
model.eval()

# Read the test excerpts
test_data = pd.read_csv("test.csv")

# Apply the same encoding as the train texts
test_data['encoded'] = test_data['excerpt'].apply(lambda x: encode_sentence(x, vocab2index))
idx, excerpts_test = test_data['id'], test_data['encoded']

X_test = [excerpts_test[i][0] for i in range(len(test_data))]
l_test = torch.Tensor([excerpts_test[i][1] for i in range(len(test_data))]).to(device)
X_test = torch.LongTensor(X_test).to(device)

In [37]:
# Apply the trained model
with torch.no_grad():
    y_hat = model(X_test, l_test)
test_target = y_hat.reshape(-1).tolist()

my_submission = pd.DataFrame({'id': idx, 'target': test_target})
my_submission.to_csv('submission.csv', index=False)
my_submission.head()

Unnamed: 0,id,target
0,c0f722661,-0.571153
1,f0953f0a5,-0.452467
2,0df072751,-1.363216
3,04caf4e0c,-2.289992
4,0e63f8bea,-1.825588
