## LSTM

In [1]:
import pandas as pd
import numpy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
import statistics

In [2]:
df = pd.read_excel('CRA_train_1200.xlsx')
#df = df.head(100) # для быстрого теста работы кода
df

Unnamed: 0,Id,pr_txt,Категория,Уровень рейтинга
0,1.0,Повышение кредитного рейтинга Акционерного об...,A,A
1,2.0,«Эксперт РА» подтвердил кредитный рейтинг комп...,BB,BB
2,3.0,"НКР повысило кредитный рейтинг ООО ""ОТЭКО-Порт...",A,A
3,4.0,«Эксперт РА» присвоил кредитный рейтинг ПАО «Ф...,AAA,AAA
4,5.0,29 марта 2023 г. Ведущий рейтинговый аналитик ...,BBB,BBB
...,...,...,...,...
1227,,ПОВЫШЕН КРЕДИТНЫЙ РЕЙТИНГ\n\nОбщество с ограни...,B,B
1228,,"Компания ""ООО Мосрегионлифт"" Получила Первый К...",B,B
1229,,«Эксперт РА» поддержал рейтинг «ЧТПЗ» на ровне...,А,А+
1230,,"ООО ""Мосрегионлифт"" Первой Раз Оценено Кредитн...",B,B


In [3]:
df['pr_txt'] = df['pr_txt'].str.lower()

In [4]:
# перекодируем категории
label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['Уровень рейтинга'])

# разделим данные на обучающие и тестовые
X_train, X_test, y_train, y_test = train_test_split(df['pr_txt'], df['category_encoded'], test_size=0.2, random_state=3)

In [5]:
# конвертируем текстовые данные в tf-idf вектора для будущей обработки моделью
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# посмотрим матрицы
print("X_train_tfidf shape:", X_train_tfidf.shape)
print("X_test_tfidf shape:", X_test_tfidf.shape)

X_train_tfidf shape: (985, 5000)
X_test_tfidf shape: (247, 5000)


In [6]:
# зададим архитектуру модели
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, dropout):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.lstm = nn.LSTM(input_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        # Reshape x to (batch_size, sequence_length, input_dim)
        x = x.view(x.size(0), -1, x.size(1))

        h0 = torch.zeros(self.n_layers, x.size(0), self.hidden_dim).to(device)
        c0 = torch.zeros(self.n_layers, x.size(0), self.hidden_dim).to(device)

        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        out = self.softmax(out)
        return out

# зададим гиперпараметры
input_dim = X_train_tfidf.shape[1]
hidden_dim = 128
output_dim = len(df['Уровень рейтинга'].unique())
n_layers = 2
dropout = 0.2

# зададим нагрузку на устройство
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(input_dim, hidden_dim, output_dim, n_layers, dropout).to(device)

# просмотр архитектуры
print(model)

LSTMClassifier(
  (lstm): LSTM(5000, 128, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=128, out_features=18, bias=True)
  (softmax): LogSoftmax(dim=1)
)


In [7]:
# пропишем функцию потерь и оптимизатор
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# конвертируем данные в PyTorch tensors
X_train_tfidf = torch.FloatTensor(X_train_tfidf.toarray()).to(device)
y_train = torch.LongTensor(numpy.array(y_train)).to(device)

criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# тренируем модель
num_epochs = 1000
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    outputs = model(X_train_tfidf)

    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/1000], Loss: 2.8821
Epoch [2/1000], Loss: 2.8789
Epoch [3/1000], Loss: 2.8757
Epoch [4/1000], Loss: 2.8725
Epoch [5/1000], Loss: 2.8692
Epoch [6/1000], Loss: 2.8658
Epoch [7/1000], Loss: 2.8620
Epoch [8/1000], Loss: 2.8583
Epoch [9/1000], Loss: 2.8540
Epoch [10/1000], Loss: 2.8493
Epoch [11/1000], Loss: 2.8442
Epoch [12/1000], Loss: 2.8385
Epoch [13/1000], Loss: 2.8322
Epoch [14/1000], Loss: 2.8254
Epoch [15/1000], Loss: 2.8176
Epoch [16/1000], Loss: 2.8090
Epoch [17/1000], Loss: 2.7994
Epoch [18/1000], Loss: 2.7883
Epoch [19/1000], Loss: 2.7771
Epoch [20/1000], Loss: 2.7641
Epoch [21/1000], Loss: 2.7505
Epoch [22/1000], Loss: 2.7362
Epoch [23/1000], Loss: 2.7193
Epoch [24/1000], Loss: 2.7038
Epoch [25/1000], Loss: 2.6870
Epoch [26/1000], Loss: 2.6716
Epoch [27/1000], Loss: 2.6565
Epoch [28/1000], Loss: 2.6428
Epoch [29/1000], Loss: 2.6320
Epoch [30/1000], Loss: 2.6216
Epoch [31/1000], Loss: 2.6147
Epoch [32/1000], Loss: 2.6106
Epoch [33/1000], Loss: 2.6046
Epoch [34/1000], Lo

Epoch [269/1000], Loss: 0.0156
Epoch [270/1000], Loss: 0.0158
Epoch [271/1000], Loss: 0.0163
Epoch [272/1000], Loss: 0.0150
Epoch [273/1000], Loss: 0.0161
Epoch [274/1000], Loss: 0.0151
Epoch [275/1000], Loss: 0.0152
Epoch [276/1000], Loss: 0.0137
Epoch [277/1000], Loss: 0.0150
Epoch [278/1000], Loss: 0.0150
Epoch [279/1000], Loss: 0.0140
Epoch [280/1000], Loss: 0.0148
Epoch [281/1000], Loss: 0.0140
Epoch [282/1000], Loss: 0.0143
Epoch [283/1000], Loss: 0.0137
Epoch [284/1000], Loss: 0.0146
Epoch [285/1000], Loss: 0.0141
Epoch [286/1000], Loss: 0.0129
Epoch [287/1000], Loss: 0.0123
Epoch [288/1000], Loss: 0.0126
Epoch [289/1000], Loss: 0.0132
Epoch [290/1000], Loss: 0.0132
Epoch [291/1000], Loss: 0.0129
Epoch [292/1000], Loss: 0.0132
Epoch [293/1000], Loss: 0.0118
Epoch [294/1000], Loss: 0.0135
Epoch [295/1000], Loss: 0.0119
Epoch [296/1000], Loss: 0.0118
Epoch [297/1000], Loss: 0.0120
Epoch [298/1000], Loss: 0.0123
Epoch [299/1000], Loss: 0.0111
Epoch [300/1000], Loss: 0.0117
Epoch [3

Epoch [535/1000], Loss: 0.0026
Epoch [536/1000], Loss: 0.0030
Epoch [537/1000], Loss: 0.0037
Epoch [538/1000], Loss: 0.0025
Epoch [539/1000], Loss: 0.0027
Epoch [540/1000], Loss: 0.0026
Epoch [541/1000], Loss: 0.0029
Epoch [542/1000], Loss: 0.0026
Epoch [543/1000], Loss: 0.0025
Epoch [544/1000], Loss: 0.0026
Epoch [545/1000], Loss: 0.0030
Epoch [546/1000], Loss: 0.0026
Epoch [547/1000], Loss: 0.0027
Epoch [548/1000], Loss: 0.0030
Epoch [549/1000], Loss: 0.0026
Epoch [550/1000], Loss: 0.0024
Epoch [551/1000], Loss: 0.0026
Epoch [552/1000], Loss: 0.0026
Epoch [553/1000], Loss: 0.0024
Epoch [554/1000], Loss: 0.0024
Epoch [555/1000], Loss: 0.0027
Epoch [556/1000], Loss: 0.0028
Epoch [557/1000], Loss: 0.0030
Epoch [558/1000], Loss: 0.0027
Epoch [559/1000], Loss: 0.0028
Epoch [560/1000], Loss: 0.0024
Epoch [561/1000], Loss: 0.0024
Epoch [562/1000], Loss: 0.0025
Epoch [563/1000], Loss: 0.0024
Epoch [564/1000], Loss: 0.0025
Epoch [565/1000], Loss: 0.0026
Epoch [566/1000], Loss: 0.0027
Epoch [5

Epoch [801/1000], Loss: 0.0011
Epoch [802/1000], Loss: 0.0012
Epoch [803/1000], Loss: 0.0011
Epoch [804/1000], Loss: 0.0013
Epoch [805/1000], Loss: 0.0010
Epoch [806/1000], Loss: 0.0011
Epoch [807/1000], Loss: 0.0011
Epoch [808/1000], Loss: 0.0015
Epoch [809/1000], Loss: 0.0011
Epoch [810/1000], Loss: 0.0010
Epoch [811/1000], Loss: 0.0011
Epoch [812/1000], Loss: 0.0012
Epoch [813/1000], Loss: 0.0012
Epoch [814/1000], Loss: 0.0013
Epoch [815/1000], Loss: 0.0010
Epoch [816/1000], Loss: 0.0012
Epoch [817/1000], Loss: 0.0010
Epoch [818/1000], Loss: 0.0011
Epoch [819/1000], Loss: 0.0010
Epoch [820/1000], Loss: 0.0011
Epoch [821/1000], Loss: 0.0011
Epoch [822/1000], Loss: 0.0010
Epoch [823/1000], Loss: 0.0010
Epoch [824/1000], Loss: 0.0011
Epoch [825/1000], Loss: 0.0011
Epoch [826/1000], Loss: 0.0010
Epoch [827/1000], Loss: 0.0011
Epoch [828/1000], Loss: 0.0010
Epoch [829/1000], Loss: 0.0012
Epoch [830/1000], Loss: 0.0011
Epoch [831/1000], Loss: 0.0010
Epoch [832/1000], Loss: 0.0010
Epoch [8

In [8]:
# конвертируем данные в последовательность, а затем в тензор
y_test = torch.tensor(numpy.array(y_test), dtype=torch.long).to(device)
X_test_tfidf_dense = torch.FloatTensor(X_test_tfidf.toarray()).to(device)

model.eval()
with torch.no_grad():
    outputs = model(X_test_tfidf_dense)
    _, predicted = torch.max(outputs, 1)

correct = (predicted == y_test).sum().item()
total = y_test.size(0)
accuracy = correct / total * 100
print(f'Test Accuracy: {accuracy:.2f}%')
# оценка точности модели

Test Accuracy: 67.61%


In [9]:
w_f1_score = f1_score(y_test, predicted, average='weighted')
print(f'F1 Score: {w_f1_score:.4f}')

F1 Score: 0.6718


In [10]:
# сохраним модель
torch.save(model, 'model_LSTM.pth')

In [11]:
# та же модель, но для укрупненных категорий
## LSTM (для укрупненных значений)

df['category_encoded1'] = label_encoder.fit_transform(df['Категория'])

# разделим данные на обучающие и тестовые
X_train1, X_test1, y_train1, y_test1 = train_test_split(df['pr_txt'], df['category_encoded1'], test_size=0.2, random_state=3)

# конвертируем текстовые данные в tf-idf вектора для будущей обработки моделью
X_train_tfidf1 = tfidf_vectorizer.fit_transform(X_train1)
X_test_tfidf1 = tfidf_vectorizer.transform(X_test1)

# посмотрим матрицы
print("X_train_tfidf shape:", X_train_tfidf1.shape)
print("X_test_tfidf shape:", X_test_tfidf1.shape)

# зададим гиперпараметры
input_dim = X_train_tfidf1.shape[1]
hidden_dim = 128
output_dim = len(df['Категория'].unique())
n_layers = 2
dropout = 0.2

model1 = LSTMClassifier(input_dim, hidden_dim, output_dim, n_layers, dropout).to(device)

# просмотр архитектуры
print(model1)

# пропишем функцию потерь и оптимизатор
criterion = nn.NLLLoss()
optimizer = optim.Adam(model1.parameters(), lr=0.001)

# конвертируем данные в PyTorch tensors
X_train_tfidf1 = torch.FloatTensor(X_train_tfidf1.toarray()).to(device)
y_train1 = torch.LongTensor(numpy.array(y_train1)).to(device)

criterion = nn.NLLLoss()
optimizer = optim.Adam(model1.parameters(), lr=0.001)

# тренируем модель
num_epochs = 1000
for epoch in range(num_epochs):
    model1.train()
    optimizer.zero_grad()

    outputs = model1(X_train_tfidf)

    loss = criterion(outputs, y_train1)
    loss.backward()
    optimizer.step()

    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# конвертируем данные в последовательность, а затем в тензор
y_test1 = torch.tensor(numpy.array(y_test1), dtype=torch.long).to(device)
X_test_tfidf_dense1 = torch.FloatTensor(X_test_tfidf1.toarray()).to(device)

X_train_tfidf shape: (985, 5000)
X_test_tfidf shape: (247, 5000)
LSTMClassifier(
  (lstm): LSTM(5000, 128, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=128, out_features=8, bias=True)
  (softmax): LogSoftmax(dim=1)
)
Epoch [1/1000], Loss: 2.0783
Epoch [2/1000], Loss: 2.0738
Epoch [3/1000], Loss: 2.0692
Epoch [4/1000], Loss: 2.0646
Epoch [5/1000], Loss: 2.0596
Epoch [6/1000], Loss: 2.0543
Epoch [7/1000], Loss: 2.0486
Epoch [8/1000], Loss: 2.0423
Epoch [9/1000], Loss: 2.0356
Epoch [10/1000], Loss: 2.0281
Epoch [11/1000], Loss: 2.0199
Epoch [12/1000], Loss: 2.0109
Epoch [13/1000], Loss: 2.0005
Epoch [14/1000], Loss: 1.9891
Epoch [15/1000], Loss: 1.9765
Epoch [16/1000], Loss: 1.9623
Epoch [17/1000], Loss: 1.9470
Epoch [18/1000], Loss: 1.9305
Epoch [19/1000], Loss: 1.9116
Epoch [20/1000], Loss: 1.8914
Epoch [21/1000], Loss: 1.8713
Epoch [22/1000], Loss: 1.8502
Epoch [23/1000], Loss: 1.8270
Epoch [24/1000], Loss: 1.8051
Epoch [25/1000], Loss: 1.7829
Epoch [26/1000]

Epoch [261/1000], Loss: 0.0067
Epoch [262/1000], Loss: 0.0067
Epoch [263/1000], Loss: 0.0067
Epoch [264/1000], Loss: 0.0066
Epoch [265/1000], Loss: 0.0068
Epoch [266/1000], Loss: 0.0062
Epoch [267/1000], Loss: 0.0065
Epoch [268/1000], Loss: 0.0064
Epoch [269/1000], Loss: 0.0063
Epoch [270/1000], Loss: 0.0060
Epoch [271/1000], Loss: 0.0062
Epoch [272/1000], Loss: 0.0062
Epoch [273/1000], Loss: 0.0062
Epoch [274/1000], Loss: 0.0061
Epoch [275/1000], Loss: 0.0062
Epoch [276/1000], Loss: 0.0059
Epoch [277/1000], Loss: 0.0056
Epoch [278/1000], Loss: 0.0060
Epoch [279/1000], Loss: 0.0057
Epoch [280/1000], Loss: 0.0055
Epoch [281/1000], Loss: 0.0059
Epoch [282/1000], Loss: 0.0059
Epoch [283/1000], Loss: 0.0056
Epoch [284/1000], Loss: 0.0054
Epoch [285/1000], Loss: 0.0054
Epoch [286/1000], Loss: 0.0055
Epoch [287/1000], Loss: 0.0054
Epoch [288/1000], Loss: 0.0053
Epoch [289/1000], Loss: 0.0052
Epoch [290/1000], Loss: 0.0053
Epoch [291/1000], Loss: 0.0056
Epoch [292/1000], Loss: 0.0053
Epoch [2

Epoch [526/1000], Loss: 0.0012
Epoch [527/1000], Loss: 0.0013
Epoch [528/1000], Loss: 0.0013
Epoch [529/1000], Loss: 0.0015
Epoch [530/1000], Loss: 0.0012
Epoch [531/1000], Loss: 0.0012
Epoch [532/1000], Loss: 0.0012
Epoch [533/1000], Loss: 0.0012
Epoch [534/1000], Loss: 0.0011
Epoch [535/1000], Loss: 0.0011
Epoch [536/1000], Loss: 0.0011
Epoch [537/1000], Loss: 0.0010
Epoch [538/1000], Loss: 0.0010
Epoch [539/1000], Loss: 0.0012
Epoch [540/1000], Loss: 0.0012
Epoch [541/1000], Loss: 0.0011
Epoch [542/1000], Loss: 0.0011
Epoch [543/1000], Loss: 0.0012
Epoch [544/1000], Loss: 0.0011
Epoch [545/1000], Loss: 0.0013
Epoch [546/1000], Loss: 0.0011
Epoch [547/1000], Loss: 0.0011
Epoch [548/1000], Loss: 0.0012
Epoch [549/1000], Loss: 0.0010
Epoch [550/1000], Loss: 0.0010
Epoch [551/1000], Loss: 0.0011
Epoch [552/1000], Loss: 0.0010
Epoch [553/1000], Loss: 0.0010
Epoch [554/1000], Loss: 0.0009
Epoch [555/1000], Loss: 0.0010
Epoch [556/1000], Loss: 0.0011
Epoch [557/1000], Loss: 0.0011
Epoch [5

Epoch [792/1000], Loss: 0.0004
Epoch [793/1000], Loss: 0.0005
Epoch [794/1000], Loss: 0.0005
Epoch [795/1000], Loss: 0.0005
Epoch [796/1000], Loss: 0.0005
Epoch [797/1000], Loss: 0.0005
Epoch [798/1000], Loss: 0.0005
Epoch [799/1000], Loss: 0.0004
Epoch [800/1000], Loss: 0.0005
Epoch [801/1000], Loss: 0.0005
Epoch [802/1000], Loss: 0.0005
Epoch [803/1000], Loss: 0.0004
Epoch [804/1000], Loss: 0.0004
Epoch [805/1000], Loss: 0.0005
Epoch [806/1000], Loss: 0.0005
Epoch [807/1000], Loss: 0.0005
Epoch [808/1000], Loss: 0.0004
Epoch [809/1000], Loss: 0.0004
Epoch [810/1000], Loss: 0.0004
Epoch [811/1000], Loss: 0.0005
Epoch [812/1000], Loss: 0.0005
Epoch [813/1000], Loss: 0.0004
Epoch [814/1000], Loss: 0.0005
Epoch [815/1000], Loss: 0.0004
Epoch [816/1000], Loss: 0.0004
Epoch [817/1000], Loss: 0.0004
Epoch [818/1000], Loss: 0.0004
Epoch [819/1000], Loss: 0.0004
Epoch [820/1000], Loss: 0.0005
Epoch [821/1000], Loss: 0.0005
Epoch [822/1000], Loss: 0.0006
Epoch [823/1000], Loss: 0.0005
Epoch [8

In [12]:
model1.eval()
with torch.no_grad():
    outputs = model1(X_test_tfidf_dense1)
    _, predicted1 = torch.max(outputs, 1)

correct1 = (predicted1 == y_test1).sum().item()
total1 = y_test1.size(0)
accuracy1 = correct1 / total1 * 100
print(f'Test Accuracy: {accuracy1:.2f}%')
# оценка точности модели

w_f1_score1 = f1_score(y_test1, predicted1, average='weighted')
print(f'F1 Score: {w_f1_score1:.4f}')

# сохраним модель
torch.save(model1, 'model_LSTM1.pth')

Test Accuracy: 85.02%
F1 Score: 0.8493


In [13]:
print(f'Final Score: {w_f1_score*0.35 + w_f1_score1*0.65:.3f}')

Final Score: 0.787
