In [1]:
import numpy as np 
import pandas as pd 
import os
import time

In [2]:
import optuna
import optuna.logging

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier 
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [3]:
import nltk 
import spacy

Cоберем отзывы из обучающей и тестовой выборки в датафреймы:

In [4]:
neg_folder = '/kaggle/input/avaliacoes-de-filmes-dataset-imdb/aclImdb/train/neg'
pos_folder = '/kaggle/input/avaliacoes-de-filmes-dataset-imdb/aclImdb/train/pos'

test_neg = '/kaggle/input/avaliacoes-de-filmes-dataset-imdb/aclImdb/test/neg'
test_pos = '/kaggle/input/avaliacoes-de-filmes-dataset-imdb/aclImdb/test/pos'

In [5]:
data = []

# Обработка отрицательных отзывов
for filename in os.listdir(neg_folder):
    if filename.endswith('.txt'):
        id_rating = filename.split('.')[0] 
        id, rating = id_rating.split('_')    
        with open(os.path.join(neg_folder, filename), 'r', encoding='utf-8') as f:
            review = f.read()  # Читаем текст рецензии
        data.append([id, review, int(rating), 0])  # 0 для neg

# Обработка положительных отзывов
for filename in os.listdir(pos_folder):
    if filename.endswith('.txt'):
        id_rating = filename.split('.')[0]  
        id, rating = id_rating.split('_')    
        with open(os.path.join(pos_folder, filename), 'r', encoding='utf-8') as f:
            review = f.read()  # Читаем текст рецензии
        data.append([id, review, int(rating), 1])  # 1 для pos
        

df = pd.DataFrame(data, columns=['id', 'review', 'rating', 'status'])
df.to_csv('train.csv', index=False)

In [6]:
test_data = []

# Обработка отрицательных тестовых отзывов
for filename in os.listdir(test_neg):
    if filename.endswith('.txt'):
        id_rating = filename.split('.')[0]
        id, rating = id_rating.split('_')
        with open(os.path.join(test_neg, filename), 'r', encoding='utf-8') as f:
            review = f.read()  # Читаем текст рецензии
        test_data.append([id, review, int(rating), 0])  # 0 для neg

# Обработка положительных тестовых отзывов
for filename in os.listdir(test_pos):
    if filename.endswith('.txt'):
        id_rating = filename.split('.')[0]
        id, rating = id_rating.split('_')
        with open(os.path.join(test_pos, filename), 'r', encoding='utf-8') as f:
            review = f.read()  # Читаем текст рецензии
        test_data.append([id, review, int(rating), 1])  # 1 для pos

# Создаём DataFrame и сохраняем в CSV
test_df = pd.DataFrame(test_data, columns=['id', 'review', 'rating', 'status'])
test_df.to_csv('test.csv', index=False)

In [7]:
train_df = pd.read_csv('/kaggle/working/train.csv')
train_df.sample(7)

Unnamed: 0,id,review,rating,status
3686,5682,This movie is unbelievably ridiculous. I love ...,1,0
16226,7083,A great addition to anyone's collection.<br />...,10,1
3819,7536,I'd like to start off by saying that I am NOT ...,1,0
12439,10793,I saw this film at the 2005 Toronto Internatio...,4,0
964,4765,"While the original titillates the intellect, t...",2,0
10883,11682,Now isn't it? Considering all the good work do...,1,0
13306,135,It was an excellent piece to the puppet series...,7,1


In [8]:
train_df = train_df.set_index('id')

train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25000 entries, 3606 to 8111
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  25000 non-null  object
 1   rating  25000 non-null  int64 
 2   status  25000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 781.2+ KB


In [9]:
test_df = pd.read_csv('/kaggle/working/test.csv')
test_df = test_df.set_index('id')
test_df.sample(7)

Unnamed: 0_level_0,review,rating,status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11974,... with a 500$ budget and a bottle of ketchup...,2,0
5809,"Years ago, with ""Ray of Light,"" Madonna broke ...",10,1
4962,I am a German student so sorry for eventually ...,4,0
9703,This movie is just about as good as the first ...,7,1
1714,This movie (even calling it a movie is an over...,1,0
1436,STAR RATING: ***** Saturday Night **** Friday ...,9,1
8811,There were so many things wrong with this movi...,1,0


In [10]:
train_df.describe()

Unnamed: 0,rating,status
count,25000.0,25000.0
mean,5.47772,0.5
std,3.466477,0.50001
min,1.0,0.0
25%,2.0,0.0
50%,5.5,0.5
75%,9.0,1.0
max,10.0,1.0


In [11]:
train_df.status.value_counts()

status
0    12500
1    12500
Name: count, dtype: int64

Удалось обработать файлы. Переходим к подготовке данных.

## Подготовка данных для обучения моделей

In [12]:
train_df['review']

id
3606     This film is the worst film, but it ranks very...
1074     I should never have started this film, and sto...
4743     I'm here again in your local shopping mall (of...
7628     Black and White film. Good photography. Believ...
6812     from the start of this movie you soon become a...
                               ...                        
3156     Like the characters in this show, I too was a ...
4019     Being that I am not a fan of Snoop Dogg, as an...
12380    NYC model Alison Parker (Cristina Raines) rent...
6290     Using Buster Keaton in the twilight of his car...
8111     Things to Come is indeed a classic work of spe...
Name: review, Length: 25000, dtype: object

In [13]:
X_train, X_val, y_train, y_val = train_test_split(train_df['review'], train_df['status'], 
                                                  test_size=0.2, random_state=42)

# Преобразование текста в числовые признаки с помощью TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

## LogReg

In [17]:
# optuna.logging.set_verbosity(optuna.logging.ERROR)

def objective(trial):
    C = trial.suggest_float('C', 1e-4, 1e2, log=True)
    solver = trial.suggest_categorical('solver', ['saga', 'lbfgs'])
    penalty = trial.suggest_categorical('penalty', ['l2', 'none'])  
    
    model = LogisticRegression(C=C, solver=solver, penalty=penalty, random_state=42)
    model.fit(X_train_tfidf, y_train)
    
    y_pred = model.predict(X_val_tfidf)
    
    return accuracy_score(y_val, y_pred)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20) 
print(f'Best trial: {study.best_trial.value}')
print(f'Best parameters: {study.best_trial.params}')

best_params = study.best_trial.params

# Обучение модели и замер времени
start_time = time.time()

model = LogisticRegression(**best_params)
model.fit(X_train_tfidf, y_train)

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Время обучения модели: {elapsed_time:.4f} секунд")
# Оценка модели на валидационных данных
y_pred = model.predict(X_val_tfidf)
accuracy_logreg = accuracy_score(y_val, y_pred)
print(f'logreg Accuracy: {accuracy_logreg * 100:.2f}%')

[I 2024-10-12 15:13:38,845] A new study created in memory with name: no-name-2250eb0e-a30f-44eb-b439-af5f72c03847
[I 2024-10-12 15:13:44,263] Trial 0 finished with value: 0.8502 and parameters: {'C': 0.011136217477152753, 'solver': 'saga', 'penalty': 'none'}. Best is trial 0 with value: 0.8502.
[I 2024-10-12 15:13:45,952] Trial 1 finished with value: 0.8346 and parameters: {'C': 0.09609208905204437, 'solver': 'lbfgs', 'penalty': 'none'}. Best is trial 0 with value: 0.8502.
[I 2024-10-12 15:13:46,112] Trial 2 finished with value: 0.8114 and parameters: {'C': 0.01789939540294499, 'solver': 'lbfgs', 'penalty': 'l2'}. Best is trial 0 with value: 0.8502.
[I 2024-10-12 15:13:47,784] Trial 3 finished with value: 0.8346 and parameters: {'C': 3.0175430816946793, 'solver': 'lbfgs', 'penalty': 'none'}. Best is trial 0 with value: 0.8502.
[I 2024-10-12 15:13:49,463] Trial 4 finished with value: 0.8346 and parameters: {'C': 8.341396333233982, 'solver': 'lbfgs', 'penalty': 'none'}. Best is trial 0 w

Best trial: 0.8824
Best parameters: {'C': 0.9207920910663495, 'solver': 'saga', 'penalty': 'l2'}
Время обучения модели: 1.0400 секунд
Final logreg Accuracy: 88.24%


## SGDClassifier

Ранее были найдены лучшие гиперпараметры, явно их задаю при инициализации.

In [14]:
clf = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=0.1, n_jobs=None, random_state=42, learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight=None, warm_start=False, average=False)

start_time = time.time()
clf.fit(X_train_tfidf, y_train)
end_time = time.time()
print(f'Время обучения модели: {end_time - start_time}')

y_pred = clf.predict(X_val_tfidf)

accuracy_sgdc = accuracy_score(y_val, y_pred)
print(f'sgdc Accuracy: {accuracy_sgdc * 100:.2f}%')

Время обучения модели: 0.23019957542419434
sgdc Accuracy: 88.60%


****SGDClassifier показывает лучшую точность на валидационной выборке и меньшее время обучения самой модели.****

Далее пробую нелинейные модели.

## Multinomial Naive Bayes

Наивный Байес — это быстрая и простая модель, часто используемая для текстовой классификации, так как она работает хорошо с дискретными признаками, такими как частоты слов.

In [23]:
optuna.logging.set_verbosity(optuna.logging.ERROR)

def objective(trial):
    alpha = trial.suggest_float('alpha', 1e-4, 1.0, log=True)
    fit_prior = trial.suggest_categorical('fit_prior', [True, False])
    
    model = MultinomialNB(alpha=alpha, fit_prior=fit_prior)
    model.fit(X_train_tfidf, y_train)
    
    y_pred = model.predict(X_val_tfidf)
    
    return accuracy_score(y_val, y_pred)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20) 
print(f'Best trial: {study.best_trial.value}')
print(f'Best parameters: {study.best_trial.params}')

best_params = study.best_trial.params

# Обучение модели и замер времени
start_time = time.time()

model = MultinomialNB(**best_params)
model.fit(X_train_tfidf, y_train)

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Время обучения модели: {elapsed_time:.4f} секунд")

# Оценка модели на валидационных данных
y_pred = model.predict(X_val_tfidf)
accuracy_nb = accuracy_score(y_val, y_pred)
print(f'MultinomialNB Accuracy: {accuracy_nb * 100:.2f}%')

Best trial: 0.8486
Best parameters: {'alpha': 0.3702526824270394, 'fit_prior': False}
Время обучения модели: 0.0226 секунд
MultinomialNB Accuracy: 84.86%


Модель действительно оказалась в 10 раз быстрее, но менее точной на валидационных данных.

## SVC

SVM — это мощная модель для классификации, особенно если данные нелинейно разделимы. Она хорошо работает с высокоразмерными признаковыми пространствами, такими как текстовые данные.

In [None]:
def objective(trial):
    C = trial.suggest_loguniform('C', 1e-2, 1e1)
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf'])
    
    svm_model = SVC(kernel=kernel, C=C, random_state=42)
    svm_model.fit(X_train_tfidf, y_train)
    
    y_pred_svm = svm_model.predict(X_val_tfidf)
    return accuracy_score(y_val, y_pred_svm)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)  # Увеличил до 10, чтобы собрать больше данных, но быстрее

print("Best parameters:", study.best_params)
print("Best accuracy svc:", study.best_value)

[I 2024-10-08 17:18:44,412] A new study created in memory with name: no-name-30b437b4-fb14-49bf-b0f6-94d560a93f5a
[I 2024-10-08 17:27:21,368] Trial 0 finished with value: 0.8498 and parameters: {'C': 0.04190017106812743, 'kernel': 'linear'}. Best is trial 0 with value: 0.8498.
[I 2024-10-08 17:35:13,306] Trial 1 finished with value: 0.8704 and parameters: {'C': 4.335002014165139, 'kernel': 'linear'}. Best is trial 1 with value: 0.8704.
[I 2024-10-08 17:58:05,597] Trial 2 finished with value: 0.885 and parameters: {'C': 3.351868285046361, 'kernel': 'rbf'}. Best is trial 2 with value: 0.885.
[I 2024-10-08 18:08:40,226] Trial 3 finished with value: 0.8132 and parameters: {'C': 0.017838097199152823, 'kernel': 'linear'}. Best is trial 2 with value: 0.885.
[I 2024-10-08 18:14:51,532] Trial 4 finished with value: 0.8844 and parameters: {'C': 0.327478680673575, 'kernel': 'linear'}. Best is trial 2 with value: 0.885.
[I 2024-10-08 18:24:22,632] Trial 5 finished with value: 0.88 and parameters: 

При долгом подборе гиперпараметров была достигнута та же точность, что и у SGDClassifier. Однако SVC обучается дольше, в езультате оптимальнее выбрать для бинарной классификации отзывов модель SGDClassifier.

****Тест SGDClassifier****

In [15]:
from sklearn.metrics import classification_report

In [16]:
X_test_tfidf = tfidf_vectorizer.transform(test_df['review'])

y_test_pred = clf.predict(X_test_tfidf)

In [17]:
# Оценка точности модели на тестовом наборе
y_test_accuracy = accuracy_score(test_df['status'], y_test_pred)
print(f'Test set SGDC Accuracy: {y_test_accuracy * 100:.2f}%')
print(classification_report(test_df['status'], y_test_pred))

Test set SGDC Accuracy: 87.99%
              precision    recall  f1-score   support

           0       0.88      0.87      0.88     12500
           1       0.88      0.89      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



### Сохранение модели и векторизатора

In [28]:
   import joblib

In [29]:
   # Сохранение модели
   joblib.dump(clf, 'sgd_classifier_model.joblib')

   # Сохранение TF-IDF векторизатора
   joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')

['tfidf_vectorizer.joblib']

# Модель предсказания рейтинга (многоклассовая классификация)

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

from catboost import CatBoostClassifier, Pool
from collections import Counter
import numpy as np
import joblib

Здесь весомая проблема дисбаланса классов и отсутствия нейтральных оценок.

In [14]:
train_df.rating.value_counts()

rating
1     5100
10    4732
8     3009
4     2696
7     2496
3     2420
2     2284
9     2263
Name: count, dtype: int64

In [15]:
# Получаем частоты классов
class_counts = dict(Counter(train_df['rating']))

# Вычисляем веса как обратные пропорции частот классов
total_samples = len(train_df)
class_weights = {rating: total_samples / count for rating, count in class_counts.items()}

# Преобразуем в список весов для CatBoost
sorted_weights = [class_weights[i] for i in sorted(class_weights.keys())]

In [20]:
# Обучение модели
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    class_weights=sorted_weights,
    text_features=['review'], 
    eval_metric='MultiClass',
)

model.fit(train_df[['review']], train_df['rating'])

0:	learn: 1.9952692	total: 1.46s	remaining: 12m 9s
1:	learn: 1.9127988	total: 2.84s	remaining: 11m 47s
2:	learn: 1.8561137	total: 4.13s	remaining: 11m 23s
3:	learn: 1.8084333	total: 5.4s	remaining: 11m 9s
4:	learn: 1.7596127	total: 6.68s	remaining: 11m
5:	learn: 1.7217445	total: 7.97s	remaining: 10m 56s
6:	learn: 1.6831153	total: 9.37s	remaining: 10m 59s
7:	learn: 1.6426143	total: 10.8s	remaining: 11m 1s
8:	learn: 1.6118799	total: 12.2s	remaining: 11m 5s
9:	learn: 1.5864808	total: 13.6s	remaining: 11m 5s
10:	learn: 1.5652737	total: 15s	remaining: 11m 7s
11:	learn: 1.5447988	total: 16.4s	remaining: 11m 4s
12:	learn: 1.5259788	total: 17.8s	remaining: 11m 5s
13:	learn: 1.5129765	total: 19.1s	remaining: 11m 2s
14:	learn: 1.5002683	total: 20.4s	remaining: 10m 59s
15:	learn: 1.4911849	total: 21.7s	remaining: 10m 56s
16:	learn: 1.4809674	total: 23.4s	remaining: 11m 4s
17:	learn: 1.4695158	total: 24.7s	remaining: 11m 1s
18:	learn: 1.4576505	total: 26.1s	remaining: 11m 1s
19:	learn: 1.4531958	t

<catboost.core.CatBoostClassifier at 0x79c5e65b5300>

In [21]:
# Предсказание на тестовом наборе
y_pred = model.predict(test_df[['review']]) 

print(classification_report(test_df['rating'], y_pred))

              precision    recall  f1-score   support

           1       0.88      0.01      0.01      5022
           2       0.62      0.00      0.01      2302
           3       0.48      0.01      0.01      2541
           4       0.29      0.00      0.00      2635
           7       0.13      0.52      0.21      2307
           8       0.20      0.25      0.23      2850
           9       0.22      0.29      0.25      2344
          10       0.34      0.62      0.44      4999

    accuracy                           0.23     25000
   macro avg       0.39      0.21      0.14     25000
weighted avg       0.44      0.23      0.16     25000



## Ансамблирование

In [13]:
import torch
torch.cuda.is_available()

True

In [14]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler

# Определяем модели с использованием GPU для CatBoost
catboost_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    eval_metric='MultiClass',
    task_type='GPU',  # Использование GPU
    silent=True
)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
lr_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)

# Создаем pipeline для каждой модели
catboost_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('catboost', catboost_model)
])

rf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('random_forest', rf_model)
])

lr_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logistic_regression', lr_model)
])

# Создаем ансамбль
voting_clf = VotingClassifier(estimators=[
    ('catboost', catboost_pipeline),
    ('random_forest', rf_pipeline),
    ('logistic_regression', lr_pipeline)
], voting='soft')

# Создаем undersampler
undersampler = RandomUnderSampler(random_state=42)

# Применение undersampling к обучающему набору
X_resampled, y_resampled = undersampler.fit_resample(train_df[['review']], train_df['rating'])

# Обучение ансамбля на сбалансированном наборе
voting_clf.fit(X_resampled['review'], y_resampled)

# Предсказание на тестовом наборе
y_pred = voting_clf.predict(test_df['review'])

print(classification_report(test_df['rating'], y_pred))

              precision    recall  f1-score   support

           1       0.64      0.60      0.62      5022
           2       0.22      0.25      0.23      2302
           3       0.24      0.21      0.22      2541
           4       0.31      0.32      0.32      2635
           7       0.30      0.36      0.33      2307
           8       0.27      0.21      0.24      2850
           9       0.23      0.25      0.24      2344
          10       0.55      0.56      0.55      4999

    accuracy                           0.39     25000
   macro avg       0.34      0.35      0.34     25000
weighted avg       0.39      0.39      0.39     25000



In [18]:
import joblib

# Сохраняем ансамбль
joblib.dump(voting_clf, 'voting_classifier_model.joblib')

# Сохраняем TfidfVectorizer отдельно 
# joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer2.joblib') 


['voting_classifier_model.joblib']

In [18]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler

# Определяем модели с использованием GPU для CatBoost
catboost_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    eval_metric='MultiClass',
    task_type='GPU',  # Использование GPU
    silent=True
)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
lr_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)

# Создаем pipeline для каждой модели
catboost_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('catboost', catboost_model)
])

rf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('random_forest', rf_model)
])

lr_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logistic_regression', lr_model)
])

# Создаем ансамбль
voting_clf = VotingClassifier(estimators=[
    ('catboost', catboost_pipeline),
    ('random_forest', rf_pipeline),
    ('logistic_regression', lr_pipeline)
], voting='soft')

# Создаем oversampler
oversampler = RandomOverSampler(random_state=42)

# Применение oversampling к обучающему набору
X_resampled, y_resampled = oversampler.fit_resample(train_df[['review']], train_df['rating'])

# Обучение ансамбля на увеличенном наборе данных
voting_clf.fit(X_resampled['review'], y_resampled)

# Предсказание на тестовом наборе
y_pred = voting_clf.predict(test_df['review'])

# Оценка производительности
print(classification_report(test_df['rating'], y_pred))

              precision    recall  f1-score   support

           1       0.58      0.73      0.64      5022
           2       0.23      0.14      0.17      2302
           3       0.27      0.18      0.22      2541
           4       0.32      0.34      0.33      2635
           7       0.29      0.29      0.29      2307
           8       0.27      0.24      0.26      2850
           9       0.24      0.14      0.17      2344
          10       0.52      0.68      0.59      4999

    accuracy                           0.42     25000
   macro avg       0.34      0.34      0.33     25000
weighted avg       0.38      0.42      0.39     25000



In [16]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# Определяем модели с использованием GPU для CatBoost
catboost_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    eval_metric='MultiClass',
    task_type='GPU',  # Использование GPU
    silent=True
)

# Упрощенная модель RandomForest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Создаем pipeline для каждой модели
catboost_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('catboost', catboost_model)
])

rf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('random_forest', rf_model)
])

# Создаем ансамбль
voting_clf = VotingClassifier(estimators=[
    ('catboost', catboost_pipeline),
    ('random_forest', rf_pipeline)
], voting='soft')

# Обучение ансамбля без oversampling
voting_clf.fit(train_df['review'], train_df['rating'])

# Предсказание на тестовом наборе
y_pred = voting_clf.predict(test_df['review'])

# Оценка производительности
print(classification_report(test_df['rating'], y_pred))

KeyboardInterrupt: 

## Стекинг

In [17]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler

# Определяем модели с использованием GPU для CatBoost
catboost_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    eval_metric='MultiClass',
    task_type='GPU',  # Использование GPU
    silent=True,
    thread_count=1
)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
lr_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)

# Создаем pipeline для каждой модели
catboost_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('catboost', catboost_model)
])

rf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('random_forest', rf_model)
])

lr_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logistic_regression', lr_model)
])

stacking_clf = StackingClassifier(
    estimators=[
        ('catboost', catboost_pipeline),
        ('random_forest', rf_pipeline),
        ('logistic_regression', lr_pipeline)
    ],
    final_estimator=LogisticRegression(),
    cv=5,  # Кросс-валидация для устойчивости
    n_jobs=1  # Ограничиваем параллельные задачи
)

# Создаем undersampler
undersampler = RandomUnderSampler(random_state=42)

# Применение undersampling к обучающему набору
X_resampled, y_resampled = undersampler.fit_resample(train_df[['review']], train_df['rating'])

# Обучение ансамбля на сбалансированном наборе
stacking_clf.fit(X_resampled['review'], y_resampled)

# Предсказание на тестовом наборе
y_pred = stacking_clf.predict(test_df['review'])

# Оценка производительности
print(classification_report(test_df['rating'], y_pred))

              precision    recall  f1-score   support

           1       0.68      0.57      0.62      5022
           2       0.22      0.27      0.25      2302
           3       0.24      0.21      0.22      2541
           4       0.30      0.40      0.34      2635
           7       0.28      0.39      0.33      2307
           8       0.27      0.19      0.23      2850
           9       0.23      0.26      0.24      2344
          10       0.59      0.53      0.56      4999

    accuracy                           0.39     25000
   macro avg       0.35      0.35      0.35     25000
weighted avg       0.41      0.39      0.40     25000



UnderSampler не помог в качественном росте точности минорных классов. Лучше оказался OverSampling в VotingClassifier.

## Sequential

In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler

# Данные
X_train, X_test, y_train, y_test = train_test_split(train_df['review'], train_df['rating'], test_size=0.2, random_state=42)

# Применение TF-IDF векторизации
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

# Создаем oversampler
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_train_tfidf, y_train)

# Преобразуем метки от 1-4 и 7-10 в 0-7
y_resampled = y_resampled.apply(lambda x: x - 1 if x <= 4 else x - 3)
y_test = y_test.apply(lambda x: x - 1 if x <= 4 else x - 3)

# Преобразование данных в тензоры
X_resampled_tensor = torch.tensor(X_resampled, dtype=torch.float32)
y_resampled_tensor = torch.tensor(y_resampled.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_tfidf, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Создание DataLoader для обучения
train_dataset = TensorDataset(X_resampled_tensor, y_resampled_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Параметры модели
input_size = X_resampled_tensor.shape[1]  # Количество признаков TF-IDF
output_size = 8  # Количество классов (оценки от 1 до 10, но без 5 и 6)

# Определение улучшенной модели
class ImprovedReviewClassifier(nn.Module):
    def __init__(self, input_size, output_size):
        super(ImprovedReviewClassifier, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.ReLU(),
            nn.Dropout(0.3),  # Dropout для регуляризации
            nn.Linear(512, 256),
            nn.LeakyReLU(),
            nn.Linear(256, 128),
            nn.LeakyReLU(),
            nn.Linear(128, output_size)
        )
    
    def forward(self, x):
        return self.model(x)

# Инициализация модели
model = ImprovedReviewClassifier(input_size, output_size)

# Определение функции потерь и оптимизатора
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  # L2-регуляризация (weight_decay)

# Тренировка модели
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}")

# Оценка модели
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predictions = torch.max(outputs, 1)

# Вывод отчета по классификации
print(classification_report(y_test_tensor, predictions))

Epoch 1/20, Loss: 1.5386286430396212
Epoch 2/20, Loss: 1.1524381666734027
Epoch 3/20, Loss: 0.7181028331212568
Epoch 4/20, Loss: 0.37502939121363915
Epoch 5/20, Loss: 0.2070683382028703
Epoch 6/20, Loss: 0.12488727609457452
Epoch 7/20, Loss: 0.08813142536824055
Epoch 8/20, Loss: 0.07378789157480906
Epoch 9/20, Loss: 0.06456709115942776
Epoch 10/20, Loss: 0.05243718680522884
Epoch 11/20, Loss: 0.05078161393935608
Epoch 12/20, Loss: 0.04370065226261513
Epoch 13/20, Loss: 0.04167460285399634
Epoch 14/20, Loss: 0.037786185886203305
Epoch 15/20, Loss: 0.0410004809038935
Epoch 16/20, Loss: 0.03723338383469556
Epoch 17/20, Loss: 0.03654667219455097
Epoch 18/20, Loss: 0.03642187906311162
Epoch 19/20, Loss: 0.03424999376072288
Epoch 20/20, Loss: 0.03174746874104748
              precision    recall  f1-score   support

           0       0.54      0.65      0.59      1019
           1       0.22      0.15      0.18       478
           2       0.23      0.24      0.23       472
           3    

Испробовав различные варианты с TfIdfVectorizer приходим к выводу, что это не лучший способ обработки текстов отзывов, ведь даже сложные последовательные модели или стекинги не могут добиться лучшей точности при заметном дисбалансе классов. Каждая модель имеет хорошую точность только на низшей и высшей оценке, а лучше всех с этой задачей предсказания рейтинг справился VotingClassifier: 

In [25]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
import numpy as np

# Определяем модели с использованием GPU для CatBoost
catboost_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    eval_metric='MultiClass',
    task_type='GPU',  # Использование GPU
    silent=True
)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
lr_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)

# Создаем pipeline для каждой модели
catboost_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('catboost', catboost_model)
])

rf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('random_forest', rf_model)
])

lr_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logistic_regression', lr_model)
])

# Определяем, сколько элементов в каждом классе
class_counts = train_df['rating'].value_counts()

# Устанавливаем целевое количество для классов с количеством меньше 5000
target_count = 5000
sampling_strategy = {label: target_count for label, count in class_counts.items() if count < target_count}

# Создаем oversampler с определенной стратегией
oversampler = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)

# Применение oversampling к обучающему набору
X_resampled, y_resampled = oversampler.fit_resample(train_df[['review']], train_df['rating'])

# Обучение ансамбля на увеличенном наборе данных
voting_clf.fit(X_resampled['review'], y_resampled)

# Предсказание на тестовом наборе
y_pred = voting_clf.predict(test_df['review'])

# Оценка производительности
print(classification_report(test_df['rating'], y_pred))

              precision    recall  f1-score   support

           1       0.57      0.74      0.64      5022
           2       0.23      0.13      0.16      2302
           3       0.28      0.18      0.22      2541
           4       0.32      0.34      0.33      2635
           7       0.30      0.30      0.30      2307
           8       0.26      0.23      0.25      2850
           9       0.24      0.14      0.17      2344
          10       0.51      0.68      0.58      4999

    accuracy                           0.42     25000
   macro avg       0.34      0.34      0.33     25000
weighted avg       0.38      0.42      0.39     25000



In [None]:
import joblib

# Сохранение модели голосующего классификатора
joblib.dump(voting_clf, 'voting_classifier_model.joblib')

# Сохранение TF-IDF векторизатора
# joblib.dump(tfidf_vectorizer, 'voting_tfidf_vectorizer.joblib')