In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import label_binarize
from transformers import BertTokenizer, BertForMaskedLM, pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
from nltk.corpus import stopwords
import re
from imblearn.over_sampling import SMOTE
import random
import torch
from transformers import MarianMTModel, MarianTokenizer
from concurrent.futures import ThreadPoolExecutor
import sentencepiece


In [3]:
# Load the raw dataset, keep only the first row for each distinct
# 'uu_usl_name', and drop the 'SERVICE_CLASS_CONFIRMED' column.
# Built as a single chain (no in-place mutation) so that re-running the cell
# always yields the same frame.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError
# for '.dataset.csv' — confirm the intended path.
data = (
    pd.read_csv('.dataset.csv')
    .drop_duplicates(subset='uu_usl_name', keep='first')
    .drop('SERVICE_CLASS_CONFIRMED', axis=1)
)
data.tail()

FileNotFoundError: [Errno 2] No such file or directory: '.dataset.csv'

In [4]:

# Checkpoint identifiers for the two translation directions.
model_name_ru_to_en = 'Helsinki-NLP/opus-mt-ru-en'
model_name_en_to_ru = 'Helsinki-NLP/opus-mt-en-ru'

# Russian -> English tokenizer and model.
tokenizer_ru_to_en, model_ru_to_en = (
    MarianTokenizer.from_pretrained(model_name_ru_to_en),
    MarianMTModel.from_pretrained(model_name_ru_to_en),
)

# English -> Russian tokenizer and model.
tokenizer_en_to_ru, model_en_to_ru = (
    MarianTokenizer.from_pretrained(model_name_en_to_ru),
    MarianMTModel.from_pretrained(model_name_en_to_ru),
)

# Translate one text with a seq2seq model/tokenizer pair
def translate(text, model, tokenizer, target_language="en"):
    """Translate a single string with the given model and tokenizer.

    Args:
        text: The string to translate.
        model: Object exposing ``generate(**encoded_batch)``.
        tokenizer: Callable tokenizer that also exposes ``decode``.
        target_language: Accepted for backward compatibility; it is not used,
            the translation direction is determined by the model passed in.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    # Calling the tokenizer directly replaces the deprecated
    # ``prepare_seq2seq_batch`` helper; padding/truncation are requested
    # explicitly to keep the batch encoding equivalent.
    encoded_batch = tokenizer([text], return_tensors='pt', padding=True, truncation=True)
    generated = model.generate(**encoded_batch)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Back-translation: Russian -> English -> Russian
def back_translate(text, source_language="ru", target_language="en"):
    """Round-trip ``text`` through the ru->en and en->ru models.

    Args:
        text: The text to paraphrase.
        source_language: Forwarded as ``target_language`` of the return leg.
        target_language: Forwarded as ``target_language`` of the outbound leg.

    Returns:
        The result of translating ``text`` with the ru->en pair and then
        translating that result with the en->ru pair.
    """
    # Outbound leg: Russian to English.
    english_version = translate(text, model_ru_to_en, tokenizer_ru_to_en, target_language)
    # Return leg: English back to Russian.
    return translate(english_version, model_en_to_ru, tokenizer_en_to_ru, source_language)

# Translation cache: original text -> back-translated text
translation_cache = {}

# Augmentation via back-translation, memoised in translation_cache
def augment_text(text):
    """Return the back-translation of ``text``, computing it at most once.

    The first call for a given text runs ``back_translate`` and stores the
    result in ``translation_cache``; later calls return the stored value.
    """
    try:
        # Already translated earlier: reuse the stored result.
        return translation_cache[text]
    except KeyError:
        pass

    # Not cached yet: translate, remember, return.
    paraphrase = back_translate(text)
    translation_cache[text] = paraphrase
    return paraphrase

# Selective, multi-threaded augmentation
def selective_augmentation(texts, augmentation_probability=0.1, num_threads=10):
    """Replace a random subset of ``texts`` with their augmented versions.

    Each text is independently selected with probability
    ``augmentation_probability`` (one ``random.random()`` draw per text, in
    input order). Selected texts are passed to ``augment_text`` on a thread
    pool; the others are kept unchanged.

    Args:
        texts: Iterable of input strings.
        augmentation_probability: Chance that a given text is augmented.
        num_threads: Maximum number of worker threads.

    Returns:
        A list of the same length as ``texts`` in which position ``i`` holds
        either ``texts[i]`` or its augmented version. Input order is
        preserved (previously untouched texts were emitted first and augmented
        ones appended at the end, which broke positional alignment).
    """
    results = list(texts)
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Map position -> pending future so each result lands in its own slot.
        pending = {}
        for position, text in enumerate(results):
            if random.random() < augmentation_probability:
                pending[position] = executor.submit(augment_text, text)

        # Wait for every translation job and write it back in place.
        for position, future in pending.items():
            results[position] = future.result()

    return results

# Example usage
# texts_to_augment = ["Пример текста для аугментации", "Еще один пример текста"]
# augmented_texts = selective_augmentation(texts_to_augment)


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


source.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [11]:
# Select the first CUDA device when one is available, otherwise the CPU.
# NOTE(review): nothing visible in this notebook moves the models or tensors
# to `device` — confirm whether that step is intended.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cpu


In [10]:
# Smoke test: translate one Russian sentence to English with the ru->en pair.
translate(
    text='Пример текста для аугментации',
    model=model_ru_to_en,
    tokenizer=tokenizer_ru_to_en,
    target_language="en",
)

'Example text for augmentation'

In [5]:
# Number of rows per class, then the labels of classes with fewer than 10 rows.
class_counts = data['CLASS_ID'].value_counts()
minor_classes = class_counts.loc[class_counts.lt(10)].index

NameError: name 'data' is not defined

In [6]:
# Collect augmented rows for every under-represented class
augmented_data = []
for class_id in minor_classes:
    # Texts that belong to the current minority class.
    texts_to_augment = data.loc[data['CLASS_ID'] == class_id, 'uu_usl_name'].tolist()

    # selective_augmentation returns one text per input text; only a random
    # subset of them is actually back-translated.
    # NOTE(review): with the default augmentation_probability most returned
    # texts are verbatim copies of the originals, so the rows appended below
    # largely duplicate existing rows and can end up on both sides of a later
    # train/test split — confirm this is intended.
    augmented_data.extend(
        {'CLASS_ID': class_id, 'uu_usl_name': augmented_text}
        for augmented_text in selective_augmentation(texts_to_augment)
    )

# Build the DataFrame of augmented rows in one go.
new_data = pd.DataFrame(augmented_data)

# Combine the original and augmented rows. ignore_index=True gives the result
# a fresh 0..n-1 index; without it the appended rows reuse labels that already
# exist in `data`, leaving duplicate index labels in `final_data`.
final_data = pd.concat([data, new_data], ignore_index=True)

NameError: name 'minor_classes' is not defined

In [None]:
# Rows per class after augmentation; the smallest of those counts is displayed.
class_counts = final_data.loc[:, 'CLASS_ID'].value_counts()
minor_classes_count = class_counts.min()
minor_classes_count

In [None]:
# Fetch the NLTK stopword corpus and keep the Russian list as a set for
# fast membership checks.
nltk.download('stopwords')
stop_words = {*stopwords.words('russian')}

def preprocess_text(text):
    """Normalise a text for vectorisation.

    Lower-cases the text, replaces every character that is not a Cyrillic or
    Latin letter or a digit with a space, removes tokens found in
    ``stop_words`` and joins the remaining tokens with single spaces.

    Args:
        text: The raw text. ``None`` and NaN are treated as empty text; other
            non-string values are converted with ``str``.

    Returns:
        The cleaned, space-separated string (possibly empty).
    """
    # Missing values (None / float NaN, where NaN != NaN) become empty text
    # instead of raising AttributeError on ``.lower()``.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    lowered = str(text).lower()
    # 'ё' lies outside the а-я range, so it must be listed explicitly;
    # otherwise words containing it are split in two. Upper-case ranges are
    # unnecessary because the text is already lower-cased.
    cleaned = re.sub(r"[^а-яёa-z0-9]", " ", lowered)
    # Drop stopwords and collapse whitespace.
    return " ".join(word for word in cleaned.split() if word not in stop_words)
# Clean the text column ('uu_usl_name') element by element.
final_data['uu_usl_name'] = final_data['uu_usl_name'].map(preprocess_text)


In [None]:
# Features (text) and target (class label).
X = final_data['uu_usl_name']
y = final_data["CLASS_ID"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess the texts of each split.
X_train_preprocessed = [preprocess_text(text) for text in X_train]
X_test_preprocessed = [preprocess_text(text) for text in X_test]

# TF-IDF vectorisation: fit on the training split only, reuse for the test split.
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vectors = vectorizer.fit_transform(X_train_preprocessed)
X_test_vectors = vectorizer.transform(X_test_preprocessed)

# SMOTE is fitted on the training split, so k_neighbors has to be bounded by
# the rarest class *in y_train* (at most count - 1 neighbours exist), not by
# the class counts of the whole data set.
min_train_class_count = int(y_train.value_counts().min())

if min_train_class_count >= 2:
    smote = SMOTE(
        sampling_strategy='auto',
        k_neighbors=min(min_train_class_count - 1, 5),
        random_state=42,
    )
    # Balance the classes of the training split.
    X_resampled, y_resampled = smote.fit_resample(X_train_vectors, y_train)
else:
    # A class with a single training sample has no neighbour to interpolate
    # with, so SMOTE cannot run; continue with the unbalanced training split.
    print("SMOTE skipped: a class has fewer than 2 samples in the training split.")
    X_resampled, y_resampled = X_train_vectors, y_train





In [None]:
# Train the model on the resampled training vectors.
classifier = RandomForestClassifier(random_state=42)
# The resampling cell defines `X_resampled`; `X_resampled_vectors` does not
# exist and raised NameError.
classifier.fit(X_resampled, y_resampled)
# Predict on the test vectors produced from the preprocessed test texts, so
# the test split goes through the same pipeline as the training split.
y_pred = classifier.predict(X_test_vectors)

print(classification_report(y_test, y_pred))

# # Обучение модели
# classifier = RandomForestClassifier(random_state=42, oob_score=True)
# # Обучение модели
# classifier.fit(X_train, y_train)

# # Оценка качества модели с использованием out-of-bag score
# oob_score = classifier.oob_score_


In [None]:
# Confusion matrix of the test predictions, drawn as an annotated heatmap.
# seaborn (`sns`) is never imported in this notebook, so the heatmap is built
# with matplotlib and numpy, which are already imported.
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(10, 10))
heatmap = ax.imshow(cm, cmap="viridis")
fig.colorbar(heatmap, ax=ax)

# Write the integer count into every cell; use a light font on dark cells and
# a dark font on bright cells so the numbers stay readable.
half_scale = cm.max() / 2
for (row, col), count in np.ndenumerate(cm):
    ax.text(col, row, f"{count:d}", ha="center", va="center",
            color="white" if count < half_scale else "black")

ax.set(xlabel="Predicted class (index)", ylabel="True class (index)", title="Confusion matrix")
plt.show()
# y_pred = classifier.predict(vectorizer.transform(X_test))
# print(f"Out-of-Bag Score (оценка модели): {oob_score}")