In [1]:
# Imports
import os
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [2]:
# Data loading
def load_bbc_data(data_path='bbc'):
    """Read the labelled news corpus from disk into a DataFrame.

    ``data_path`` must contain one sub-directory per class (``business``,
    ``entertainment``, ``politics``, ``sport``, ``tech``). Every regular file
    inside a class directory is read as one document (decoded as latin-1).

    Args:
        data_path: Root directory of the corpus.

    Returns:
        pandas.DataFrame with columns ``text`` (file contents) and ``label``
        (name of the class directory the file was found in).

    Raises:
        OSError: propagated from ``os.listdir``/``open`` if a class directory
            is missing or a file cannot be read.
    """
    classes = ['business', 'entertainment', 'politics', 'sport', 'tech']
    texts, labels = [], []

    for label in classes:
        class_dir = os.path.join(data_path, label)
        # os.listdir returns entries in arbitrary order; sort them so the row
        # order (and any seeded split made from it) is reproducible.
        for file in sorted(os.listdir(class_dir)):
            file_path = os.path.join(class_dir, file)
            # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which
            # would make open() fail.
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='latin-1') as f:
                texts.append(f.read())
            labels.append(label)

    return pd.DataFrame({'text': texts, 'label': labels})

def load_additional_data(data_path='bbc_additional'):
    """Read the additional evaluation set from disk into a DataFrame.

    ``data_path`` must contain one sub-directory per class (``business``,
    ``entertainment``, ``politics``, ``sport``, ``tech``). Every regular file
    inside a class directory is read as one document (decoded as latin-1).

    Args:
        data_path: Root directory of the additional dataset.

    Returns:
        pandas.DataFrame with columns ``filename`` (base name of the file),
        ``text`` (file contents) and ``true_label`` (name of the class
        directory the file was found in).

    Raises:
        OSError: propagated from ``os.listdir``/``open`` if a class directory
            is missing or a file cannot be read.
    """
    classes = ['business', 'entertainment', 'politics', 'sport', 'tech']
    texts, labels, filenames = [], [], []

    for label in classes:
        class_dir = os.path.join(data_path, label)
        # os.listdir returns entries in arbitrary order; sort them so the
        # resulting table is stable from run to run.
        for file in sorted(os.listdir(class_dir)):
            file_path = os.path.join(class_dir, file)
            # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which
            # would make open() fail.
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='latin-1') as f:
                texts.append(f.read())
            labels.append(label)
            filenames.append(file)
    return pd.DataFrame({'filename': filenames, 'text': texts, 'true_label': labels})

In [3]:
class ClassicNewsClassifier:
    """News classifier built from a TF-IDF vectorizer and logistic regression.

    The two steps are chained in a scikit-learn ``Pipeline`` stored in
    ``self.pipeline``; the pipeline is unfitted until :meth:`train` or
    :meth:`load` is called.
    """

    def __init__(self):
        self.pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=10000,
                stop_words='english',
                ngram_range=(1, 2))),
            # `multi_class` is deliberately not passed: the argument is
            # deprecated in recent scikit-learn releases, and with the lbfgs
            # solver on a multiclass target the default already fits a
            # multinomial model.
            ('clf', LogisticRegression(
                solver='lbfgs',
                max_iter=10000,
                random_state=42))
        ])

    def train(self, data, test_size=0.2):
        """Fit the pipeline on a train split and score it on the hold-out split.

        Args:
            data: Table with a ``text`` column (documents) and a ``label``
                column (class names).
            test_size: Fraction of rows held out for evaluation.

        Returns:
            Tuple ``(accuracy_line, report)`` where ``accuracy_line`` is the
            string ``"Accuracy: <value>"`` formatted to three decimals and
            ``report`` is the text produced by ``classification_report``.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            data['text'], data['label'],
            test_size=test_size,
            random_state=42
        )

        self.pipeline.fit(X_train, y_train)

        # Evaluation on the held-out split
        preds = self.pipeline.predict(X_test)
        return f"Accuracy: {accuracy_score(y_test, preds):.3f}", classification_report(y_test, preds)

    def predict(self, texts):
        """Classify ``texts`` and report the winning probability for each.

        Args:
            texts: Iterable of raw documents accepted by the pipeline.

        Returns:
            List of ``(predicted_label, confidence)`` tuples, one per input,
            where ``confidence`` is the highest class probability.
        """
        probs = self.pipeline.predict_proba(texts)
        # classes_ is aligned with the probability columns, so the arg-max
        # column index selects the predicted label.
        preds = self.pipeline.classes_[probs.argmax(axis=1)]
        return list(zip(preds, probs.max(axis=1)))

    def save(self, path='classic_model.joblib'):
        """Serialize the pipeline to ``path`` with joblib."""
        joblib.dump(self.pipeline, path)

    def load(self, path='classic_model.joblib'):
        """Replace ``self.pipeline`` with the one stored at ``path``.

        NOTE(security): ``joblib.load`` unpickles the file, which can execute
        arbitrary code; only load files from a trusted source.
        """
        self.pipeline = joblib.load(path)

In [4]:
if __name__ == "__main__":
    model_path = 'classic_model.joblib'

    # Train on the main corpus and show the hold-out metrics.
    print("Обучаем новую модель...")
    df = load_bbc_data()
    classifier = ClassicNewsClassifier()
    acc, report = classifier.train(df)
    print(acc)
    print(report)
    # Pass the path explicitly: it was previously assigned but never used, so
    # the save location depended on save()'s default happening to match.
    classifier.save(model_path)

    # Predictions on the additional dataset
    print("\nОбрабатываем bbc_additional...")
    df_additional = load_additional_data()
    # predict() returns (label, confidence) pairs; zip(*) transposes them into
    # one sequence per new column.
    df_additional['predicted_label'], df_additional['confidence'] = zip(*classifier.predict(df_additional['text']))

    print(df_additional[['filename', 'true_label', 'predicted_label', 'confidence']])

    # Share of rows whose predicted label equals the directory-derived label.
    accuracy = (df_additional['true_label'] == df_additional['predicted_label']).mean()
    print(f"\nAccuracy on additional dataset: {accuracy:.3f}")

    df_additional.to_csv('bbc_additional_predictions.csv', index=False)
    print("Предсказания сохранены в 'bbc_additional_predictions.csv'.")

Обучаем новую модель...




Accuracy: 0.975
               precision    recall  f1-score   support

     business       0.97      0.96      0.97       110
entertainment       0.99      0.99      0.99        70
     politics       0.96      0.96      0.96        82
        sport       0.99      1.00      0.99        94
         tech       0.97      0.97      0.97        89

     accuracy                           0.98       445
    macro avg       0.98      0.98      0.98       445
 weighted avg       0.98      0.98      0.98       445


Обрабатываем bbc_additional...
   filename     true_label predicted_label  confidence
0     1.txt       business        business    0.476177
1     2.txt       business        business    0.679378
2     3.txt       business        business    0.510757
3     4.txt       business        business    0.604171
4     5.txt       business        business    0.430783
5     1.txt  entertainment   entertainment    0.548307
6     2.txt  entertainment   entertainment    0.366793
7     3.txt  e

In [9]:
!pip install transformers datasets accelerate>=0.26.0
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124



Looking in indexes: https://download.pytorch.org/whl/cu124




In [10]:
import torch

# Show whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

True


In [12]:
################################################################################################################
import torch
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
)
from datasets import Dataset
import pandas as pd
import os

In [14]:
class BertNewsClassifier:
    """DistilBERT sequence classifier for five news categories.

    Holds the tokenizer, the label <-> id mappings and the model.
    ``self.model`` is ``None`` until :meth:`train` or :meth:`load` is called.
    """

    def __init__(self):
        # Use the GPU when PyTorch can see one, otherwise the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

        self.label2id = {'business': 0, 'entertainment': 1, 'politics': 2, 'sport': 3, 'tech': 4}
        self.id2label = {v: k for k, v in self.label2id.items()}

        self.model = None

    def _require_model(self):
        """Raise a descriptive error when no model has been trained or loaded."""
        if self.model is None:
            raise RuntimeError("Model is not initialised: call train() or load() first.")

    def load_data(self, data_path='bbc'):
        """Load the corpus under ``data_path`` and split it 80/20 (seed 42).

        Returns:
            The ``datasets`` train/test split produced by
            ``Dataset.train_test_split`` (keys ``'train'`` and ``'test'``).
        """
        df = self._load_bbc_data(data_path)
        return Dataset.from_pandas(df).train_test_split(test_size=0.2, seed=42)

    def _load_bbc_data(self, path):
        """Read every regular file under ``path/<class>/`` into a DataFrame.

        Returns:
            pandas.DataFrame with columns ``text`` and ``label``.
        """
        classes = ['business', 'entertainment', 'politics', 'sport', 'tech']
        texts, labels = [], []

        for cls in classes:
            cls_dir = os.path.join(path, cls)
            # os.listdir returns entries in arbitrary order; sort them so the
            # seeded split in load_data() is reproducible.
            for file in sorted(os.listdir(cls_dir)):
                file_path = os.path.join(cls_dir, file)
                # Skip sub-directories (e.g. .ipynb_checkpoints), which would
                # make open() fail.
                if not os.path.isfile(file_path):
                    continue
                with open(file_path, 'r', encoding='latin-1') as f:
                    texts.append(f.read())
                labels.append(cls)

        return pd.DataFrame({'text': texts, 'label': labels})

    def tokenize(self, examples):
        """Tokenize a batch of examples and attach integer labels.

        Args:
            examples: Mapping with a ``text`` list and a ``label`` list (the
                batched format passed by ``Dataset.map(batched=True)``).

        Returns:
            The tokenizer output (every text padded/truncated to 512 tokens)
            with an added ``labels`` entry holding the class ids.
        """
        tokenized = self.tokenizer(
            examples['text'],
            padding='max_length',
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )
        tokenized['labels'] = [self.label2id[label] for label in examples['label']]
        return tokenized

    def train(self, data):
        """Fine-tune DistilBERT on ``data['train']`` and evaluate on ``data['test']``.

        Args:
            data: Train/test split as returned by :meth:`load_data`.

        Side effects:
            Sets ``self.model`` and prints the validation accuracy.
        """
        dataset = data.map(self.tokenize, batched=True)
        dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

        # Model initialisation; the classification head size follows the
        # label mapping instead of a separate hard-coded count.
        self.model = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased',
            num_labels=len(self.label2id),
            id2label=self.id2label,
            label2id=self.label2id
        ).to(self.device)  # move the model to the selected device

        # Training parameters
        common_args = dict(
            output_dir='./results',
            num_train_epochs=10,
            per_device_train_batch_size=16,
            logging_dir='./logs',
            learning_rate=2e-5,
            save_strategy='no'
        )
        try:
            # Newer transformers releases name this argument `eval_strategy`.
            training_args = TrainingArguments(eval_strategy='epoch', **common_args)
        except TypeError:
            # Older releases only know the original `evaluation_strategy`.
            training_args = TrainingArguments(evaluation_strategy='epoch', **common_args)

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=dataset['train'],
            eval_dataset=dataset['test'],
            compute_metrics=lambda p: {
                'accuracy': (p.predictions.argmax(-1) == p.label_ids).mean()
            }
        )

        # Training
        trainer.train()

        # Evaluation
        self.model.eval()
        print(f"Validation Accuracy: {trainer.evaluate()['eval_accuracy']:.3f}")

    def predict(self, texts, batch_size=32):
        """Classify ``texts`` and report the winning probability for each.

        Args:
            texts: A single string or an iterable of strings (list, tuple,
                pandas Series, ...).
            batch_size: Number of texts per forward pass, which bounds memory
                use on large inputs.

        Returns:
            List of ``(label, confidence)`` tuples in input order; empty for
            empty input.

        Raises:
            RuntimeError: if no model has been trained or loaded.
        """
        self._require_model()
        texts = [texts] if isinstance(texts, str) else list(texts)

        self.model.to(self.device)
        # Make sure dropout is disabled regardless of what ran before.
        self.model.eval()

        results = []
        for start in range(0, len(texts), batch_size):
            inputs = self.tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt'
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs)

            probs = torch.nn.functional.softmax(outputs.logits, dim=1)
            preds = probs.argmax(dim=1)
            results.extend(
                (self.id2label[p.item()], probs[i][p].item())
                for i, p in enumerate(preds)
            )
        return results

    def save(self, path='bert_model'):
        """Write the model and tokenizer to the directory ``path``.

        Raises:
            RuntimeError: if no model has been trained or loaded.
        """
        self._require_model()
        self.model.save_pretrained(path)
        self.tokenizer.save_pretrained(path)

    def load(self, path='bert_model'):
        """Restore the model and tokenizer from a directory written by :meth:`save`."""
        self.model = DistilBertForSequenceClassification.from_pretrained(path).to(self.device)
        self.tokenizer = DistilBertTokenizer.from_pretrained(path)

In [15]:

if __name__ == "__main__":
    # Two short headlines used as a smoke test after training.
    sample_texts = [
        "Apple announced a breakthrough in quantum computing",
        "Premier League transfers reached record spending this summer"
    ]

    classifier = BertNewsClassifier()
    data_split = classifier.load_data()

    # Fine-tune on the train split (prints the validation accuracy).
    classifier.train(data_split)

    # Sample predictions
    print("\nPredictions:", classifier.predict(sample_texts))

    # Persist model and tokenizer to the default directory.
    classifier.save()

Map:   0%|          | 0/1776 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.118038,0.982022
2,No log,0.065232,0.986517
3,No log,0.06311,0.986517
4,No log,0.061242,0.986517
5,0.169800,0.061593,0.986517
6,0.169800,0.068858,0.986517
7,0.169800,0.061063,0.988764
8,0.169800,0.068188,0.98427
9,0.169800,0.066623,0.98427
10,0.004800,0.066502,0.98427


Validation Accuracy: 0.984

Predictions: [('tech', 0.9899190664291382), ('sport', 0.9932069182395935)]


In [38]:
import os
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.metrics import accuracy_score


# Directory holding the saved model and tokenizer ("bert_model" is also the
# default target of BertNewsClassifier.save()).
MODEL_PATH = "bert_model"

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Restore the classifier and move it to the selected device.
model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)
model = model.to(device)
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_PATH)


def load_additional_data(data_path='bbc_additional'):
    """Read the additional evaluation set from disk into a DataFrame.

    NOTE: this re-defines (and shadows) the function of the same name from an
    earlier cell of the notebook.

    ``data_path`` must contain one sub-directory per class (``business``,
    ``entertainment``, ``politics``, ``sport``, ``tech``). Every regular file
    inside a class directory is read as one document (decoded as latin-1).

    Args:
        data_path: Root directory of the additional dataset.

    Returns:
        pandas.DataFrame with columns ``filename``, ``text`` and
        ``true_label`` (name of the class directory the file was found in).

    Raises:
        OSError: propagated from ``os.listdir``/``open`` if a class directory
            is missing or a file cannot be read.
    """
    classes = ['business', 'entertainment', 'politics', 'sport', 'tech']
    texts, labels, filenames = [], [], []

    for cls in classes:
        class_dir = os.path.join(data_path, cls)
        # os.listdir returns entries in arbitrary order; sort them so the
        # resulting table is stable from run to run.
        for file in sorted(os.listdir(class_dir)):
            file_path = os.path.join(class_dir, file)
            # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which
            # would make open() fail.
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='latin-1') as f:
                texts.append(f.read())
            labels.append(cls)
            filenames.append(file)

    return pd.DataFrame({'filename': filenames, 'text': texts, 'true_label': labels})


def predict(texts, batch_size=32):
    """Classify ``texts`` with the module-level ``model`` and ``tokenizer``.

    Args:
        texts: A single string or an iterable of strings (list, tuple,
            pandas Series, ...). A lone string is treated as one document
            rather than being split into characters.
        batch_size: Number of texts per forward pass, which bounds memory use
            on large inputs.

    Returns:
        List of ``(label, confidence)`` tuples in input order, where
        ``confidence`` is the softmax probability of the predicted class.
        Empty input yields an empty list.
    """
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        return []

    id2label = {0: 'business', 1: 'entertainment', 2: 'politics', 3: 'sport', 4: 'tech'}

    # Make sure dropout is disabled regardless of what ran before.
    model.eval()

    results = []
    for start in range(0, len(texts), batch_size):
        inputs = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        preds = probs.argmax(dim=1)
        results.extend(
            (id2label[p.item()], probs[i][p].item()) for i, p in enumerate(preds)
        )
    return results


if __name__ == "__main__":
    print("Обрабатываем bbc_additional...")
    df_additional = load_additional_data()

    # predict() yields (label, confidence) pairs; zip(*) transposes them into
    # one sequence per new column.
    df_additional['predicted_label'], df_additional['confidence'] = zip(
        *predict(df_additional['text'])
    )

    # Per-file results first, then the overall accuracy.
    print(df_additional[['filename', 'true_label', 'predicted_label', 'confidence']])
    accuracy = accuracy_score(df_additional['true_label'], df_additional['predicted_label'])
    print(f"\nточность дополнительного датасета: {accuracy:.3f}")

    # NOTE(review): the classic-model cell writes to this same file name, so
    # this call overwrites those predictions.
    df_additional.to_csv('bbc_additional_predictions.csv', index=False)
    print("Предсказания сохранены в 'bbc_additional_predictions.csv'.")


Обрабатываем bbc_additional...
   filename     true_label predicted_label  confidence
0     1.txt       business        business    0.999998
1     2.txt       business        business    0.999998
2     3.txt       business        business    0.999998
3     4.txt       business        business    0.999999
4     5.txt       business        politics    0.999985
5     1.txt  entertainment   entertainment    0.999998
6     2.txt  entertainment   entertainment    0.999998
7     3.txt  entertainment   entertainment    0.999998
8     4.txt  entertainment   entertainment    0.999998
9     5.txt  entertainment   entertainment    0.999998
10    1.txt       politics        politics    0.999999
11    2.txt       politics        politics    0.999999
12    3.txt       politics        politics    0.999998
13    4.txt       politics        politics    0.999999
14    5.txt       politics        politics    0.999999
15    1.txt          sport           sport    0.999999
16    2.txt          sport        