In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import TextClassificationPipeline
import torch

In [11]:
# 读取数据
df = pd.read_csv('/Users/henry/Downloads/output.csv')
df_marked = df.dropna(subset=['label'])
df_marked.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1061 entries, 0 to 19526
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    1061 non-null   object
 1   label   1061 non-null   object
dtypes: object(2)
memory usage: 24.9+ KB


In [12]:
df_marked['label'].value_counts()

label
sentence       353
meaningless    289
url            128
greeting       110
log             91
shell           54
image_tag       36
Name: count, dtype: int64

In [13]:
# 标签映射
label_mapping = {'sentence': 0, 'meaningless': 1, 'url': 2, 'greeting': 3, 'log': 4, 'shell': 5, 'image_tag': 6}

# 将标签转换为整数
df_marked['label'] = df_marked['label'].map(label_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_marked['label'] = df_marked['label'].map(label_mapping)


In [14]:

# 分割数据集
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_marked['Text'].tolist(), df_marked['label'].tolist(), test_size=0.2)

# 使用BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize数据
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# 转换成PyTorch数据格式
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, val_labels)

# 使用BERT进行分类
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df_marked['label'].unique()))

# 训练参数
training_args = TrainingArguments(
    output_dir='./models',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# 模型训练
trainer.train()

# 保存模型
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/318 [06:10<?, ?it/s]
                                                 
100%|██████████| 318/318 [07:10<00:00,  1.35s/it]


{'train_runtime': 430.4451, 'train_samples_per_second': 5.91, 'train_steps_per_second': 0.739, 'train_loss': 0.8624471028645834, 'epoch': 3.0}


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

In [17]:
from transformers import pipeline

# 加载训练好的模型
model = BertForSequenceClassification.from_pretrained('./saved_model')
tokenizer = BertTokenizer.from_pretrained('./saved_model')

# 创建分类pipeline
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, truncation=True, max_length=512)

def split_text_column_to_sentences(df, column_name):
    # 初始化一个列表来存储所有句子
    all_sentences = []
    # 遍历指定列的每一行
    for text in df[column_name]:
        # 按空行分隔文本
        formated_text = text.replace('\r\n', '\n').replace('\r', '\n')
        sentences = [sentence.strip().strip('\n') for sentence in formated_text.split('\n\n') if sentence.strip()]
        all_sentences.extend(sentences)  # 将句子添加到列表

    return all_sentences

def label_text(data, column_name):
    labeled_data = data.copy()
    for idx, row in labeled_data.iterrows():
        sentence = row[column_name]
        result = classifier(sentence)
        labeled_data.at[idx, 'label'] = result
    return labeled_data

df_unmarked = df[df['label'].isna() == True]
df_labeled = label_text(df_unmarked, 'Text')

df_unmarked.update(df_labeled)
df_unmarked.to_csv('/Users/henry/Downloads/result.csv')
