In [4]:
from datasets import Dataset, DatasetDict

# Read each split's parquet file into its own Dataset (train, validation, test).
train_ds, val_ds, test_ds = (
    Dataset.from_parquet(parquet_path)
    for parquet_path in (
        "data/datasets/rotten_tomatoes/train.parquet",
        "data/datasets/rotten_tomatoes/validation.parquet",
        "data/datasets/rotten_tomatoes/test.parquet",
    )
)

# Bundle the three splits into a single DatasetDict keyed by split name.
data = DatasetDict({"train": train_ds, "validation": val_ds, "test": test_ds})

In [5]:
# Display the DatasetDict summary (split names, features and row counts).
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [6]:
# Index the train split with the tuple (0, -1) — presumably the first and last
# rows; the result is a dict of columns, each holding a two-element list.
data["train"][0,-1]

{'text': ['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
  'things really get weird , though not particularly scary : the movie is all portent and no content .'],
 'label': [1, 0]}

In [None]:
# Load the sentiment-analysis model from a local checkpoint directory.
from transformers import pipeline
model_path = "/data/models/cardiffnlp-twitter-roberta-base-sentiment-latest"
# return_all_scores=True makes every prediction a list of per-label score
# dicts; the inference loop later picks entries out of that list by position.
# NOTE(review): newer transformers releases appear to deprecate
# return_all_scores in favour of top_k — confirm the output ordering before
# switching, because the downstream code relies on positional indexing.
pipe = pipeline(
    "sentiment-analysis",
    model=model_path,
    tokenizer=model_path,
    device="cpu",
    return_all_scores=True,
)

In [9]:
# Apply the model to the test split.
import numpy as np
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset
# Run inference. For each review only the scores at positions 0 and 2 are
# compared (treated as negative and positive; position 1 is ignored), and the
# index of the larger one is stored: 0 = negative, 1 = positive.
y_pred = [
    np.argmax([scores[0]["score"], scores[2]["score"]])
    for scores in tqdm(
        pipe(KeyDataset(data["test"], "text")),
        total=len(data["test"]),
    )
]

Disabling tokenizer parallelism, we're using DataLoader multithreading already
100%|███████████████████████████████████████████████████████████████████████████████████████████| 1066/1066 [01:55<00:00,  9.24it/s]


In [10]:
# 评估函数
from sklearn.metrics import classification_report
def evaluate_performance(y_true, y_pred):
    """Print a classification report comparing predictions with true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        None. The report is printed, with the two classes displayed as
        "Negative Review" and "Positive Review" (in that order).
    """
    class_names = ["Negative Review", "Positive Review"]
    report = classification_report(y_true, y_pred, target_names=class_names)
    print(report)

In [11]:
# Print the classification report for the current y_pred against the test-split labels.
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.76      0.88      0.81       533
Positive Review       0.86      0.72      0.78       533

       accuracy                           0.80      1066
      macro avg       0.81      0.80      0.80      1066
   weighted avg       0.81      0.80      0.80      1066



In [13]:
# 使用预训练的嵌入模型来创建嵌入向量
# pip install sentence-transformers
from sentence_transformers import SentenceTransformer

In [14]:
model = SentenceTransformer("/data/models/sentence-transformers-all-mpnet-base-v2")
# Encode the review texts of the train split, then the test split, into
# embedding arrays (a progress bar is shown for each pass).
train_embeddings, test_embeddings = (
    model.encode(data[split_name]["text"], show_progress_bar=True)
    for split_name in ("train", "test")
)

loading configuration file /data/models/sentence-transformers-all-mpnet-base-v2\config.json
Model config MPNetConfig {
  "_name_or_path": "/data/models/sentence-transformers-all-mpnet-base-v2",
  "architectures": [
    "MPNetForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "mpnet",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "relative_attention_num_buckets": 32,
  "transformers_version": "4.37.2",
  "vocab_size": 30527
}

loading weights file /data/models/sentence-transformers-all-mpnet-base-v2\model.safetensors
All model checkpoint weights were used when initializing MPNetModel.

All the weights of MPNetModel were initialized from the model checkpoint at /data/models/sentence-transformers-all-m

Batches:   0%|          | 0/267 [00:00<?, ?it/s]

Batches:   0%|          | 0/34 [00:00<?, ?it/s]

In [15]:
# Inspect the shape of the train embedding array.
train_embeddings.shape

(8530, 768)

In [16]:
import os
import pickle

# open(..., 'wb') does not create missing parent folders, so make sure the
# cache directory exists before writing (no-op when it is already there).
os.makedirs('r_data/rotten_tomatoes', exist_ok=True)

# Cache the train embeddings so they do not have to be recomputed.
with open('r_data/rotten_tomatoes/train_embeddings.pkl', 'wb') as f:
    pickle.dump(train_embeddings, f)

# Cache the test embeddings alongside them.
with open('r_data/rotten_tomatoes/test_embeddings.pkl', 'wb') as f:
    pickle.dump(test_embeddings, f)

In [None]:
import pickle

# Reload the cached embeddings. The paths must match the ones the save cell
# writes to (under r_data/rotten_tomatoes/), otherwise the files are not found.
# NOTE: pickle.load can execute arbitrary code — only load files that were
# produced by this notebook or another trusted source.
with open('r_data/rotten_tomatoes/train_embeddings.pkl', 'rb') as f:
    train_embeddings_2 = pickle.load(f)

with open('r_data/rotten_tomatoes/test_embeddings.pkl', 'rb') as f:
    test_embeddings_2 = pickle.load(f)

In [17]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression classifier on the train embeddings and their labels.
clf = LogisticRegression(random_state=42)
clf.fit(train_embeddings, data["train"]["label"])

In [18]:
# Predict labels for the held-out test embeddings and evaluate the classifier.
y_pred = clf.predict(test_embeddings)
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.85      0.86      0.85       533
Positive Review       0.86      0.85      0.85       533

       accuracy                           0.85      1066
      macro avg       0.85      0.85      0.85      1066
   weighted avg       0.85      0.85      0.85      1066



In [19]:
# No labelled data is used here: embed a short text description of each label instead.
# Order matters — index 0 is the negative description, index 1 the positive one.
label_embeddings = model.encode(["A negative review","A positive review"])

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
# Compare every test document embedding with each label embedding.
sim_matrix = cosine_similarity(test_embeddings, label_embeddings)
# For each document, take the index of the most similar label as the prediction.
y_pred = np.argmax(sim_matrix, axis=1)

In [21]:
# Print the classification report for the current y_pred against the test-split labels.
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.78      0.77      0.78       533
Positive Review       0.77      0.79      0.78       533

       accuracy                           0.78      1066
      macro avg       0.78      0.78      0.78      1066
   weighted avg       0.78      0.78      0.78      1066



In [22]:
# Build a text2text-generation pipeline from a local FLAN-T5 checkpoint.
# NOTE: this rebinds `pipe`, replacing the pipeline object assigned earlier
# in the notebook.
pipe = pipeline(
    "text2text-generation",
    model="/data/models/google-flan-t5-small",
    device="cpu"
)

loading configuration file /data/models/google-flan-t5-small\config.json
Model config T5Config {
  "_name_or_path": "/data/models/google-flan-t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_layers": 8,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "

In [23]:
# Prepare the data: add a "t5" column holding the prompt followed by the review text.
prompt = "Is the following sentence positive or negative? "
# map() returns a new DatasetDict; `data` is rebound to the version with the extra column.
data = data.map(lambda example: {"t5": prompt + example["text"]})
data

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
})

In [24]:
# Run inference with the generative model and map each generated answer to a label.
y_pred = []
for output in tqdm(pipe(KeyDataset(data["test"], "t5")), total=len(data["test"])):
    # Normalise surrounding whitespace and letter case before comparing, so
    # answers such as "Negative" or " negative" are not silently counted as
    # positive by the exact-match check below.
    text = output[0]["generated_text"].strip().lower()
    # 0 = negative; any other answer is treated as positive (1).
    y_pred.append(0 if text == "negative" else 1)

  
100%|███████████████████████████████████████████████████████████████████████████████████████████| 1066/1066 [02:09<00:00,  8.25it/s]


In [25]:
# Print the classification report for the current y_pred against the test-split labels.
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.83      0.85      0.84       533
Positive Review       0.85      0.83      0.84       533

       accuracy                           0.84      1066
      macro avg       0.84      0.84      0.84      1066
   weighted avg       0.84      0.84      0.84      1066

