In [1]:
pip install datasets requests tqdm

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
pip install transformers tensorflow datasets nltk torch



In [7]:
!python train_script.py

2025-01-16 11:33:10.326902: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-16 11:33:10.344879: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-16 11:33:10.366008: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-16 11:33:10.372587: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-16 11:33:10.388031: I tensorflow/core/platform/cpu_feature_guar

In [15]:
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer, BertModel
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from torch.utils.data import Dataset, DataLoader

class FastChineseStyleClassifier(nn.Module):
    def __init__(self, pretrained_model="bert-base-chinese"):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)

        # Freeze most of BERT
        for param in self.bert.parameters():
            param.requires_grad = False

        # Only unfreeze the last 2 layers
        for param in self.bert.encoder.layer[-2:].parameters():
            param.requires_grad = True

        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 1),
            nn.Sigmoid()
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(outputs[1])

class EvalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.float)
        }

def load_test_data(data_dir=".", split="test", max_samples=2000):
    """Load test data from files with optional limit"""
    with open(os.path.join(data_dir, f"{split}.src"), 'r', encoding='utf-8') as f:
        classical_texts = [line.strip().replace(" ", "") for line in f.readlines()]
    with open(os.path.join(data_dir, f"{split}.tgt"), 'r', encoding='utf-8') as f:
        modern_texts = [line.strip().replace(" ", "") for line in f.readlines()]

    # Limit samples if specified
    if max_samples:
        classical_texts = classical_texts[:max_samples//2]
        modern_texts = modern_texts[:max_samples//2]

    return classical_texts, modern_texts

def evaluate_model_batch(model, dataloader, device):
    """Evaluate model using batch processing"""
    model.eval()
    all_preds = []
    all_labels = []
    all_confidences = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label']

            outputs = model(input_ids, attention_mask)
            if hasattr(outputs, 'logits'):
                probs = torch.softmax(outputs.logits, dim=1)
                preds = (probs[:, 1] >= 0.5).float()
                confs = probs[:, 1]
            else:
                preds = (outputs.squeeze() >= 0.5).float()
                confs = outputs.squeeze()

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.numpy())
            all_confidences.extend(confs.cpu().numpy())

    return np.array(all_preds), np.array(all_labels), np.array(all_confidences)

def plot_comparison_metrics(custom_report, roberta_report):
    """Plot side-by-side comparison of metrics"""
    metrics = ['Accuracy', 'Classical F1', 'Modern F1']
    custom_scores = [
        custom_report['accuracy'],
        custom_report['Classical']['f1-score'],
        custom_report['Modern']['f1-score']
    ]
    roberta_scores = [
        roberta_report['accuracy'],
        roberta_report['Classical']['f1-score'],
        roberta_report['Modern']['f1-score']
    ]

    x = np.arange(len(metrics))
    width = 0.35

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(x - width/2, custom_scores, width, label='Custom Model')
    ax.bar(x + width/2, roberta_scores, width, label='RoBERTa')

    ax.set_ylabel('Scores')
    ax.set_title('Model Performance Comparison')
    ax.set_xticks(x)
    ax.set_xticklabels(metrics)
    ax.legend()

    plt.tight_layout()
    plt.savefig('model_comparison.png')
    plt.close()

def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load test data (with sample limit for faster processing)
    classical_texts, modern_texts = load_test_data(max_samples=2000)
    texts = classical_texts + modern_texts
    labels = [1] * len(classical_texts) + [0] * len(modern_texts)
    print(f"Evaluating on {len(texts)} examples")

    # Initialize RoBERTa
    print("Loading RoBERTa model...")
    roberta_tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext-large")
    roberta_model = AutoModelForSequenceClassification.from_pretrained(
        "hfl/chinese-roberta-wwm-ext-large",
        num_labels=2
    ).to(device)

    # Initialize custom model
    print("Loading custom model...")
    custom_tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    custom_model = FastChineseStyleClassifier().to(device)
    custom_model.load_state_dict(torch.load('best_chinese_classifier.pt', weights_only=True))

    # Create dataloaders
    batch_size = 32
    roberta_dataset = EvalDataset(texts, labels, roberta_tokenizer)
    custom_dataset = EvalDataset(texts, labels, custom_tokenizer)

    roberta_loader = DataLoader(roberta_dataset, batch_size=batch_size)
    custom_loader = DataLoader(custom_dataset, batch_size=batch_size)

    # Evaluate both models
    print("\nEvaluating RoBERTa model...")
    roberta_preds, roberta_labels, roberta_conf = evaluate_model_batch(
        roberta_model, roberta_loader, device)

    print("\nEvaluating custom model...")
    custom_preds, custom_labels, custom_conf = evaluate_model_batch(
        custom_model, custom_loader, device)

    # Calculate metrics
    custom_report = classification_report(custom_labels, custom_preds,
                                       output_dict=True,
                                       target_names=['Modern', 'Classical'],
                                       zero_division=0)
    roberta_report = classification_report(roberta_labels, roberta_preds,
                                         output_dict=True,
                                         target_names=['Modern', 'Classical'],
                                         zero_division=0)

    # Create comparison table
    comparison_df = pd.DataFrame({
        'Metric': ['Accuracy', 'Classical F1', 'Modern F1', 'Macro Avg F1'],
        'Custom Model': [
            custom_report['accuracy'],
            custom_report['Classical']['f1-score'],
            custom_report['Modern']['f1-score'],
            custom_report['macro avg']['f1-score']
        ],
        'RoBERTa Model': [
            roberta_report['accuracy'],
            roberta_report['Classical']['f1-score'],
            roberta_report['Modern']['f1-score'],
            roberta_report['macro avg']['f1-score']
        ]
    })

    # Save results
    comparison_df.to_csv('classifier_comparison_results.csv', index=False)
    print("\nClassifier Comparison Results:")
    print(comparison_df.to_string(index=False))

    # Generate visualization
    plot_comparison_metrics(custom_report, roberta_report)

    # Also save confusion matrices
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    sns.heatmap(confusion_matrix(custom_labels, custom_preds),
                annot=True, fmt='d', cmap='Blues')
    plt.title('Custom Model Confusion Matrix')

    plt.subplot(1, 2, 2)
    sns.heatmap(confusion_matrix(roberta_labels, roberta_preds),
                annot=True, fmt='d', cmap='Blues')
    plt.title('RoBERTa Model Confusion Matrix')

    plt.tight_layout()
    plt.savefig('confusion_matrices.png')
    plt.close()

if __name__ == "__main__":
    main()

Using device: cuda
Evaluating on 2000 examples
Loading RoBERTa model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-roberta-wwm-ext-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading custom model...

Evaluating RoBERTa model...


Evaluating: 100%|██████████| 63/63 [00:09<00:00,  6.31it/s]



Evaluating custom model...


Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.29it/s]



Classifier Comparison Results:
      Metric  Custom Model  RoBERTa Model
    Accuracy      0.986000       0.500000
Classical F1      0.986097       0.000000
   Modern F1      0.985901       0.666667
Macro Avg F1      0.985999       0.333333


In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Pipeline
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import numpy as np
from tqdm import tqdm
import math
import os

def load_test_data(data_dir=".", split="test", max_samples=1000):
    """Load test data from files"""
    with open(os.path.join(data_dir, f"{split}.src"), 'r', encoding='utf-8') as f:
        classical_texts = [line.strip() for line in f.readlines()]

    if max_samples:
        classical_texts = classical_texts[:max_samples]

    return classical_texts

def generate_reference_translations(texts, device):
    """Generate reference translations using a different model"""
    # Using Facebook's M2M100 model as reference
    print("\nGenerating reference translations...")
    tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M")
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/m2m100_418M").to(device)

    tokenizer.src_lang = "zh"
    tokenizer.tgt_lang = "en"

    references = []
    for text in tqdm(texts):
        inputs = tokenizer(text, return_tensors="pt").to(device)
        outputs = model.generate(**inputs, forced_bos_token_id=tokenizer.get_lang_id("en"))
        translation = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        references.append(translation)

    return references

def calculate_bleu(references, hypotheses):
    """Calculate BLEU score"""
    smoother = SmoothingFunction()
    refs = [[ref.split()] for ref in references]
    hyps = [hyp.split() for hyp in hypotheses]
    return corpus_bleu(refs, hyps, smoothing_function=smoother.method1) * 100

def calculate_perplexity(model, tokenizer, texts, device):
    """Calculate perplexity score"""
    model.eval()
    total_loss = 0
    total_length = 0

    with torch.no_grad():
        for text in tqdm(texts, desc="Calculating perplexity"):
            inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
            outputs = model(**inputs, labels=inputs["input_ids"])
            total_loss += outputs.loss.item() * inputs["input_ids"].size(1)
            total_length += inputs["input_ids"].size(1)

    return math.exp(total_loss / total_length)

def evaluate_translations(texts, device):
    """Evaluate translations using multiple models and metrics"""
    # Initialize Helsinki model
    helsinki_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
    helsinki_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en").to(device)

    # Generate reference translations
    reference_translations = generate_reference_translations(texts, device)

    # Generate Helsinki translations
    print("\nGenerating Helsinki translations...")
    helsinki_translations = []
    for text in tqdm(texts):
        inputs = helsinki_tokenizer(text, return_tensors="pt").to(device)
        outputs = helsinki_model.generate(**inputs)
        trans = helsinki_tokenizer.decode(outputs[0], skip_special_tokens=True)
        helsinki_translations.append(trans)

    # Generate pipeline translations
    print("\nGenerating pipeline translations...")
    pipeline_translations = []
    for text in tqdm(texts):
        # First to modern Chinese, then to English
        inputs = helsinki_tokenizer(text, return_tensors="pt").to(device)
        outputs = helsinki_model.generate(**inputs, num_beams=5, max_length=128)
        trans = helsinki_tokenizer.decode(outputs[0], skip_special_tokens=True)
        pipeline_translations.append(trans)

    # Calculate BLEU scores
    helsinki_bleu = calculate_bleu(reference_translations, helsinki_translations)
    pipeline_bleu = calculate_bleu(reference_translations, pipeline_translations)

    # Calculate perplexity
    helsinki_ppl = calculate_perplexity(helsinki_model, helsinki_tokenizer, helsinki_translations, device)
    pipeline_ppl = calculate_perplexity(helsinki_model, helsinki_tokenizer, pipeline_translations, device)

    # Print results
    print("\nTranslation Quality Metrics:")
    print(f"{'Metric':<20} {'Helsinki':<12} {'Our Pipeline':<12}")
    print("-" * 44)
    print(f"{'BLEU Score':<20} {helsinki_bleu:>11.2f} {pipeline_bleu:>11.2f}")
    print(f"{'Perplexity':<20} {helsinki_ppl:>11.2f} {pipeline_ppl:>11.2f}")

    # Show example translations
    print("\nExample Translations:")
    for i in range(min(5, len(texts))):
        print(f"\nSource: {texts[i]}")
        print(f"Reference: {reference_translations[i]}")
        print(f"Helsinki: {helsinki_translations[i]}")
        print(f"Our Pipeline: {pipeline_translations[i]}")

def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load test data
    classical_texts = load_test_data()
    print(f"Loaded {len(classical_texts)} test examples")

    # Run evaluation
    evaluate_translations(classical_texts, device)

if __name__ == "__main__":
    main()

Using device: cuda
Loaded 1000 test examples

Generating reference translations...


tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

100%|██████████| 1000/1000 [04:00<00:00,  4.16it/s]



Generating Helsinki translations...


100%|██████████| 1000/1000 [03:05<00:00,  5.40it/s]



Generating pipeline translations...


100%|██████████| 1000/1000 [02:43<00:00,  6.10it/s]
Calculating perplexity: 100%|██████████| 1000/1000 [00:12<00:00, 77.82it/s]
Calculating perplexity: 100%|██████████| 1000/1000 [00:12<00:00, 78.04it/s]



Translation Quality Metrics:
Metric               Helsinki     Our Pipeline
--------------------------------------------
BLEU Score                  3.77        4.28
Perplexity                 11.69       11.86

Example Translations:

Source: 范 仲 淹 二 岁 而 孤 ，
Reference: Two years old and lonely.
Helsinki: Van inundated two years old and alone.
Our Pipeline: Van inundated two years old and alone.

Source: 范 仲 淹 二 岁 而 孤 ， 母 贫 无 靠 ，
Reference: He was two years old and alone, and his mother was poor and unreliable.
Helsinki: Van drowned two years old, and was alone, and the mother was destitute.
Our Pipeline: Van drowned two years old, and was alone, and the mother was destitute.

Source: 范 仲 淹 二 岁 而 孤 ， 母 贫 无 靠 ， 再 适 常 山 朱 氏 。
Reference: He is two years old and alone, and his mother is poor, and he is always suitable for Mountains.
Helsinki: Van drowned two years old, and was alone, and the mother was poor and helpless.
Our Pipeline: Van drowned two years old, and was alone, and the mothe