In [None]:

# Install dependencies
!pip install -q torch transformers scikit-learn pandas matplotlib seaborn lxml tqdm
!pip install --upgrade transformers

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xml.etree.ElementTree as ET
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

def parse_articles(xml_path):
    """Parse an articles XML file into a DataFrame with one row per article.

    Every ``<article>`` element directly under the root contributes its
    ``id``, ``title`` and ``published-at`` attributes plus all text found
    inside the element.

    The text is gathered with ``article.itertext()`` so that *tail* text
    (text that follows an inline child, e.g. the words after a link inside
    a paragraph) is kept; reading only ``elem.text`` of each descendant
    would silently drop it.

    Args:
        xml_path: Filename or file object accepted by ``ET.parse``.

    Returns:
        DataFrame with columns ``id``, ``title``, ``published_at`` and
        ``text``. The columns are present even when no article is found,
        so a later merge on ``id`` does not fail on an empty file.
    """
    columns = ['id', 'title', 'published_at', 'text']
    root = ET.parse(xml_path).getroot()
    rows = []
    for article in root.findall('article'):
        # Strip each text fragment and skip whitespace-only ones so the
        # joined text contains single spaces between fragments.
        fragments = (fragment.strip() for fragment in article.itertext())
        text = ' '.join(fragment for fragment in fragments if fragment)
        rows.append({
            'id': article.attrib.get('id'),
            'title': article.attrib.get('title', ''),
            'published_at': article.attrib.get('published-at', ''),
            'text': text,
        })
    return pd.DataFrame(rows, columns=columns)

def parse_labels(xml_path):
    """Parse a ground-truth XML file into a DataFrame of binary labels.

    Each ``<article>`` element directly under the root yields one row. The
    label is 1 when the ``hyperpartisan`` attribute equals the string
    ``"true"`` and 0 otherwise (including when the attribute is missing).

    Args:
        xml_path: Filename or file object accepted by ``ET.parse``.

    Returns:
        DataFrame with columns ``id`` and ``label``. The columns are present
        even when no article is found, so a later merge on ``id`` does not
        fail on an empty file.

    Raises:
        KeyError: If an ``<article>`` element has no ``id`` attribute.
    """
    root = ET.parse(xml_path).getroot()
    rows = [
        {
            'id': article.attrib['id'],
            'label': 1 if article.attrib.get('hyperpartisan') == "true" else 0,
        }
        for article in root.findall('article')
    ]
    return pd.DataFrame(rows, columns=['id', 'label'])

# Load article bodies and their ground-truth labels from the two XML files.
articles = parse_articles('articles-training-byarticle-20181122.xml')
labels = parse_labels('ground-truth-training-byarticle-20181122.xml')

# Join on the shared article id (pandas' default inner join keeps only ids
# present in both frames).
df = articles.merge(labels, on='id')

def clean_text(text):
    """Return *text* with surrounding whitespace removed, in lowercase."""
    trimmed = text.strip()
    return trimmed.lower()

# Normalized copy of every article body; the raw text column is kept as-is.
df['text_clean'] = df['text'].map(clean_text)

# Hold out 20% of the articles, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df['text_clean'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Token length every encoded sequence is padded/truncated to.
max_len = 256
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_texts(texts):
    """Tokenize *texts* with the module-level ``tokenizer``.

    Every sequence is padded or truncated to ``max_len`` tokens and the
    result is returned as PyTorch tensors.

    Args:
        texts: Iterable of strings; it is materialized into a list first.

    Returns:
        The tokenizer's output for the whole batch.
    """
    tokenizer_options = {
        'max_length': max_len,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'pt',
    }
    batch = list(texts)
    return tokenizer(batch, **tokenizer_options)

# Tokenize both splits once, up front.
train_enc, test_enc = encode_texts(X_train), encode_texts(X_test)

# Labels as int64 tensors, in the same order as the encoded texts.
y_train_tensor = torch.tensor(y_train.tolist(), dtype=torch.long)
y_test_tensor = torch.tensor(y_test.tolist(), dtype=torch.long)

from torch.utils.data import Dataset

class NewsDataset(Dataset):
    """Dataset pairing tokenizer encodings with per-example labels.

    ``encodings`` is a mapping from field name to an indexable batch of
    values; ``labels`` is an indexable sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick row `idx` out of every encoded field, then attach the label
        # under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the encoded splits and their labels for use by the Trainer.
train_dataset = NewsDataset(encodings=train_enc, labels=list(y_train))
test_dataset = NewsDataset(encodings=test_enc, labels=list(y_test))

# Pretrained BERT with a two-class classification head.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

def compute_metrics(pred):
    """Compute accuracy and ROC AUC from a two-class prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and
            ``predictions`` (an array indexed as ``[example, class]`` with
            two class columns).

    Returns:
        Dict with keys ``'accuracy'`` and ``'roc_auc'``.
    """
    labels = pred.label_ids
    logits = pred.predictions
    preds = np.argmax(logits, axis=-1)
    # Rank examples by the logit margin (class 1 minus class 0). The softmax
    # probability of class 1 is a monotonic function of this margin, so the
    # AUC matches the one computed from probabilities and is consistent with
    # the argmax decision; the raw class-1 logit alone is not.
    scores = logits[:, 1] - logits[:, 0]
    return {
        'accuracy': (preds == labels).mean(),
        'roc_auc': roc_auc_score(labels, scores),
    }

# Fine-tune on the training split and evaluate on the held-out split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
# Print the metrics explicitly: a bare expression in the middle of a cell
# (or script) is evaluated but its result is never displayed.
print(trainer.evaluate())

outputs = trainer.predict(test_dataset)
y_true = outputs.label_ids
# Positive-class probability via softmax over the two logits. The raw
# class-1 logit alone ignores the class-0 logit and is not a valid ranking
# score for ROC / precision-recall analysis.
y_scores = torch.softmax(torch.as_tensor(outputs.predictions), dim=-1)[:, 1].numpy()
y_pred = np.argmax(outputs.predictions, axis=-1)

print(classification_report(y_true, y_pred, digits=4))
