In [None]:
import json
import os
import re
import unicodedata
from collections import Counter
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
import torch

In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# NOTE: %%capture silences this cell's output, so pip errors are not shown
# either; remove it temporarily when an install needs debugging.
!pip install unsloth
# Get latest Unsloth
# --no-deps makes pip skip dependency installation for this second command,
# so only the unsloth package itself is upgraded from the GitHub repository.
!pip install --upgrade --no-deps "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
# !pip uninstall -y torch bitsandbytes triton
# !pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 \
#     --index-url https://download.pytorch.org/whl/cu118
# !pip install triton bitsandbytes


In [None]:
class KriKriTeacher:
  """LLM "teacher" that labels Greek messages as BULLY / NON_BULLY.

  The class wraps the 4-bit Llama-Krikri-8B-Instruct model loaded through
  Unsloth. For every message it samples several one-word answers
  ("Ναι" / "Οχι"), turns the vote counts into soft probabilities and stores
  them next to the hand-annotated label so that a student model can be
  trained with knowledge distillation.
  """

  # Column order of the frame returned by `process_dataset`. Declaring it up
  # front guarantees the columns exist even when no row produced a record.
  _RESULT_COLUMNS = [
      'text', 'text_original', 'label_hard', 'p_teacher',
      'confidence', 'llm_label', 'agreement',
  ]

  def __init__(self):
    """Load model and tokenizer, switch to inference mode, build the prompt."""
    # Imported lazily so the class can be defined (and its pure helpers used)
    # on machines where Unsloth is not installed.
    from unsloth import FastLanguageModel
    from unsloth.chat_templates import get_chat_template

    print("Loading Llama-Krikri-8B-Instruct model...")
    self.model, self.tokenizer = FastLanguageModel.from_pretrained(
        model_name="ilsp/Llama-Krikri-8B-Instruct",
        max_seq_length=8192,
        load_in_4bit=True,
    )

    self.tokenizer = get_chat_template(
        self.tokenizer,
        chat_template="llama-3.1",
    )
    FastLanguageModel.for_inference(self.model)

    # User-turn prompt; `{text}` is filled in by `classify_single_message`.
    self.prompt_template = """
    Αναλύεις μηνύματα και τα κατηγοριοποιείς ως BULLY ή NON_BULLY.

    Ένα μήνυμα θεωρείται cyberbullying (BULLY) όταν περιλαμβάνει:
    1) Άμεσες απειλές, επιθέσεις, προσβολές ή παρενόχληση
    2) Απόπειρες αποκλεισμού ή κοινωνικής απομόνωσης
    3) Πρόθεση να βλάψει ή να ταπεινώσει
    4) Διάδοση φημών ή ψευδών πληροφοριών
    5) Bodyshaming ή επιθέσεις με βάση την εμφάνιση
    6) Παρενόχληση με βάση την ταυτότητα (φυλή, θρησκεία, σεξουαλικότητα κ.λπ.)
    7) Επίμονη ανεπιθύμητη επαφή ή συμπεριφορά καταδίωξης
    8) Ενθάρρυνση αυτοτραυματισμού ή αυτοκτονίας
    9) Κοινοποίηση ντροπιαστικού περιεχομένου χωρίς συγκατάθεση
    10) Χρήση ύβρεων ή υποτιμητικής γλώσσας

    Εξέτασε το πλαίσιο, την πρόθεση και τη σοβαρότητα. Κάποια παιχνιδιάρικα πειράγματα μεταξύ φίλων μπορεί να μην αποτελούν διαδικτυακό εκφοβισμό.
    Θα απαντάς Ναι αν το μήνυμα θεωρείται cyberbullying ή Οχι αν το μήνυμα ΔΕΝ θεωρείται cyberbullying.

    Απάντησε ΜΟΝΟ με μία λέξη: Ναι ή Οχι

    Μήνυμα: "{text}"
    Απάντηση:"""

  def preprocess_text(self, text: str) -> str:
    """Normalise a raw message before it is sent to the model.

    Collapses whitespace, drops `@USER` placeholders, replaces any other
    `@handle` with `[MENTION]` and caps runs of `!`, `?` and `.` at three
    characters.

    Returns:
      The cleaned text, or "" when the input is missing (None / NaN) or the
      cleaned text is shorter than three characters.
    """
    # `pd.isna` returns an array for list-like input, so only ask it about scalars.
    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
      return ""

    text = re.sub(r'\s+', ' ', str(text)).strip()

    # Remove @USER mentions
    text = re.sub(r'@USER\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'@\w+', '[MENTION]', text)

    text = re.sub(r'[!]{3,}', '!!!', text)
    text = re.sub(r'[?]{3,}', '???', text)
    text = re.sub(r'[.]{3,}', '...', text)

    # Removing mentions can leave double spaces behind; collapse them again.
    text = re.sub(r'\s+', ' ', text).strip()

    return text if len(text) >= 3 else ""

  def classify_single_message(self, text: str, temperature: float = 0.0) -> str:
    """Ask the model once whether `text` is cyberbullying.

    Args:
      text: Raw or already preprocessed message.
      temperature: Sampling temperature; values <= 0 select greedy decoding.

    Returns:
      The raw, whitespace-stripped model answer (at most 4 new tokens). Use
      `normalize_greek_response` to map it to BULLY / NON_BULLY.
    """
    text = self.preprocess_text(text)

    # Format the prompt
    formatted_prompt = self.prompt_template.format(text=text)

    # Create chat format
    messages = [
        {"role": "system", "content": "Είσαι ειδικός στον εντοπισμό του cyberbullying στις ψηφιακές συνομιλίες. "},
        {"role": "user", "content": formatted_prompt},
    ]
    # add_generation_prompt opens the assistant turn, so the few new tokens
    # we allow are spent on the answer rather than on the turn header.
    input_text = self.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True)

    # The rendered template already carries the special tokens; adding them
    # again would duplicate the BOS token.
    inputs = self.tokenizer(input_text, return_tensors="pt", add_special_tokens=False)
    device = self.model.device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    sampling = temperature > 0
    generation_kwargs = {
        "max_new_tokens": 4,  # enough for "Ναι" / "Οχι"
        "do_sample": sampling,
        "pad_token_id": self.tokenizer.eos_token_id,
        # NOTE(review): the repetition penalty also covers prompt tokens, and
        # the prompt contains both answer words - confirm this is intended.
        "repetition_penalty": 1.1,
    }
    if sampling:
      # Temperature is only meaningful (and only valid) when sampling.
      generation_kwargs["temperature"] = temperature

    with torch.no_grad():
      outputs = self.model.generate(**inputs, **generation_kwargs)

    # Decode only the newly generated tokens, not the echoed prompt.
    prompt_length = inputs['input_ids'].shape[1]
    response = self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response.strip()

  def normalize_greek_response(self, response: str) -> str:
    """Map a raw model answer to 'BULLY' or 'NON_BULLY'.

    The answer is lower-cased, stripped of accents and split into words; the
    first word that is a known yes/no spelling (Greek or Greeklish) decides.
    Matching whole words avoids false hits such as 'ναι' inside 'είναι'. If
    no word matches, the first word is compared by its two-letter prefix to
    catch truncated answers. Anything else is reported with a warning and
    treated as 'NON_BULLY'.
    """
    yes_words = {'ναι', 'nai'}
    no_words = {'οχι', 'oxι', 'oxi', 'ochi'}

    decomposed = unicodedata.normalize('NFD', response.strip().lower())
    folded = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    words = re.findall(r'\w+', folded)

    for word in words:
      if word in yes_words:
        return 'BULLY'
      if word in no_words:
        return 'NON_BULLY'

    first_word = words[0] if words else ""
    if any(first_word.startswith(var[:2]) for var in yes_words):
      return 'BULLY'
    if any(first_word.startswith(var[:2]) for var in no_words):
      return 'NON_BULLY'

    print(f"Warning: Unexpected response format: '{response}'")
    return 'NON_BULLY'  # Conservative default

  def calculate_soft_labels(self, text: str, n_samples: int = 7, temperature: float = 0.2) -> tuple[list[float], float]:
    """Estimate soft labels by self-consistency voting.

    The message is classified `n_samples` times with sampling enabled and the
    normalised answers are counted.

    Args:
      text: Message to classify.
      n_samples: Number of sampled answers; must be positive.
      temperature: Sampling temperature passed to `classify_single_message`.

    Returns:
      `([p_bully, p_non_bully], confidence)` where the probabilities are vote
      fractions and `confidence` is the vote share of the majority label
      (ties count as NON_BULLY). With two classes the confidence is therefore
      never below 0.5.

    Raises:
      ValueError: if `n_samples` is not positive.
    """
    if n_samples <= 0:
      raise ValueError("n_samples must be a positive integer")

    predictions = [
        self.normalize_greek_response(
            self.classify_single_message(text, temperature=temperature))
        for _ in range(n_samples)
    ]
    pred_counts = Counter(predictions)

    # `normalize_greek_response` only ever returns these two labels, so the
    # two fractions always add up to one and need no renormalisation.
    total = len(predictions)
    prob_bully = pred_counts.get('BULLY', 0) / total
    prob_non_bully = pred_counts.get('NON_BULLY', 0) / total

    majority = 'BULLY' if prob_bully > prob_non_bully else 'NON_BULLY'
    confidence = pred_counts.get(majority, 0) / total

    return [prob_bully, prob_non_bully], confidence

  def process_dataset(self, df: pd.DataFrame,text_column: str = 'text',label_column: str = 'label',confidence_threshold: float = 0.3) -> pd.DataFrame:
    """Label every message of `df` with the teacher model.

    Args:
      df: Input frame; must contain `text_column`.
      text_column: Name of the column holding the messages.
      label_column: Name of the hand-label column. When the column is absent
        (or a row's label is missing) `label_hard` and `agreement` are None
        for the affected rows.
      confidence_threshold: Rows whose teacher confidence is below this value
        are dropped from the result.

    Returns:
      A frame with the columns listed in `_RESULT_COLUMNS`. Rows whose text
      is empty after preprocessing are skipped.
    """
    records = []
    total_rows = len(df)
    has_hand_labels = label_column in df.columns

    print(f"Processing {total_rows} messages...")
    if has_hand_labels:
      print(f"Mode: Using hand labels as ground truth")
    else:
      print(f"Mode: No '{label_column}' column found, keeping LLM labels only")

    # `position` counts rows independently of the frame's index, which may be
    # non-numeric or non-contiguous.
    for position, (idx, row) in enumerate(df.iterrows(), start=1):
      text = row[text_column]
      processed_text = self.preprocess_text(text)

      if not processed_text:
        print(f"Skipping empty text at row {idx}")
      else:
        # Get soft labels and confidence from LLM
        soft_probs, confidence = self.calculate_soft_labels(processed_text)
        llm_hard_label = 1 if soft_probs[0] > soft_probs[1] else 0

        hand_label = row[label_column] if has_hand_labels else None
        if hand_label is not None and pd.isna(hand_label):
          hand_label = None
        label_hard = int(hand_label) if hand_label is not None else None

        # Hand-annotated labels are the ground truth; the LLM soft
        # probabilities are kept for knowledge distillation.
        records.append({
            'text': processed_text,
            'text_original': text,
            'label_hard': label_hard,       # Ground truth from annotation
            'p_teacher': soft_probs,        # Soft probs from LLM teacher
            'confidence': confidence,       # LLM confidence
            'llm_label': llm_hard_label,    # LLM prediction
            'agreement': (label_hard == llm_hard_label) if label_hard is not None else None,
        })

      if position % 10 == 0:
        print(f"Processed {position}/{total_rows} messages")

    result_df = pd.DataFrame(records, columns=self._RESULT_COLUMNS)

    # Filter by confidence threshold
    print(f"Before filtering: {len(result_df)} samples")
    filtered_df = result_df[result_df['confidence'] >= confidence_threshold].copy()
    print(f"After confidence filtering (>= {confidence_threshold}): {len(filtered_df)} samples")

    return filtered_df

  def save_distillation_dataset(self, df: pd.DataFrame, output_path: str):
    """Save the processed dataset for knowledge distillation.

    Writes a JSON file and a sibling CSV file. On Google Colab, Drive is
    mounted if necessary and a relative `output_path` is placed inside
    `/content/drive/MyDrive/cyshield`; an absolute `output_path` is used as
    given. Outside Colab the path is used unchanged.

    Args:
      df: Frame produced by `process_dataset`.
      output_path: Target path of the JSON file.
    """
    def _optional(value, cast):
      # Rows without a hand label carry None/NaN; keep them as JSON null.
      return None if pd.isna(value) else cast(value)

    distillation_data = [
        {
            'text': row['text'],
            'text_original': row['text_original'],
            'label_hard': _optional(row['label_hard'], int),
            'p_teacher': row['p_teacher'],
            'confidence': float(row['confidence']),
            'llm_label': int(row['llm_label']),           # LLM prediction
            'agreement': _optional(row['agreement'], bool),
        }
        for _, row in df.iterrows()
    ]

    try:
      from google.colab import drive

      if not os.path.exists('/content/drive'):
        print("Mounting Google Drive...")
        drive.mount('/content/drive')
      else:
        print("Google Drive already mounted")

      drive_dir = '/content/drive/MyDrive/cyshield'
      os.makedirs(drive_dir, exist_ok=True)

      # Prefixing an absolute path with the Drive folder would yield a
      # non-existent nested path, so only relative paths are relocated.
      json_path = output_path if os.path.isabs(output_path) else os.path.join(drive_dir, output_path)

    except ImportError:
      print("Saving locally")
      json_path = output_path

    # Derive the CSV name so that it can never collide with the JSON file.
    if json_path.endswith('.json'):
      csv_path = json_path[:-len('.json')] + '.csv'
    else:
      csv_path = json_path + '.csv'

    target_dir = os.path.dirname(json_path)
    if target_dir:
      os.makedirs(target_dir, exist_ok=True)

    with open(json_path, 'w', encoding='utf-8') as f:
      json.dump(distillation_data, f, ensure_ascii=False, indent=2)

    print(f"Saved {len(distillation_data)} samples")

    df.to_csv(csv_path, index=False, encoding='utf-8')
    print(f"Also saved as CSV: {csv_path}")



In [None]:
def load_and_process_dataset(csv_path: str, text_col: str = 'text', label_col: str = 'label', confidence_threshold: float = 0.3):
  """Run the full teacher-labelling pipeline on a CSV file.

  Reads the CSV, labels every message with a freshly loaded `KriKriTeacher`,
  saves the result next to the input as `<name>_teacher_labels.json` (plus
  the CSV written by `save_distillation_dataset`) and prints summary
  statistics.

  Args:
    csv_path: Path of the input CSV file.
    text_col: Name of the column holding the messages.
    label_col: Name of the hand-label column (may be absent).
    confidence_threshold: Minimum teacher confidence for a row to be kept.

  Returns:
    The processed, confidence-filtered DataFrame.
  """
  print(f"Loading dataset from {csv_path}")
  df = pd.read_csv(csv_path, encoding='utf-8')

  print(f"Dataset shape: {df.shape}")
  has_hand_labels = label_col in df.columns
  if has_hand_labels:
    # `.to_dict()` yields plain ints instead of `np.int64(...)` wrappers.
    print(f"Hand-labeled distribution: {df[label_col].value_counts().to_dict()}")

  teacher = KriKriTeacher()

  processed_df = teacher.process_dataset(df, text_column=text_col, label_column=label_col,confidence_threshold=confidence_threshold)

  # Swap only the file extension. A plain `str.replace('.csv', ...)` would
  # rewrite every occurrence in the path and, for inputs without a `.csv`
  # suffix, return the input path itself, overwriting the source dataset.
  path_root, _ = os.path.splitext(csv_path)
  output_path = f"{path_root}_teacher_labels.json"
  teacher.save_distillation_dataset(processed_df, output_path)

  print("\n=== FINAL STATISTICS ===")
  print(f"Total processed samples: {len(processed_df)}")
  if processed_df.empty:
    # Means and value counts of an empty frame are meaningless.
    return processed_df

  print(f"Average LLM confidence: {processed_df['confidence'].mean():.3f}")
  print(f"Final label distribution: {processed_df['label_hard'].value_counts().to_dict()}")

  if has_hand_labels and 'agreement' in processed_df.columns:
    # Rows without a usable hand label have no agreement value; leave them
    # out so the boolean operations below are well defined.
    agreement = processed_df['agreement'].dropna().astype(bool)
    if len(agreement) > 0:
      print(f"Overall LLM-Human Agreement: {agreement.mean():.3f}")

      disagreements = processed_df.loc[agreement[~agreement].index]
      if len(disagreements) > 0:
        print(f"Disagreements: {len(disagreements)} samples")
        print("Sample disagreements:")
        for _, row in disagreements.head(3).iterrows():
          print(f"  Text: '{row['text'][:100]}...'")
          print(f"  Human: {row['label_hard']}, LLM: {row['llm_label']}, Confidence: {row['confidence']:.2f}")

  return processed_df

In [None]:
from google.colab import drive

# Make the Drive folder with the dataset available (no forced remount).
drive.mount('/content/drive', force_remount=False)

# Combined hand-labelled dataset stored on Google Drive.
csv_file_path = '/content/drive/MyDrive/cyshield/combined.csv'

# Run the complete pipeline: load the CSV, query the teacher, save the labels.
processed_data = load_and_process_dataset(
    csv_file_path,
    text_col='text',
    label_col='label',
    confidence_threshold=0.15,
)

print("Teacher labeling complete")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading dataset from /content/drive/MyDrive/cyshield/combined.csv
Dataset shape: (2569, 2)
Hand-labeled distribution: {0: np.int64(1788), 1: np.int64(781)}
Loading Llama-Krikri-8B-Instruct model...
==((====))==  Unsloth 2025.5.8: Fast Llama patching. Transformers: 4.52.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Processing 2569 messages...
Mode: Using hand labels as ground truth
Processed 10/2569 messages
Processed 20/2569 messages
Processed 30/2569 messages
Processed 40/2569 messages
Processed 50/2569 messages
Processed 60/2569 messages
Processed 70/2569 messages
Processed 80/2569 messages
Processed 90/2569 messages
Processed 100/2569 messages
Processed 110/2569 messages
Processed 120/2569 messages
Processed 130/2569 messages
Processed 140/2569 messages
Ο'
Processed 150/2569 messages
Processed 160/2569 messages
Processed 170/2569 messages
Processed 180/2569 messages
Processed 190/2569 messages
Processed 200/2569 messages
Processed 210/2569 messages
Processed 220/2569 messages
Processed 230/2569 messages
Processed 240/2569 messages
Processed 250/2569 messages
Processed 260/2569 messages
Processed 270/2569 messages
Processed 280/2569 messages
Processed 290/2569 messages
Processed 300/2569 messages
Processed 310/2569 messages
Ο'
Processed 320/2569 messages
Ο'
Processed 330/2569 messages
Processe