In [4]:
import pandas as pd

# Load the normalized training set from disk into the notebook-global `df`.
# NOTE(review): later cells in this notebook rebind `df` to small in-memory
# samples, so this frame is only available until those cells run.
df = pd.read_csv('data/trainingdataset_optimized_normalized.csv')
# Print the column index so the available field names are visible in the output.
print(df.columns)

Index(['Datum', 'Similarity_Score', 'Input_Message', 'Output_Message',
       'Input_JSON', 'Output_JSON', 'Input_timesort', 'json_timesort'],
      dtype='object')


In [None]:
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer
import time

# --- 1. Setup the Back-Translation Engine ---

class BackTranslator:
    """Round-trip (SL -> EN -> SL) translator used to paraphrase text.

    Two Marian models are held in memory: a forward model (Slovenian to
    English) and a backward model (English to Slovenian). Both are placed on
    the GPU when one is available, otherwise on the CPU.

    Note: ``MarianTokenizer`` needs the ``sentencepiece`` package; without it
    construction fails with an ImportError.
    """

    def __init__(self, model_name_fwd='Helsinki-NLP/opus-mt-sl-en', model_name_bwd='Helsinki-NLP/opus-mt-en-sl'):
        """
        Initializes the forward (SL->EN) and backward (EN->SL) translation models.

        Args:
            model_name_fwd (str): The name of the Slovenian to English translation model on Hugging Face.
            model_name_bwd (str): The name of the English to Slovenian translation model.
        """
        # Check for GPU availability
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # Load forward model (Slovenian to English)
        print(f"Loading forward model: {model_name_fwd}")
        self.tokenizer_fwd = MarianTokenizer.from_pretrained(model_name_fwd)
        self.model_fwd = MarianMTModel.from_pretrained(model_name_fwd).to(self.device)

        # Load backward model (English to Slovenian)
        print(f"Loading backward model: {model_name_bwd}")
        self.tokenizer_bwd = MarianTokenizer.from_pretrained(model_name_bwd)
        self.model_bwd = MarianMTModel.from_pretrained(model_name_bwd).to(self.device)

    def translate(self, texts, model, tokenizer):
        """Translate a batch of strings with the given model/tokenizer pair.

        Args:
            texts (list[str]): Strings to translate.
            model: Model exposing ``generate``.
            tokenizer: Tokenizer matching ``model``.

        Returns:
            list[str]: One decoded translation per input string.
        """
        # Pad to a common length; anything beyond 512 tokens is truncated.
        inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(self.device)

        # Generate translated tokens
        translated_tokens = model.generate(**inputs)

        # Decode the tokens back to text
        return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

    def back_translate_batch(self, texts):
        """
        Performs a round-trip translation on a batch of texts.
        SL -> EN -> SL

        Entries that are not strings (for example NaN/None coming from a
        DataFrame column) or that are blank are not sent to the models; they
        come back as empty strings in the same position.

        Args:
            texts (list): Batch of values to back-translate.

        Returns:
            list[str]: A list of the same length as ``texts``.
        """
        texts = list(texts) if texts is not None else []
        results = [""] * len(texts)

        # Only real, non-blank strings are worth translating; remember where
        # each one came from so the output stays aligned with the input.
        positions = [idx for idx, value in enumerate(texts) if isinstance(value, str) and value.strip()]
        if not positions:
            return results

        # Forward translation
        english_texts = self.translate([texts[idx] for idx in positions], self.model_fwd, self.tokenizer_fwd)

        # Backward translation
        slovenian_texts_back = self.translate(english_texts, self.model_bwd, self.tokenizer_bwd)

        for idx, translated in zip(positions, slovenian_texts_back):
            results[idx] = translated

        return results

# --- 2. Main DataFrame Processing Function ---

def augment_with_back_translation(df: pd.DataFrame, columns_to_translate: list, batch_size: int = 16, translator=None):
    """
    Applies back-translation to specified columns of a DataFrame to create augmented data.

    Inputs are validated before any model is loaded, so a misspelt column name
    or a bad batch size fails immediately instead of after the model load.

    Args:
        df (pd.DataFrame): The input DataFrame. It is not modified.
        columns_to_translate (list): A list of column names to apply back-translation to.
        batch_size (int): The number of rows to process at once. Helps manage memory.
        translator: Optional object exposing ``back_translate_batch(list) -> list``.
            When omitted a new ``BackTranslator`` is created; pass an existing
            one to reuse already-loaded models.

    Returns:
        pd.DataFrame: A copy of ``df`` (same index and columns) in which the
        requested columns hold the back-translated text.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If a requested column is not present in ``df``.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    missing_columns = [col for col in columns_to_translate if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Columns not found in DataFrame: {missing_columns}")

    # Create a copy of the original data to be the basis for our augmented data
    augmented_df = df.copy()

    total_rows = len(df)
    if total_rows == 0 or not columns_to_translate:
        # Nothing to translate: skip the (expensive) model load entirely.
        return augmented_df

    # Initialize the translation engine only when the caller did not supply one
    if translator is None:
        translator = BackTranslator()

    # Positional column indices are loop-invariant; resolve them once.
    column_positions = {col: df.columns.get_loc(col) for col in columns_to_translate}

    start_time = time.time()

    # Process the DataFrame in batches
    for start in range(0, total_rows, batch_size):
        stop = start + batch_size
        batch = df.iloc[start:stop]

        for col, col_pos in column_positions.items():
            # Perform back-translation on this column's slice of the batch
            back_translated_texts = translator.back_translate_batch(batch[col].tolist())

            # Positional assignment keeps this correct for any index labels
            augmented_df.iloc[start:stop, col_pos] = back_translated_texts

        # Progress update
        elapsed_time = time.time() - start_time
        rows_done = min(stop, total_rows)
        print(f"Processed {rows_done}/{total_rows} rows. Time elapsed: {elapsed_time:.2f}s")

    return augmented_df

# --- Example Usage ---

# 1. Create a small two-row sample DataFrame to exercise the pipeline
#    (in a real scenario, you'd load your full df).
# NOTE(review): assigning to `df` below rebinds the notebook-global name that
# the first cell used for the CSV loaded from disk.
data = {
    'Input_Message_Normalized': [
        "Štajerska avtocesta bo zaprta med priključkoma Blagovica in Vransko do pojutrišnjem.",
        "Dela bodo potekala tudi jutri."
    ],
    'Output_Message_Normalized': [
        "Zaradi del bo avtocesta med Blagovico in Vranskim zaprta do ponedeljka.",
        "Dela se bodo nadaljevala jutri."
    ],
    # Extra non-text columns, included to check that augmentation leaves them untouched
    'Datum': ['30-11-2024 07:16:00', '01-12-2024 09:00:00'],
    'Input_timesort': [1, 2]
}
df = pd.DataFrame(data)

# 2. Define which columns you want to augment
# It's best to use the final, normalized columns for this
# NOTE(review): the CSV loaded in the first cell lists 'Input_Message' and
# 'Output_Message' but no '*_Normalized' columns — confirm the names before
# running this on the real dataset.
columns_to_augment = ['Input_Message_Normalized', 'Output_Message_Normalized']

# 3. Run the augmentation
# For a large DataFrame, this will take time. Run it on a subset first to test.
# df_augmented = augment_with_back_translation(df.head(2), columns_to_augment) # Test with 2 rows
df_augmented = augment_with_back_translation(df, columns_to_augment) # Run on the full sample

# 4. Combine original and augmented data for the final training set
# ignore_index=True renumbers the rows so original and augmented rows do not share index labels.
df_final_training_set = pd.concat([df, df_augmented], ignore_index=True)

# Display the results (`display` is not imported here; it is assumed to be
# provided by the IPython/Jupyter environment).
print("\n--- Original Data ---")
display(df)

print("\n--- Augmented (Back-Translated) Data ---")
display(df_augmented)

print("\n--- Final Combined Training Set (Original + Augmented) ---")
display(df_final_training_set)

Using device: cpu
Loading forward model: Helsinki-NLP/opus-mt-sl-en


ImportError: 
MarianTokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


: 

In [None]:
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer
from sentence_transformers import SentenceTransformer, util
import time
import json
import re
import io

# --- 1. Back-Translation Engine ---

class BackTranslator:
    """Paraphrases text by translating it to English and back (SL -> EN -> SL)."""

    def __init__(self, model_name_fwd='Helsinki-NLP/opus-mt-sl-en', model_name_bwd='Helsinki-NLP/opus-mt-en-sl'):
        """Load the forward and backward Marian models onto GPU if available, else CPU."""
        has_gpu = torch.cuda.is_available()
        self.device = "cuda" if has_gpu else "cpu"
        print(f"Using device: {self.device} for translation models.")

        # Forward direction: tokenizer first, then the model moved to the device.
        self.tokenizer_fwd = MarianTokenizer.from_pretrained(model_name_fwd)
        forward_model = MarianMTModel.from_pretrained(model_name_fwd)
        self.model_fwd = forward_model.to(self.device)

        # Backward direction, loaded the same way.
        self.tokenizer_bwd = MarianTokenizer.from_pretrained(model_name_bwd)
        backward_model = MarianMTModel.from_pretrained(model_name_bwd)
        self.model_bwd = backward_model.to(self.device)

    def translate(self, texts, model, tokenizer):
        """Run one translation pass over a batch of strings and return the decoded outputs."""
        encoded = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        encoded = encoded.to(self.device)
        generated = model.generate(**encoded)
        return tokenizer.batch_decode(generated, skip_special_tokens=True)

    def back_translate_text(self, text):
        """Round-trip a single string sentence by sentence.

        Non-string or blank input yields an empty string. Otherwise the text is
        split after '.', '?' or '!' followed by whitespace, every piece is
        translated forward and back, and the pieces are joined with one space.
        """
        stripped = text.strip() if isinstance(text, str) else ""
        if not stripped:
            return ""

        pieces = re.split(r'(?<=[.?!])\s+', stripped)
        english = self.translate(pieces, self.model_fwd, self.tokenizer_fwd)
        round_trip = self.translate(english, self.model_bwd, self.tokenizer_bwd)
        return " ".join(round_trip)

    def back_translate_json(self, json_str):
        """Round-trip every string value found in a JSON document.

        Keys and non-string values are kept as they are. The result is
        re-serialised with ``ensure_ascii=False`` and ``indent=2``. If the
        input cannot be parsed (or a TypeError occurs along the way), the
        input is returned unchanged.
        """
        def convert(value):
            # Strings are leaves to translate; containers are walked recursively.
            if isinstance(value, str):
                return self.back_translate_text(value)
            if isinstance(value, list):
                return [convert(item) for item in value]
            if isinstance(value, dict):
                return {key: convert(item) for key, item in value.items()}
            return value

        try:
            parsed = json.loads(json_str)
            return json.dumps(convert(parsed), ensure_ascii=False, indent=2)
        except (json.JSONDecodeError, TypeError):
            # If JSON is invalid, return it as is.
            return json_str

# --- 2. Similarity Score Calculator ---

class SimilarityCalculator:
    """Scores how similar each row's input text is to its output text.

    Texts are embedded with a SentenceTransformer model and compared with
    cosine similarity.
    """

    def __init__(self, model_name='paraphrase-multilingual-MiniLM-L12-v2'):
        """Load the sentence-embedding model onto GPU if available, else CPU.

        Args:
            model_name (str): SentenceTransformer model identifier.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device} for similarity model.")
        self.model = SentenceTransformer(model_name, device=self.device)

    def get_text_from_json(self, json_str):
        """Extracts all text content from a JSON string.

        Walks dicts and lists recursively and collects every string value
        (keys and non-string values are skipped).

        Args:
            json_str: JSON document as a string.

        Returns:
            str: The collected strings joined by single spaces, or "" when the
            input is not valid JSON / not a string.
        """
        try:
            data = json.loads(json_str)
        except (json.JSONDecodeError, TypeError):
            return ""

        text_parts = []

        # Recursive helper
        def extract(node):
            if isinstance(node, dict):
                for value in node.values():
                    extract(value)
            elif isinstance(node, list):
                for item in node:
                    extract(item)
            elif isinstance(node, str):
                text_parts.append(node)

        extract(data)
        return " ".join(text_parts)

    def _combined_texts(self, df_batch, message_col, json_col):
        """Return ``message + " " + text extracted from JSON`` for every row as a list."""
        # Missing / non-string messages (e.g. NaN) count as empty text so the
        # encoder never receives a float.
        messages = df_batch[message_col].map(lambda value: value if isinstance(value, str) else "")
        json_texts = df_batch[json_col].apply(self.get_text_from_json)
        return (messages + " " + json_texts).tolist()

    def calculate_scores(self, df_batch):
        """Calculates similarity for a batch DataFrame.

        Args:
            df_batch (pd.DataFrame): Must contain 'Input_Message', 'Input_JSON',
                'Output_Message' and 'Output_JSON'.

        Returns:
            list[float]: One cosine similarity per row, in row order. An empty
            batch yields an empty list without calling the model.
        """
        if len(df_batch) == 0:
            return []

        # Consolidate all text from Input and Output for a comprehensive comparison
        input_texts = self._combined_texts(df_batch, 'Input_Message', 'Input_JSON')
        output_texts = self._combined_texts(df_batch, 'Output_Message', 'Output_JSON')

        # Encode the texts
        embeddings1 = self.model.encode(input_texts, convert_to_tensor=True, show_progress_bar=False)
        embeddings2 = self.model.encode(output_texts, convert_to_tensor=True, show_progress_bar=False)

        # Only row i vs row i is needed, so compare row-wise instead of
        # building the full N x N similarity matrix and reading its diagonal.
        row_scores = torch.nn.functional.cosine_similarity(embeddings1, embeddings2, dim=1)
        return row_scores.tolist()

# --- 3. Main DataFrame Augmentation Function ---

def augment_dataframe(df: pd.DataFrame, batch_size: int = 8, translator=None, scorer=None):
    """
    Performs back-translation on text and JSON fields, then recalculates similarity scores.

    The message columns ('Input_Message', 'Output_Message') and the JSON
    columns ('Input_JSON', 'Output_JSON') are back-translated batch by batch,
    and 'Similarity_Score' is overwritten with a score computed on the
    translated text. Inputs are validated before any model is loaded.

    Args:
        df (pd.DataFrame): Source rows. It is not modified.
        batch_size (int): Number of rows handled per iteration.
        translator: Optional object exposing ``back_translate_text`` and
            ``back_translate_json``. Defaults to a new ``BackTranslator``.
        scorer: Optional object exposing ``calculate_scores(batch_df)``.
            Defaults to a new ``SimilarityCalculator``.

    Returns:
        pd.DataFrame: The augmented rows, keeping the index labels of ``df``.
        An empty input yields an empty frame instead of an error.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If a required column is missing from ``df``.
    """
    text_columns = ('Input_Message', 'Output_Message')
    json_columns = ('Input_JSON', 'Output_JSON')

    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    missing_columns = [col for col in text_columns + json_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Columns not found in DataFrame: {missing_columns}")

    total_rows = len(df)
    if total_rows == 0:
        # pd.concat([]) raises; hand back an empty frame with the expected
        # columns and skip loading the models.
        empty_result = df.copy()
        if 'Similarity_Score' not in empty_result.columns:
            empty_result['Similarity_Score'] = pd.Series(dtype='float64')
        return empty_result

    if translator is None:
        translator = BackTranslator()
    if scorer is None:
        scorer = SimilarityCalculator()

    augmented_batches = []
    start_time = time.time()

    for start in range(0, total_rows, batch_size):
        # Work on a copy so the caller's frame is never modified.
        batch_df = df.iloc[start:start + batch_size].copy()

        # --- Back-Translate ---
        for col in text_columns:
            batch_df[col] = batch_df[col].apply(translator.back_translate_text)
        for col in json_columns:
            batch_df[col] = batch_df[col].apply(translator.back_translate_json)

        # --- Recalculate Similarity --- (on the translated text, not the original)
        batch_df['Similarity_Score'] = scorer.calculate_scores(batch_df)

        augmented_batches.append(batch_df)

        # Progress update
        elapsed_time = time.time() - start_time
        rows_done = min(start + batch_size, total_rows)
        print(f"Processed {rows_done}/{total_rows} rows. Time elapsed: {elapsed_time:.2f}s")

    return pd.concat(augmented_batches)

# --- Example Usage ---

# 1. Create a one-row sample DataFrame with the same columns as the training CSV.
# The JSON fields are CSV-quoted, so every inner double quote is doubled ("").
# NOTE(review): this rebinds the notebook-global `df` to the sample frame.
csv_data = """Datum,Similarity_Score,Input_Message,Output_Message,Input_JSON,Output_JSON,Input_timesort,json_timesort
"30-11-2024 07:16:00",0.9,"Štajerska avtocesta bo zaprta med priključkoma Blagovica in Vransko.","Zaradi del bo avtocesta med Blagovico in Vranskim zaprta.","{""Popolne zapore"": ""Štajerska avtocesta bo zaprta med priključkoma Blagovica in Vransko."", ""Opozorila"": ""Obvoz je urejen.""}","{""Zastoji"": ""Na avtocesti med Blagovico in Vranskim je zapora."", ""leftover_html"": ""Voznikom priporočamo obvoz.""}","1","2"
"""
# Parse the in-memory CSV text; io.StringIO makes the string readable like a file.
df = pd.read_csv(io.StringIO(csv_data))

# 2. Run the full augmentation pipeline (on the sample)
# This back-translates the message and JSON columns and recomputes Similarity_Score.
df_augmented = augment_dataframe(df, batch_size=4)

# 3. Combine original and new data
# ignore_index=True renumbers the rows so original and augmented rows do not share index labels.
df_final_training_set = pd.concat([df, df_augmented], ignore_index=True)

# 4. Display the results (`display` is not imported here; it is assumed to be
# provided by the IPython/Jupyter environment).
print("\n--- Original Data ---")
display(df)

print("\n--- Augmented (Back-Translated) Data with New Similarity Score ---")
display(df_augmented)

print("\n--- Final Combined Training Set (Original + Augmented) ---")
display(df_final_training_set)