In [1]:
import pandas as pd

df = pd.read_csv('trainingdataset_optimized_normalized.csv')
print(df.columns)

Index(['Datum', 'Similarity_Score', 'Input_Message', 'Output_Message',
       'Input_JSON', 'Output_JSON', 'Input_timesort', 'json_timesort'],
      dtype='object')


In [None]:
import pandas as pd
import torch
from easynmt import EasyNMT
from sentence_transformers import SentenceTransformer, util
import time
import json
import re
import io
import warnings
import nltk
nltk.download('punkt_tab')


# NOTE: Before running, you need to install the easynmt library:
# pip install easynmt

# --- 1. Back-Translation Engine (with EasyNMT) ---
# This class now uses the EasyNMT library for a simpler and more robust translation experience.

class BackTranslator:
    """
    Manages the back-translation process using the EasyNMT library.
    EasyNMT automatically downloads and caches the necessary translation models (like Opus-MT).
    """
    def __init__(self):
        """
        Initializes the EasyNMT model. It will use the GPU if available.
        EasyNMT wraps many models; 'opus-mt' is efficient and high-quality.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Attempting to use device: {self.device} for translation.")
        
        try:
            # Initialize EasyNMT. It will download the opus-mt model for the
            # required language pairs on its first run.
            self.model = EasyNMT('opus-mt')
            print("EasyNMT model loaded successfully.")
        except Exception as e:
            print(f"Error loading EasyNMT model: {e}")
            print("Please check your internet connection and if 'easynmt' is installed correctly.")
            self.model = None

    def back_translate_text(self, text):
        """
        Back-translates a single string (Slovenian -> English -> Slovenian).
        It handles multiple sentences by splitting, translating, and rejoining.
        """
        if not self.model or not isinstance(text, str) or not text.strip():
            return ""
        
        # Splitting into sentences often improves translation quality.
        sentences = re.split(r'(?<=[.?!])\s+', text.strip())
        
        # First pass: Translate from Slovenian to English
        # The translate method can take a list of sentences directly.
        intermediate_translation = self.model.translate(
            sentences,
            source_lang='sl',
            target_lang='en'
        )
        
        # Second pass: Translate from English back to Slovenian
        back_translated_sentences = self.model.translate(
            intermediate_translation,
            source_lang='en',
            target_lang='sl'
        )
        
        return " ".join(back_translated_sentences)

    def back_translate_json(self, json_str):
        """
        Recursively back-translates all string values within a JSON string,
        leaving keys and non-string values untouched.
        """
        if not self.model:
            return json_str
            
        try:
            json_obj = json.loads(json_str)
            
            # Helper function to recursively traverse the JSON object
            def traverse_and_translate(node):
                if isinstance(node, dict):
                    return {k: traverse_and_translate(v) for k, v in node.items()}
                elif isinstance(node, list):
                    return [traverse_and_translate(elem) for elem in node]
                elif isinstance(node, str):
                    # If it's a string, perform back-translation
                    return self.back_translate_text(node)
                else:
                    return node # Return non-string values as is
            
            translated_obj = traverse_and_translate(json_obj)
            return json.dumps(translated_obj, ensure_ascii=False, indent=2)
        
        except (json.JSONDecodeError, TypeError):
            # If input is not valid JSON, return it without changes.
            return json_str

# --- 2. Similarity Score Calculator ---

class SimilarityCalculator:
    """
    Calculates the semantic similarity between two texts using sentence-transformers.
    This class remains unchanged as its logic is independent of the translation library.
    """
    def __init__(self, model_name='paraphrase-multilingual-MiniLM-L12-v2'):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device} for similarity model.")
        self.model = SentenceTransformer(model_name, device=self.device)
        print("Similarity model loaded successfully.")

    def get_text_from_json(self, json_str):
        """Extracts and concatenates all text content from a JSON string."""
        try:
            data = json.loads(json_str)
            text_parts = []
            
            def extract(node):
                if isinstance(node, dict):
                    for value in node.values(): extract(value)
                elif isinstance(node, list):
                    for item in node: extract(item)
                elif isinstance(node, str):
                    text_parts.append(node)
            
            extract(data)
            return " ".join(text_parts)
        except (json.JSONDecodeError, TypeError):
            return ""

    def calculate_scores(self, df_batch):
        """Calculates cosine similarity for a DataFrame batch."""
        input_texts = df_batch['Input_Message'] + " " + df_batch['Input_JSON'].apply(self.get_text_from_json)
        output_texts = df_batch['Output_Message'] + " " + df_batch['Output_JSON'].apply(self.get_text_from_json)

        embeddings1 = self.model.encode(input_texts.tolist(), convert_to_tensor=True, show_progress_bar=False)
        embeddings2 = self.model.encode(output_texts.tolist(), convert_to_tensor=True, show_progress_bar=False)

        cosine_scores = util.cos_sim(embeddings1, embeddings2)
        
        return [cosine_scores[i][i].item() for i in range(len(df_batch))]

# --- 3. Main DataFrame Augmentation Function ---

def augment_dataframe(df: pd.DataFrame, batch_size: int = 8):
    """
    Applies the full augmentation pipeline to a DataFrame.
    Processing is done in batches for memory efficiency.
    """
    translator = BackTranslator()
    if not translator.model:
        print("Cannot proceed with augmentation due to model loading failure.")
        return pd.DataFrame()

    scorer = SimilarityCalculator()
    
    augmented_rows = []
    total_rows = len(df)
    start_time = time.time()

    print(f"\nStarting augmentation for {total_rows} rows with batch size {batch_size}...")

    for i in range(0, total_rows, batch_size):
        batch_df = df.iloc[i:i+batch_size].copy()
        
        # --- Step 1: Back-Translate ---
        batch_df['Input_Message'] = batch_df['Input_Message'].apply(translator.back_translate_text)
        batch_df['Output_Message'] = batch_df['Output_Message'].apply(translator.back_translate_text)
        batch_df['Input_JSON'] = batch_df['Input_JSON'].apply(translator.back_translate_json)
        batch_df['Output_JSON'] = batch_df['Output_JSON'].apply(translator.back_translate_json)
        
        # --- Step 2: Recalculate Similarity Score ---
        batch_df['Similarity_Score'] = scorer.calculate_scores(batch_df)
        
        augmented_rows.append(batch_df)
        
        elapsed_time = time.time() - start_time
        rows_done = min(i + batch_size, total_rows)
        print(f"Processed {rows_done}/{total_rows} rows. Time elapsed: {elapsed_time:.2f}s")

    return pd.concat(augmented_rows)

# --- Example Usage ---

# 1. Create a sample DataFrame mimicking your data structure.
csv_data = """Datum,Similarity_Score,Input_Message,Output_Message,Input_JSON,Output_JSON,Input_timesort,json_timesort
"30-11-2024 07:16:00",0.9,"Štajerska avtocesta bo zaprta med priključkoma Blagovica in Vransko.","Zaradi del bo avtocesta med Blagovico in Vranskim zaprta.","{""Popolne zapore"": ""Štajerska avtocesta bo zaprta med priključkoma Blagovica in Vransko."", ""Opozorila"": ""Obvoz je urejen.""}","{""Zastoji"": ""Na avtocesti med Blagovico in Vranskim je zapora."", ""leftover_html"": ""Voznikom priporočamo obvoz.""}","1","2"
"30-11-2024 09:30:00",0.85,"Na primorski avtocesti je pred počivališčem Ravbarkomanda oviran promet.","Promet je oviran na avtocesti proti Kopru.","{""Oviran promet"": ""Primorska avtocesta je ovirana pred Ravbarkomando proti Kopru."", ""Priporočila"": ""Vozite previdno.""}","{""Stanje na cestah"": ""Zaradi nesreče je na primorski avtocesti ovira."", ""Dodatno"": ""Pričakuje se daljši potovalni čas.""}","3","4"
"""
df_original = pd.read_csv(io.StringIO(csv_data))

# 2. Run the full augmentation pipeline.
df_augmented = augment_dataframe(df_original, batch_size=4)

# 3. Combine original and new data.
if not df_augmented.empty:
    df_final_training_set = pd.concat([df_original, df_augmented], ignore_index=True)

    # 4. Display the results for verification.
    print("\n" + "="*50)
    print("--- 1. Original Data ---")
    print("="*50)
    print(df_original.to_string())

    print("\n" + "="*50)
    print("--- 2. Augmented (Back-Translated) Data ---")
    print("--- (with newly calculated similarity scores) ---")
    print("="*50)
    print(df_augmented.to_string())

    print("\n" + "="*50)
    print("--- 3. Final Combined Training Set (Original + Augmented) ---")
    print("="*50)
    print(df_final_training_set.to_string())
else:
    print("\nAugmentation could not be completed.")


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/filipturk/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Attempting to use device: cuda for translation.
EasyNMT model loaded successfully.
Using device: cuda for similarity model.
Similarity model loaded successfully.

Starting augmentation for 2 rows with batch size 4...


OSError: Helsinki-NLP/opus-mt-sl-en is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

: 