<a href="https://colab.research.google.com/github/dope232/GenAI-Project/blob/main/Pranav_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found there.
# Note: the loop variables (dirname, filenames, filename) stay defined in the notebook
# namespace after this cell runs.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install third-party dependencies used by the preprocessing cell below.
# NOTE(review): that cell also imports `indic_transliteration`, which is not installed
# here; its `pip install indic-transliteration` lives in the last cell of this notebook,
# so on a fresh kernel that install has to run before the imports.
!pip install indic-nlp-library --quiet

!pip install langchain langchain_community faiss-cpu tqdm pandas numpy torch transformers --quiet

[0m[31mERROR: Could not find a version that satisfies the requirement indic-nlp-library (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for indic-nlp-library[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement langchain_community (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for langchain_community[0m[31m
[0m

In [None]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import SchemeMap, SCHEMES, transliterate
import json
import re
import pandas as pd
import numpy as np
import os
import gc
from tqdm.notebook import tqdm
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import pipeline, AutoModelForCausalLM, T5ForConditionalGeneration, AutoTokenizer
import time

# Only touch the GPU backend switches when a CUDA device is actually present.
if torch.cuda.is_available():
    # Enable cuDNN benchmark mode.
    torch.backends.cudnn.benchmark = True
    # Allow TF32 in cuDNN ops and in CUDA matmuls: faster for some operations,
    # at the cost of slightly less precise computation.
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cuda.matmul.allow_tf32 = True

def load_movie_dialogues(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)



def transliterate_to_devanagari(text):
    """Transliterate ``text`` from the ITRANS scheme to Devanagari.

    A single all-uppercase token is returned unchanged. Any exception raised along
    the way is printed and the original ``text`` is returned instead.
    """
    try:
        is_single_upper_token = text.isupper() and len(text.split()) == 1
        if not is_single_upper_token:
            return transliterate(text, sanscript.ITRANS, sanscript.DEVANAGARI)
    except Exception as e:
        print(f"Transliteration error: {e}")
    # Reached for the uppercase-token shortcut and after a reported failure.
    return text


class DialogueSceneDataset(Dataset):
    """Minimal map-style ``Dataset`` that exposes a collection of scenes by index."""

    def __init__(self, scenes):
        # The caller's collection is stored as-is; nothing is copied.
        self.scenes = scenes

    def __getitem__(self, idx):
        """Return the scene stored at position ``idx``."""
        return self.scenes[idx]

    def __len__(self):
        """Return the number of scenes held."""
        return len(self.scenes)


def extract_dialogue_turns(conversation):
    """Split a raw conversation into per-speaker dialogue turns.

    A line made up solely of uppercase ASCII letters (``^[A-Z]+$``) is treated as a
    speaker label. The non-empty lines that follow, up to the next label, are joined
    with single spaces to form that speaker's text. Text before the first label is
    discarded, and a label with no following text produces no turn.

    Args:
        conversation: Newline-separated dialogue text. ``None`` or an empty string
            yields an empty list instead of raising.

    Returns:
        A list of dicts with the keys ``speaker``, ``text_roman`` and
        ``text_devanagari`` (the latter from ``transliterate_to_devanagari``).
    """
    # Scenes may carry a null/empty conversation; treat that as "no turns".
    if not conversation:
        return []

    dialogue_turns = []

    def append_turn(speaker, parts):
        # Single place that turns an accumulated (speaker, lines) pair into a record.
        if speaker and parts:
            text = " ".join(parts)
            dialogue_turns.append({
                "speaker": speaker,
                "text_roman": text,
                "text_devanagari": transliterate_to_devanagari(text)
            })

    current_speaker = None
    pending_lines = []

    for raw_line in conversation.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        if re.match(r'^[A-Z]+$', line):
            # A new speaker label closes the previous speaker's turn.
            append_turn(current_speaker, pending_lines)
            current_speaker = line
            pending_lines = []
        else:
            pending_lines.append(line)

    # Flush whatever the last speaker said.
    append_turn(current_speaker, pending_lines)

    return dialogue_turns


def generate_scene_descriptions_batch(dialogue_batch, text_generator, max_batch_size=8):
    """Generate one scene description per dialogue with a text-generation callable.

    The dialogues are processed in sub-batches of at most ``max_batch_size`` prompts.
    Each dialogue is rendered as ``"SPEAKER: text"`` lines, cut to 512 characters and
    embedded in a fixed instruction prompt.

    Args:
        dialogue_batch: List of dialogues; each dialogue is a list of turn dicts with
            ``speaker`` and ``text_roman`` keys.
        text_generator: Callable invoked as
            ``text_generator(prompts, max_length=150, batch_size=len(prompts))``. Each
            returned item is either a dict with ``generated_text`` or a list whose
            first element is such a dict.
        max_batch_size: Upper bound on prompts per call; values below 1 are treated
            as 1.

    Returns:
        A list with exactly one entry per input dialogue, in order. If a sub-batch
        fails (generation error, malformed output, or wrong number of outputs), every
        dialogue in that sub-batch gets ``"Scene description not available."``.
    """
    max_chars = 512
    step = max(1, max_batch_size)
    results = []

    # Process in smaller batches to avoid CUDA memory issues
    for start in range(0, len(dialogue_batch), step):
        sub_batch = dialogue_batch[start:start + step]

        prompts = []
        for dialogue_turns in sub_batch:
            dialogue_text = "\n".join(
                f"{turn['speaker']}: {turn['text_roman']}" for turn in dialogue_turns
            )
            # Character-based truncation keeps the prompt short.
            dialogue_text = dialogue_text[:max_chars]
            prompts.append(
                f"Based on this Hindi movie dialogue, describe the scene, setting, and emotional context in one paragraph:\n\n{dialogue_text}\n\nScene description:"
            )

        try:
            batch_results = text_generator(prompts, max_length=150, batch_size=len(prompts))

            # Collect into a local list first so that a failure halfway through never
            # leaves partial entries in ``results`` (which would break alignment).
            sub_results = []
            for result in batch_results:
                if isinstance(result, list):
                    result = result[0]
                sub_results.append(result['generated_text'])

            if len(sub_results) != len(sub_batch):
                raise ValueError(
                    f"expected {len(sub_batch)} outputs, got {len(sub_results)}"
                )
        except Exception as e:
            print(f"Scene description generation error in batch: {e}")
            # Placeholder for every dialogue of the failed sub-batch.
            sub_results = ["Scene description not available."] * len(sub_batch)

        results.extend(sub_results)

    return results

# Function to extract emotion tags from dialogue content
def extract_emotion_tags(dialogue_turns):
    """Tag a dialogue with emotions based on romanized Hindi keywords.

    The ``text_roman`` of every turn is lower-cased and joined into one string; an
    emotion is reported when any of its patterns is found in that string.

    Args:
        dialogue_turns: Iterable of turn dicts carrying a ``text_roman`` string.

    Returns:
        List of emotion names without duplicates, always in the order the emotions
        are declared in the pattern table (stable across runs).

    Note:
        Patterns are matched as plain substrings, so a keyword also matches inside a
        longer word.
    """
    # Common Hindi emotional phrases and their corresponding tags
    emotion_patterns = {
        'angry': [r'gussa', r'naraz', r'krodh', r'chillana'],
        'happy': [r'khush', r'maza', r'anand', r'hasi', r'muskurana'],
        'sad': [r'dukh', r'udas', r'rona', r'aansu'],
        'surprised': [r'achanak', r'hairani', r'aashcharya'],
        'scared': [r'dar', r'bhay', r'ghabrana'],
        'romantic': [r'pyar', r'ishq', r'mohabbat', r'prem'],
        'serious': [r'gambhir', r'sanjeeda'],
        'confused': [r'confusion', r'samajh', r'uljhan']
    }

    full_text = " ".join(turn["text_roman"].lower() for turn in dialogue_turns)

    # Each emotion can appear at most once, so no set-based de-duplication is needed;
    # skipping it keeps the result order deterministic.
    return [
        emotion
        for emotion, patterns in emotion_patterns.items()
        if any(re.search(pattern, full_text, re.IGNORECASE) for pattern in patterns)
    ]

# Function to extract context tags based on dialogue content
def extract_context_tags(dialogue_turns):
    """Tag a dialogue with topical contexts based on romanized Hindi keywords.

    The ``text_roman`` of every turn is lower-cased and joined into one string; a
    context is reported when any of its patterns is found in that string.

    Args:
        dialogue_turns: Iterable of turn dicts carrying a ``text_roman`` string.

    Returns:
        List of context names without duplicates, always in the order the contexts
        are declared in the pattern table (stable across runs).

    Note:
        Patterns are matched as plain substrings, so a keyword also matches inside a
        longer word (``bus`` is found in ``business``).
    """
    # Common Hindi contextual phrases and their corresponding tags
    context_patterns = {
        'family': [r'maa', r'baap', r'papa', r'mata', r'pita', r'bhai', r'behen', r'chacha', r'chachi', r'parivar'],
        'romance': [r'pyar', r'ishq', r'mohabbat', r'prem', r'dil', r'shadi'],
        'friendship': [r'dost', r'yaar', r'mitra', r'saathi'],
        'work': [r'kaam', r'naukri', r'office', r'boss', r'business'],
        'education': [r'school', r'college', r'padhai', r'kitab', r'teacher', r'professor'],
        'conflict': [r'ladai', r'jhagda', r'bahas', r'jung'],
        'food': [r'khana', r'restaurant', r'bhojan', r'roti', r'chai'],
        'travel': [r'safar', r'yatra', r'ghoomna', r'train', r'bus', r'car'],
        'shopping': [r'bazaar', r'dukan', r'kharidna', r'price', r'kimat'],
        'health': [r'bimari', r'doctor', r'hospital', r'tabiyat', r'dawa']
    }

    full_text = " ".join(turn["text_roman"].lower() for turn in dialogue_turns)

    # Each context can appear at most once, so no set-based de-duplication is needed;
    # skipping it keeps the result order deterministic.
    return [
        context
        for context, patterns in context_patterns.items()
        if any(re.search(pattern, full_text, re.IGNORECASE) for pattern in patterns)
    ]

# Function to extract characters and their relationships
def extract_character_relationships(dialogue_turns):
    """Summarize who speaks next to whom in a dialogue.

    Two consecutive turns by different speakers count as one interaction between
    that (unordered) pair of speakers.

    Args:
        dialogue_turns: Sequence of turn dicts carrying a ``speaker`` key.

    Returns:
        Dict mapping every speaker to a list of
        ``{"character": other_speaker, "interaction_count": n}`` entries. Speakers
        appear in order of first appearance and entries in the order the pairs were
        first seen, so the result is stable across runs.
    """
    # dict.fromkeys de-duplicates while keeping first-appearance order (a set would
    # make the key order depend on string hashing).
    unique_speakers = list(dict.fromkeys(turn["speaker"] for turn in dialogue_turns))

    # Count speaker interactions between adjacent turns.
    interactions = {}
    for current_turn, next_turn in zip(dialogue_turns, dialogue_turns[1:]):
        current_speaker = current_turn["speaker"]
        next_speaker = next_turn["speaker"]
        if current_speaker != next_speaker:
            pair = tuple(sorted([current_speaker, next_speaker]))
            interactions[pair] = interactions.get(pair, 0) + 1

    # Distribute every counted pair to both of its members.
    relationships = {speaker: [] for speaker in unique_speakers}
    for (first, second), count in interactions.items():
        relationships[first].append({"character": second, "interaction_count": count})
        relationships[second].append({"character": first, "interaction_count": count})

    return relationships

# Generate a default scene description based on emotion and context tags
def generate_default_scene_description(emotion_tags, context_tags, dialogue_turns):
    """Build a template-based scene description from tags and speakers.

    Args:
        emotion_tags: List of emotion names; ``"neutral"`` is used when empty.
        context_tags: List of context names; ``"general conversation"`` is used when
            empty.
        dialogue_turns: Iterable of turn dicts carrying a ``speaker`` key.

    Returns:
        A description naming up to three speakers (in order of first appearance)
        plus a count of the remaining ones.
    """
    # De-duplicate in first-appearance order so the three named speakers are the same
    # on every run (a set would make the choice depend on string hashing).
    speakers = list(dict.fromkeys(turn["speaker"] for turn in dialogue_turns))
    speaker_str = ", ".join(speakers[:3])
    if len(speakers) > 3:
        speaker_str += f" and {len(speakers) - 3} others"

    emotion_str = ", ".join(emotion_tags) if emotion_tags else "neutral"
    context_str = ", ".join(context_tags) if context_tags else "general conversation"

    return f"A {emotion_str} scene involving {speaker_str} in a {context_str} context. The characters are engaged in dialogue that reveals their relationships and intentions."

# Process a batch of scenes
def process_batch(batch, use_pipeline=False, text_generator=None):
    """Turn a batch of raw scene dicts into ``Document`` objects.

    Scenes whose ``Conversation`` yields no dialogue turns are skipped. For the
    remaining scenes the page content interleaves the romanized and Devanagari form
    of every turn, and the metadata carries ids, tags, speakers, relationships and
    the individual turns.

    Args:
        batch: Iterable of scene dicts read via ``ID``, ``Movie`` and ``Conversation``.
        use_pipeline: Whether model-generated scene descriptions should be used.
        text_generator: Generator handed to ``generate_scene_descriptions_batch``;
            only consulted when ``use_pipeline`` is truthy.

    Returns:
        List of ``Document`` objects, one per scene that produced dialogue turns.
    """
    # Pass 1: parse each scene and keep (scene_id, movie_id, turns) for usable ones.
    parsed_scenes = []
    for scene in batch:
        turns = extract_dialogue_turns(scene.get("Conversation", ""))
        if turns:
            parsed_scenes.append(
                (scene.get("ID", "unknown"), scene.get("Movie", "unknown"), turns)
            )

    all_turns = [turns for _, _, turns in parsed_scenes]

    # Default to "no generated description"; replaced only by a successful batch call.
    scene_descriptions = [None] * len(all_turns)
    if use_pipeline and text_generator and all_turns:
        try:
            scene_descriptions = generate_scene_descriptions_batch(all_turns, text_generator)
        except Exception as e:
            print(f"Failed batch scene description generation: {e}")

    # Pass 2: derive metadata and assemble one document per parsed scene.
    processed_docs = []
    for position, (scene_id, movie_id, turns) in enumerate(parsed_scenes):
        emotion_tags = extract_emotion_tags(turns)
        context_tags = extract_context_tags(turns)

        # A missing or empty generated description falls back to the template one.
        generated = scene_descriptions[position] if use_pipeline else None
        if generated:
            scene_description = generated
        else:
            scene_description = generate_default_scene_description(emotion_tags, context_tags, turns)

        character_relationships = extract_character_relationships(turns)

        # Render the three textual views of the dialogue in a single sweep.
        roman_lines = []
        devanagari_lines = []
        combined_lines = []
        for turn in turns:
            roman_lines.append(f"{turn['speaker']}: {turn['text_roman']}")
            devanagari_lines.append(f"{turn['speaker']}: {turn['text_devanagari']}")
            combined_lines.append(
                f"{turn['speaker']}: {turn['text_roman']}\n{turn['speaker']} (देवनागरी): {turn['text_devanagari']}"
            )

        metadata = {
            "scene_id": scene_id,
            "movie_id": movie_id,
            "scene_description": scene_description,
            "speakers": [turn["speaker"] for turn in turns],
            "emotion_tags": emotion_tags,
            "context_tags": context_tags,
            "character_relationships": character_relationships,
            "roman_dialogue": "\n".join(roman_lines),
            "devanagari_dialogue": "\n".join(devanagari_lines),
            "dialogue_turns": [
                {
                    "speaker": turn["speaker"],
                    "text_roman": turn["text_roman"],
                    "text_devanagari": turn["text_devanagari"]
                }
                for turn in turns
            ]
        }

        processed_docs.append(
            Document(page_content="\n".join(combined_lines), metadata=metadata)
        )

    return processed_docs

# Main preprocessing function with GPU optimizations
def preprocess_movie_dialogues(data, use_pipeline=False, batch_size=16, num_workers=2):
    """Convert raw scene records into processed documents, batch by batch.

    Args:
        data: Sequence of scene records; wrapped in ``DialogueSceneDataset`` and
            iterated through a ``DataLoader``.
        use_pipeline: When true, try to load a ``google/flan-t5-small``
            text2text-generation pipeline for scene descriptions. If loading fails
            the error is printed and processing continues without it.
        batch_size: Number of scenes per batch handed to ``process_batch``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        List of the documents returned by ``process_batch`` over all batches.
    """
    # Create dataset and dataloader for batched iteration
    dataset = DialogueSceneDataset(data)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        collate_fn=lambda x: x  # Don't collate, just return list of scenes
    )

    processed_documents = []

    # Set up pipeline if needed
    text_generator = None
    if use_pipeline:
        try:
            on_gpu = torch.cuda.is_available()
            # Pass only an explicit `device`: combining it with device_map="auto"
            # makes pipeline() raise ("loaded with accelerate ... cannot be moved to
            # a specific device"). Half precision is requested on GPU only.
            text_generator = pipeline(
                'text2text-generation',
                model='google/flan-t5-small',
                device=0 if on_gpu else -1,
                model_kwargs={"torch_dtype": torch.float16} if on_gpu else {}
            )
        except Exception as e:
            print(f"Failed to load text generation pipeline: {e}")
            use_pipeline = False

    # Process in batches
    print(f"Processing {len(data)} scenes in batches of {batch_size}...")
    start_time = time.time()

    for i, batch in enumerate(tqdm(dataloader, desc="Processing batches")):
        batch_docs = process_batch(batch, use_pipeline, text_generator)
        processed_documents.extend(batch_docs)

        # Periodically clear CUDA cache to avoid memory issues
        if torch.cuda.is_available() and i % 10 == 0:
            torch.cuda.empty_cache()
            gc.collect()

    elapsed_time = time.time() - start_time
    print(f"Preprocessing complete! Processed {len(processed_documents)} scenes in {elapsed_time:.2f} seconds")
    # Guard the average: no scene may have produced a document.
    if processed_documents:
        print(f"Average time per scene: {elapsed_time/len(processed_documents):.2f} seconds")

    return processed_documents

# Function to save processed documents to JSON with incremental saving
def save_processed_data(processed_documents, output_file, batch_size=1000):
    """Write the processed documents to ``output_file`` as one JSON array.

    Documents are serialized one at a time and streamed to the file, so the whole
    array is never held in memory as a single string. The file is always a complete,
    valid JSON array: ``[]`` for no documents, and properly closed whether the
    documents fit in one batch or many.

    Args:
        processed_documents: Sequence of objects exposing ``page_content`` and a
            ``metadata`` dict.
        output_file: Destination path; overwritten if it exists.
        batch_size: Number of documents handled per progress step; values below 1
            are treated as 1.
    """
    print(f"Saving {len(processed_documents)} documents to {output_file}...")

    batch_size = max(1, batch_size)
    total_batches = (len(processed_documents) + batch_size - 1) // batch_size

    def to_serializable(doc):
        # Build a JSON-friendly copy of one document.
        serializable_metadata = {}
        for key, value in doc.metadata.items():
            if key == "dialogue_turns":
                # Keep only the three known fields of every turn.
                serializable_metadata[key] = [
                    {
                        "speaker": turn["speaker"],
                        "text_roman": turn["text_roman"],
                        "text_devanagari": turn["text_devanagari"]
                    } for turn in value
                ]
            elif key == "character_relationships":
                # Non-dict entries are converted through their attribute dict.
                serializable_metadata[key] = {
                    k: [vars(item) if not isinstance(item, dict) else item for item in v]
                    for k, v in value.items()
                }
            else:
                serializable_metadata[key] = value
        return {
            "page_content": doc.page_content,
            "metadata": serializable_metadata
        }

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("[\n")
        wrote_any = False

        for batch_idx in tqdm(range(total_batches), desc="Saving batches"):
            start_idx = batch_idx * batch_size
            batch = processed_documents[start_idx:start_idx + batch_size]

            for doc in batch:
                # The separator goes *before* every document except the first, so
                # the last document never gets a trailing comma.
                if wrote_any:
                    f.write(",\n")
                f.write(json.dumps(to_serializable(doc), ensure_ascii=False, indent=2))
                wrote_any = True

            # Release per-batch temporaries
            gc.collect()

        if wrote_any:
            f.write("\n")
        # Always close the array, regardless of how many batches were written.
        f.write("]\n")

# Function to create vector store for RAG with GPU acceleration
def create_vector_store(processed_documents, index_name, batch_size=500):
    """Embed the documents, build a FAISS vector store and save it locally.

    Embeddings come from ``ai4bharat/indic-bert`` through ``HuggingFaceEmbeddings``.
    CUDA is used only when a GPU is available and device 0 reports more than 8e9
    bytes of total memory; otherwise the CPU is used. Documents are indexed in
    slices of ``batch_size`` and the per-slice stores are merged into one.

    Args:
        processed_documents: Non-empty sequence of documents to index.
        index_name: Path/name passed to ``save_local``.
        batch_size: Number of documents embedded per slice.

    Returns:
        The merged FAISS vector store (also saved under ``index_name``).

    Raises:
        ValueError: If ``processed_documents`` is empty.
    """
    # Fail fast, before loading the embedding model: with no documents there is no
    # store to build or save.
    if len(processed_documents) == 0:
        raise ValueError("Cannot create a vector store from an empty list of documents.")

    # Determine if we have enough GPU memory
    use_gpu = torch.cuda.is_available() and torch.cuda.get_device_properties(0).total_memory > 8e9  # 8GB

    device = 'cuda' if use_gpu else 'cpu'
    print(f"Creating FAISS index on {device}...")

    # Create embeddings
    embeddings = HuggingFaceEmbeddings(
        model_name="ai4bharat/indic-bert",
        model_kwargs={"device": device},
        encode_kwargs={"device": device, "batch_size": 32}
    )

    # Process in batches to manage memory
    vector_store = None

    for i in tqdm(range(0, len(processed_documents), batch_size), desc="Creating vector index batches"):
        batch = processed_documents[i:i+batch_size]

        if vector_store is None:
            # First batch, create the store
            vector_store = FAISS.from_documents(batch, embeddings)
        else:
            # Subsequent batches: build a temporary store and merge it in
            batch_vector_store = FAISS.from_documents(batch, embeddings)
            vector_store.merge_from(batch_vector_store)

        # Clear GPU memory
        torch.cuda.empty_cache()
        gc.collect()

    # Save the index
    print(f"Saving vector store to {index_name}...")
    vector_store.save_local(index_name)
    return vector_store

def print_gpu_info():
    """Print name and memory statistics (in GB) for every visible CUDA device.

    Prints ``No GPU available`` when CUDA is not available.
    """
    if not torch.cuda.is_available():
        print("No GPU available")
        return

    for gpu_index in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(gpu_index)
        print(f"GPU {gpu_index}: {props.name}")
        print(f"  Memory: {props.total_memory / 1e9:.2f} GB")

        # Byte counts reported by torch are scaled to GB for display.
        allocated_gb = torch.cuda.memory_allocated(gpu_index) / 1e9
        print(f"  Memory Allocated: {allocated_gb:.2f} GB")
        reserved_gb = torch.cuda.memory_reserved(gpu_index) / 1e9
        print(f"  Memory Reserved: {reserved_gb:.2f} GB")
        peak_gb = torch.cuda.max_memory_allocated(gpu_index) / 1e9
        print(f"  Max Memory Allocated: {peak_gb:.2f} GB")

def main(input_file="/kaggle/input/movie-dialogues/Final_Key.json",
         output_file="/kaggle/working/processed_hindi_dialogues.json",
         index_name="hindi_dialogue_faiss_index"):
    """Run the full preprocessing flow: load, preprocess, save JSON, build the index.

    Args:
        input_file: Path of the raw movie-dialogue JSON file.
        output_file: Path of the processed JSON file to write.
        index_name: Name/path under which the FAISS vector store is saved.
    """
    print("System Information:")
    print(f"CUDA Available: {torch.cuda.is_available()}")
    print(f"PyTorch Version: {torch.__version__}")
    print_gpu_info()

    cuda_available = torch.cuda.is_available()
    if cuda_available:
        gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9  # in GB
        batch_size = int(max(8, min(32, gpu_memory / 0.5)))  # Each scene might use ~0.5GB
        num_workers = min(4, os.cpu_count() or 1)
    else:
        batch_size = 8
        num_workers = 1

    print(f"Using batch size: {batch_size}, num_workers: {num_workers}")

    # Set device (GPU if available, else CPU)
    device = torch.device("cuda" if cuda_available else "cpu")
    print(f"Using device: {device}")

    use_pipeline = False
    try:
        # Test if pipeline works
        print("Testing text generation pipeline...")
        # Pass only an explicit `device`: combining it with device_map="auto" makes
        # pipeline() raise ("loaded with accelerate ... cannot be moved to a specific
        # device"). Half precision is requested on GPU only.
        test_pipeline = pipeline(
            'text2text-generation',
            model='google/flan-t5-small',
            device=0 if cuda_available else -1,
            model_kwargs={"torch_dtype": torch.float16} if cuda_available else {}
        )
        test_pipeline("Test", max_length=10)
        print("Pipeline test successful!")
        use_pipeline = True

        # Clear memory after test
        del test_pipeline
        torch.cuda.empty_cache()
        gc.collect()
    except Exception as e:
        print(f"Failed to load text generation pipeline: {e}")
        print("Will use default scene descriptions instead.")

    # The paths are resolved independently of the pipeline test outcome, so a
    # successful test can no longer leave `input_file` undefined.
    print("Loading movie dialogue data...")
    data = load_movie_dialogues(input_file)

    print("Preprocessing movie dialogues...")
    processed_documents = preprocess_movie_dialogues(
        data,
        use_pipeline=use_pipeline,
        batch_size=batch_size,
        num_workers=num_workers
    )

    print("Saving processed data...")
    save_processed_data(processed_documents, output_file)

    print("Creating vector store for RAG...")
    create_vector_store(processed_documents, index_name)

    print("Preprocessing complete!")
    print(f"Processed {len(processed_documents)} scenes.")
    print(f"Saved processed data to {output_file}")
    print(f"Saved vector store to {index_name}")
    print_gpu_info()

if __name__ == "__main__":
    main()

System Information:
CUDA Available: True
PyTorch Version: 2.5.1+cu121
GPU 0: Tesla T4
  Memory: 15.83 GB
  Memory Allocated: 0.45 GB
  Memory Reserved: 0.48 GB
  Max Memory Allocated: 1.05 GB
GPU 1: Tesla T4
  Memory: 15.83 GB
  Memory Allocated: 0.00 GB
  Memory Reserved: 0.00 GB
  Max Memory Allocated: 0.12 GB
Using batch size: 31, num_workers: 4
Using device: cuda
Testing text generation pipeline...
Failed to load text generation pipeline: The model has been loaded with `accelerate` and therefore cannot be moved to a specific device. Please discard the `device` argument when creating your pipeline object.
Will use default scene descriptions instead.
Loading movie dialogue data...
Preprocessing movie dialogues...
Processing 1811 scenes in batches of 31...


Processing batches:   0%|          | 0/59 [00:00<?, ?it/s]

Preprocessing complete! Processed 1642 scenes in 5.04 seconds
Average time per scene: 0.00 seconds
Saving processed data...
Saving 1642 documents to /kaggle/working/processed_hindi_dialogues.json...


Saving batches:   0%|          | 0/2 [00:00<?, ?it/s]

Creating vector store for RAG...
Creating FAISS index on cuda...


Creating vector index batches:   0%|          | 0/4 [00:00<?, ?it/s]

Saving vector store to hindi_dialogue_faiss_index...
Preprocessing complete!
Processed 1642 scenes.
Saved processed data to /kaggle/working/processed_hindi_dialogues.json
Saved vector store to hindi_dialogue_faiss_index
GPU 0: Tesla T4
  Memory: 15.83 GB
  Memory Allocated: 0.45 GB
  Memory Reserved: 0.48 GB
  Max Memory Allocated: 1.05 GB
GPU 1: Tesla T4
  Memory: 15.83 GB
  Memory Allocated: 0.00 GB
  Memory Reserved: 0.00 GB
  Max Memory Allocated: 0.12 GB


In [None]:
# Install indic-transliteration, which provides the `indic_transliteration` package
# (`sanscript`, `transliterate`) imported by the preprocessing cell.
# NOTE(review): this cell sits after the cell that imports the package; on a fresh
# kernel it has to be run before that cell.
!pip install indic-transliteration --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.6/155.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h