<a href="https://colab.research.google.com/github/dp22acn/Data_Science_Project/blob/main/Characters_extraction_For_Automated_approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Task 1: Extracting Character Names from the Mahabharata Books**

In [None]:
pip install spacy

In [None]:
pip install rapidfuzz

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored in Drive can be opened through ordinary paths under that directory.
from google.colab import drive
drive.mount('/content/drive')

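This cell extracts character names from every book and writes the most frequent ones, together with their occurrence counts, to a text file.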

In [None]:
import os
import re
import spacy
from collections import Counter

# Load the small English SpaCy pipeline once; it is shared by every extraction call.
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 1500000  # Allow documents of up to 1.5M characters; this value is also used as the chunk size when books are split (adjust as needed)

# Function to preprocess text
def preprocess_text(text):
    """
    Return `text` with punctuation stripped and whitespace normalised.

    Every character that is neither a word character nor whitespace is
    deleted, each run of whitespace is collapsed to a single space, and
    leading/trailing whitespace is trimmed.
    """
    # Step 1: drop anything that is not a word character or whitespace
    without_symbols = re.sub(r'[^\w\s]', '', text)
    # Step 2: squeeze whitespace runs down to one space
    single_spaced = re.sub(r'\s+', ' ', without_symbols)
    # Step 3: trim the ends
    return single_spaced.strip()

# Function to split text into smaller chunks
def split_into_chunks(text, max_chunk_size=1000000):
    """
    Split `text` into consecutive chunks of at most `max_chunk_size` characters.

    A chunk boundary that would fall in the middle of a word is moved back to
    just after the last whitespace character inside the window, so that a name
    is never cut in half between two chunks. If the window contains no
    whitespace at all, the chunk is cut at exactly `max_chunk_size` characters.
    No characters are added or dropped: joining the chunks gives back `text`.

    Args:
        text: The string to split.
        max_chunk_size: Upper bound on the length of each chunk; must be positive.

    Returns:
        A list of chunks in original order (empty when `text` is empty).

    Raises:
        ValueError: If `max_chunk_size` is not positive.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")

    chunks = []
    text_length = len(text)
    start = 0
    while start < text_length:
        end = min(start + max_chunk_size, text_length)

        # Only adjust when more text follows and the cut would land inside a word.
        if end < text_length and not text[end].isspace():
            cut = end
            while cut > start and not text[cut - 1].isspace():
                cut -= 1
            # cut == start means the window has no whitespace: keep the hard cut.
            if cut > start:
                end = cut

        chunks.append(text[start:end])
        start = end

    return chunks

# Function to extract character names from text chunks
def extract_characters_from_chunk(text_chunk):
    """
    Run the module-level SpaCy pipeline over `text_chunk` and return the text
    of every recognised entity whose label is "PERSON".

    Entities are returned in the order the pipeline reports them, and repeated
    mentions are kept so that the caller can count them.
    """
    person_names = []
    for entity in nlp(text_chunk).ents:
        if entity.label_ == "PERSON":
            person_names.append(entity.text)
    return person_names

# Process all books in a folder and count name occurrences
def process_books_count_names(folder_path):
    """
    Count PERSON-entity mentions across every ".txt" file in `folder_path`.

    Each file is read as UTF-8, cleaned with preprocess_text, split into
    chunks no longer than nlp.max_length, and run through
    extract_characters_from_chunk. Returns a Counter mapping each extracted
    name to its total number of occurrences over all files.
    """
    totals = Counter()

    for file_name in os.listdir(folder_path):
        # Anything that is not a .txt file is ignored
        if not file_name.endswith(".txt"):
            continue

        file_path = os.path.join(folder_path, file_name)
        print(f"Processing {file_path}...")

        # Load the whole book and clean it in one go
        with open(file_path, "r", encoding="utf-8") as book:
            clean_text = preprocess_text(book.read())

        # Feed the pipeline pieces that respect its maximum document length
        for chunk in split_into_chunks(clean_text, max_chunk_size=nlp.max_length):
            totals.update(extract_characters_from_chunk(chunk))

    return totals

# Filter top names by frequency
def get_top_names(name_counter, min_occurrences=20):
    """
    Return a dict of the names in `name_counter` whose count is at least
    `min_occurrences`, mapped to their counts, in the counter's own order.
    """
    frequent = {}
    for name, count in name_counter.items():
        if count >= min_occurrences:
            frequent[name] = count
    return frequent

# Main script
if __name__ == "__main__":
    # Where the Mahabharata books live, and where the ranking is written
    folder_path = "/content/drive/MyDrive/mahatxt"  # Update this with the actual path
    output_file = "top_characters_with_counts.txt"

    # Count every extracted name across all books
    name_counter = process_books_count_names(folder_path)

    # Keep only names that were seen at least 5 times
    top_names = get_top_names(name_counter, min_occurrences=5)  # Adjust threshold as needed

    # Highest count first; names with equal counts stay in first-seen order
    ranked = Counter(top_names).most_common()

    # One "name: count" line per character
    with open(output_file, "w", encoding="utf-8") as f:
        for name, count in ranked:
            f.write(f"{name}: {count}\n")

    print(f"Character extraction complete. Results saved to {output_file}.")


In [None]:
import re
import pandas as pd

def process_character_names(file_path, output_path):
    """
    Turn a "name: count" listing into a label / alternative-names table.

    Each line of `file_path` has its ": <digits>" count suffix (plus an
    optional stray "n") removed and is then split on commas. The first field
    is the label and the remaining fields are its alternative names. Lines
    whose label is empty (for example blank lines) are skipped, empty fields
    are ignored, and lines sharing a label have their alternatives merged.

    Args:
        file_path: Path of the UTF-8 text file to read.
        output_path: Path of the CSV file to write, with the columns
            "label" and "alternative_names". The header is written even
            when no names were found.

    Returns:
        A list of {"label": str, "alternative_names": str} dicts in order of
        first appearance of each label. "alternative_names" is the sorted,
        de-duplicated alternatives joined with ", " and never contains the
        label itself.
    """
    columns = ["label", "alternative_names"]
    merged = {}  # label -> set of alternative names, in first-seen label order

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Remove numbers and trailing "n" or spaces
            cleaned_line = re.sub(r':\s*\d+[n]?', '', line).strip()

            # Split the line by commas
            names = [name.strip() for name in cleaned_line.split(',')]

            # str.split always yields at least one field, so test the label
            # itself: a blank line must not create an empty-label row.
            label = names[0]
            if not label:
                continue

            # Empty fields (e.g. from a trailing comma) are not real names.
            alternatives = merged.setdefault(label, set())
            alternatives.update(name for name in names[1:] if name)

    # Sort alternatives for stable output and drop the label from its own list
    final_result = [
        {"label": label, "alternative_names": ", ".join(sorted(alt_names - {label}))}
        for label, alt_names in merged.items()
    ]

    # Explicit columns keep the CSV header present even when there are no rows
    df = pd.DataFrame(final_result, columns=columns)
    df.to_csv(output_path, index=False)
    print(f"Processed names saved to {output_path}.")
    return final_result

# Input: the "name: count" listing (same file name the extraction cell writes to).
# Output: CSV destination under /content.
file_path = "top_characters_with_counts.txt"
output_path = "/content/processed_names.csv"

# Build the label / alternative-names table and write it to the CSV
processed_data = process_character_names(file_path, output_path)

# Echo every row so the result can be checked without opening the CSV
for entry in processed_data:
    print(f"Label: {entry['label']}, Alternative Names: {entry['alternative_names']}")
