In [None]:
import re
import csv

# Step 1: Read the 'ramayana.txt' file
with open('ramayana.txt', 'r', encoding='utf-8') as file:
    raw_text = file.read()

# Step 2: Define chapter titles with numbers
chapter_titles = [
    "1. The Conception", "2. Sage Viswamitra", "3. Trisanku", "4. Rama Leaves Home", "5. Rama Slays The Monsters",
    "6. Sita", "7. Bhagiratha And The Story Of Ganga", "8. Ahalya", "9. Rama Wins Sita's Hand", "10. Parasurama's Discomfiture",
    "11. Festive Preparations", "12. Manthara's Evil Counsel", "13. Kaikeyi Succumbs", "14. Wife Or Demon?", "15. Behold A Wonder!",
    "16. Storm And Calm", "17. Sita's Resolve", "18. To The Forest", "19. Alone By Themselves", "20. Chitrakuta", "21. A Mother's Grief",
    "22. Idle Sport And Terrible Result", "23. Last Moments", "24. Bharata Arrives", "25. Intrigue wasted", "26. Bharata Suspected",
    "27. The Brothers Meet", "28. Bharata Becomes Rama's Deputy", "29. Viradha's End", "30. Ten Years Pass", "31. The Surpanakha Episode",
    "32. Kamban's Surpanakha", "33. Khara And His Army Liquidated", "34. The Path Of Ruin", "35. The Golden Stag", "36. The Good Bird Jatayu",
    "37. Closely Guarded", "38. Rama Disconsolate", "39. A Second Father Dies", "40. Left Eyelids Throb", "41. He Sees Her Jewels",
    "42. Sugriva's Doubts Cleared", "43. The Slaying Of Vali", "44. Tara's Grief", "45. Anger And Reconciliation", "46. The Search Begins",
    "47. Son Of Vayu", "48. The Search In Lanka", "49. Sita In The Asoka Park", "50. Ravana's Solicitation", "51. First Among The Astute",
    "52. Sita Comforted", "53. Sita And Hanuman", "54. Inviting Battle", "55. The Terrible Envoy", "56. Hanuman Bound", "57. Lanka In Flames",
    "58. A Carnival", "59. The Tidings Conveyed", "60. The Army Moves Forward", "61. Anxiety In Lanka", "62. Ravana Calls A Council Again",
    "63. Vibhishana", "64. The Vanara's Doubt", "65. Doctrine Of Surrender And Grace", "66. The Great Causeway", "67. The Battle Begins",
    "68. Sita's Joy", "69. Serpent Darts", "70. Ravana's Defeat", "71. The Giant Is Roused", "72. Is This Narayana Himself?", "73. The Death Of Indrajit",
    "74. End Of Ravana", "75. The End", "76. Epilogue"
]

# Step 3: Regular expression to split text by chapter titles with numbers (case insensitive)
pattern = r"(" + "|".join([re.escape(chapter) for chapter in chapter_titles]) + r")"
sections = re.split(pattern, raw_text, flags=re.IGNORECASE)

# Step 4: Pair up the chapter titles with their corresponding text
chapter_contents = []
for i in range(1, len(sections), 2):
    title = sections[i].strip()
    content = sections[i + 1].strip() if i + 1 < len(sections) else ""
    chapter_contents.append([title, content])

# Step 5: Save the data into a CSV file
with open('ramayana_chapters_with_numbers.csv', 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Chapter Title", "Content"])  # Column headers
    writer.writerows(chapter_contents)

print("CSV file 'ramayana_chapters_with_numbers.csv' has been saved successfully.")

CSV file 'ramayana_chapters_with_numbers.csv' has been saved successfully.


In [None]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Step 1: Download necessary NLTK data
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r

True

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Load the Ramayana data
df = pd.read_csv("ramayana_chapters_with_numbers.csv")

# Assuming the text is in a column named 'Content' (adjust if needed)
texts = df['Content'].dropna().astype(str)

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(texts)

# Function to display topics
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"\nTopic {topic_idx + 1}:")
        keywords = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
        print("Keywords:", " | ".join(keywords))

# Run topic modeling
n_topics = 10
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42, max_iter=100)
lda_model.fit(X)

# Get the feature names
feature_names = vectorizer.get_feature_names_out()

# Display the topics
display_topics(lda_model, feature_names, no_top_words=10)



Topic 1:
Keywords: zeal | ploughed | drop | pastime | encouraging | nurses | entirety | merrily | maze | yogi

Topic 2:
Keywords: rama | sita | ravana | hanuman | said | king | lakshmana | like | forest | sugriva

Topic 3:
Keywords: zeal | ploughed | drop | pastime | encouraging | nurses | entirety | merrily | maze | yogi

Topic 4:
Keywords: zeal | ploughed | drop | pastime | encouraging | nurses | entirety | merrily | maze | yogi

Topic 5:
Keywords: ahalya | dravidian | entangled | partial | falsely | evoke | distinction | portrayals | engage | eunuch

Topic 6:
Keywords: trisanku | bhagiratha | amsuman | chandala | patala | sumati | kapila | exposed | maze | merrily

Topic 7:
Keywords: zeal | ploughed | drop | pastime | encouraging | nurses | entirety | merrily | maze | yogi

Topic 8:
Keywords: zeal | ploughed | drop | pastime | encouraging | nurses | entirety | merrily | maze | yogi

Topic 9:
Keywords: zeal | ploughed | drop | pastime | encouraging | nurses | entirety | merrily | ma

In [None]:
import pandas as pd

# Load your chapters data (replace with your file if needed)
chapters_df = pd.read_csv("ramayana_chapters_with_numbers.csv")

# List of female characters
female_characters = ["Sita", "Kaushalya", "Sumitra", "Kaikeyi", "Urmila",
                     "Shurpanakha", "Mandodari", "Shanta", "Tataka", "Shabari",
                     "Manthara", "Tara", "Ahalya", "Chandrabhaga", "Shrutakirti",
                     "Sunayana", "Vershini", "Romapada"]

# Lowercase version for matching
female_characters_lower = [name.lower() for name in female_characters]

# Function to check if any female character is mentioned in the content
def mentions_female_character(text):
    text_lower = str(text).lower()  # Convert to lowercase and handle NaNs
    return any(name in text_lower for name in female_characters_lower)

# Apply filtering
female_mentions_df = chapters_df[chapters_df['Content'].apply(mentions_female_character)]

# Save filtered chapters to a new CSV
female_mentions_df.to_csv("female_character_chapters.csv", index=False)

print("✅ Done! Saved chapters mentioning female characters to 'female_character_chapters.csv'")

✅ Done! Saved chapters mentioning female characters to 'female_character_chapters.csv'


In [None]:
# Load filtered chapters (only those mentioning female characters)
df1 = pd.read_csv("female_character_chapters.csv")

# Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = word_tokenize(str(text).lower())
    return ' '.join([
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalpha() and word not in stop_words and len(word) > 2
    ])

df1['processed'] = df1['Content'].apply(preprocess)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_df=0.85, min_df=1)
X = vectorizer.fit_transform(df1['processed'])

# LDA Topic Modeling
n_topics = 10 # You can tune this
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42, max_iter=1000)
lda_model.fit(X)

# Display Unique Topics
def display_unique_topics(model, feature_names, no_top_words=10):
    unique_topic_sets = set()
    for topic_idx, topic in enumerate(model.components_):
        keywords = tuple([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
        if keywords not in unique_topic_sets:
            unique_topic_sets.add(keywords)
            print(f"\n🔹 Topic {len(unique_topic_sets)}:")
            print("Keywords:", " | ".join(keywords))

# Get feature names and display unique topics
feature_names = vectorizer.get_feature_names_out()
display_unique_topics(lda_model, feature_names, no_top_words=10)


🔹 Topic 1:
Keywords: menaka | zeal | maniac | entirety | encouraging | sastras | saras | artfully | emerge | emerald

🔹 Topic 2:
Keywords: manthara | wreak | trusty | hurriedly | hypocritical | hobbled | tended | intimacy | foresee | insane

🔹 Topic 3:
Keywords: ravana | hanuman | forest | sugriva | bharata | shall | brother | army | lanka | word

🔹 Topic 4:
Keywords: bhagiratha | amsuman | patala | yajnavalkya | zeal | joyfully | tangled | retrieve | ransack | respected


In [None]:
# Load filtered chapters (only those mentioning female characters)
df1 = pd.read_csv("female_character_chapters.csv")

# Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = word_tokenize(str(text).lower())
    return ' '.join([
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalpha() and word not in stop_words and len(word) > 2
    ])

df1['processed'] = df1['Content'].apply(preprocess)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_df=0.85, min_df=1)
X = vectorizer.fit_transform(df1['processed'])

# LDA Topic Modeling
n_topics = 11 # You can tune this
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42, max_iter=1000)
lda_model.fit(X)

# Display Unique Topics
def display_unique_topics(model, feature_names, no_top_words=10):
    unique_topic_sets = set()
    for topic_idx, topic in enumerate(model.components_):
        keywords = tuple([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
        if keywords not in unique_topic_sets:
            unique_topic_sets.add(keywords)
            print(f"\n🔹 Topic {len(unique_topic_sets)}:")
            print("Keywords:", " | ".join(keywords))

# Get feature names and display unique topics
feature_names = vectorizer.get_feature_names_out()
display_unique_topics(lda_model, feature_names, no_top_words=10)


🔹 Topic 1:
Keywords: parrying | habitation | frighten | instill | slumbering | souvenir | damage | cracking | unaccountable | bodyguard

🔹 Topic 2:
Keywords: dadhimukha | parayana | furnishing | fatiguing | sheltered | keeper | recrossing | assaulted | greateful | declined

🔹 Topic 3:
Keywords: ravana | hanuman | forest | sugriva | bharata | shall | brother | army | lanka | word

🔹 Topic 4:
Keywords: bhagiratha | amsuman | bower | patala | sumati | kapila | rod | whirling | aksha | habitation

🔹 Topic 5:
Keywords: parasurama | bamboo | subsist | beehive | vie | abstemiously | unspoilt | twig | topographical | interference

🔹 Topic 6:
Keywords: ahalya | purana | lapse | entangled | untenanted | solicitation | reveled | frequent | evoke | eunuch

🔹 Topic 7:
Keywords: uttara | appreciated | legend | seed | gandhiji | parasurama | tyagaraja | behaves | rebel | drama

🔹 Topic 8:
Keywords: mahabali | tataka | conference | bowl | invitee | strictly | yagas | insolent | surrendering | distrib