In [None]:
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Download necessary NLTK data.
# word_tokenize loads the "punkt_tab" tables on current NLTK releases (older
# releases read the pickled "punkt" model), so fetch both to keep the notebook
# runnable on a fresh kernel regardless of the installed version.
# "stopwords" provides the English stop-word list used during preprocessing.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

**Title:** A comparative study of two novels (Anita Desai’s “Fasting, Feasting” and Amitav Ghosh’s “The Calcutta Chromosome”) using digital tools such as word clustering, named-entity recognition (NER), and topic modeling

**Research Questions**

1. “How are gendered experiences and emotional expressions constructed and contrasted in Anita Desai’s Fasting, Feasting and Amitav Ghosh’s The Calcutta Chromosome, and what do these constructions reveal about cultural narratives in Indian English literature?”

2. “In what ways are gendered identities thematically linked to space, emotion, and agency in Fasting, Feasting and The Calcutta Chromosome, and how can digital methods reveal these links?”

In [None]:
import nltk

# Fetch the complete NLTK data collection. This is a large, slow download;
# quiet=True keeps the per-package progress log (hundreds of lines) out of the
# saved notebook while still returning True on success.
# NOTE(review): the cells visible here only need the tokenizer and stopword
# data -- confirm whether later cells use other resources before narrowing this.
nltk.download('all', quiet=True)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

In [None]:
# Step 1: Preprocess function (using NLTK)

# Stop-word sets keyed by language, filled lazily on first use so that defining
# the function does not require the NLTK corpus to be present yet.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_nltk(text):
    """Normalise one passage of raw text for bag-of-words modelling.

    The text is lower-cased, every run of non-word characters is replaced by a
    single space, and the result is tokenised with ``word_tokenize``. Tokens
    that are NLTK English stop words or are two characters or shorter are
    dropped.

    Parameters
    ----------
    text : str
        Raw passage (e.g. one paragraph of the novel).

    Returns
    -------
    str
        The surviving tokens joined by single spaces; empty if none survive.
    """
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Remove non-word characters
    tokens = word_tokenize(text)
    # The stop-word set is cached: this function runs once per paragraph, and
    # rebuilding the set from the corpus on every call is wasted work.
    stop_words = _english_stop_words()
    kept = [word for word in tokens if word not in stop_words and len(word) > 2]
    return " ".join(kept)

# Load the full text of the novel from the uploaded plain-text file
filename = "[Amitav_Ghosh]_The_Calcutta_Chromosome_A_Novel_of(BookFi).txt"  # Replace with actual filename
with open(filename, "r", encoding="utf-8") as f:
    text = f.read()

# Treat each newline-separated chunk as one document; chunks of 30 characters
# or fewer after trimming (headings, page numbers, blank lines) are discarded
paragraphs = [
    chunk
    for chunk in map(str.strip, text.split("\n"))
    if len(chunk) > 30
]

# Run the NLTK cleaning step over every retained paragraph
processed_paragraphs = list(map(preprocess_nltk, paragraphs))

In [None]:
# Step 2: CountVectorizer to convert text to a document-term matrix
# - stop_words='english': scikit-learn's built-in English list, applied on top
#   of the NLTK filtering already done during preprocessing
# - min_df=2: ignore terms found in fewer than two paragraphs
# - max_df=0.95: ignore terms found in more than 95% of paragraphs
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
doc_term_matrix = vectorizer.fit_transform(processed_paragraphs)

In [None]:
# Step 3: LDA Topic Modeling
num_topics = 5  # number of topics to fit; try 2 or 3 for a coarser first pass
lda_model = LatentDirichletAllocation(
    random_state=42,  # fixed seed so re-running the cell reproduces the topics
    n_components=num_topics,
)
lda_model.fit(doc_term_matrix)

In [None]:
# Step 4: Display Topics
# Vocabulary terms learned by the vectorizer; display_topics looks terms up in
# this array using the indices obtained by sorting each row of components_.
feature_names = vectorizer.get_feature_names_out()

def display_topics(model, feature_names, no_top_words=10):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model
        Object exposing ``components_``, an iterable of per-topic weight
        arrays (one weight per vocabulary term).
    feature_names
        Sequence mapping a term index to its string form.
    no_top_words : int, default 10
        How many terms to list per topic.

    Returns
    -------
    None
        Output goes to stdout, one blank line followed by one
        ``Topic N:  term | term | ...`` line per topic (topics numbered from 1).
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it to put the heaviest terms first
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[idx] for idx in ranked]
        print(f"\nTopic {topic_number}: ", " | ".join(top_terms))

# Print the ten strongest terms of each fitted topic
display_topics(lda_model, feature_names, no_top_words=10)


Topic 1:  antar | said | man | hand | like | urmila | room | face | eyes | open

Topic 2:  said | know | murugan | urmila | going | way | tell | want | time | think

Topic 3:  said | sonali | urmila | phulboni | door | room | murugan | went | looked | farley

Topic 4:  said | murugan | antar | malaria | like | know | ross | cunningham | time | right

Topic 5:  murugan | said | antar | urmila | began | mrs | aratounian | old | ronnie | head


The topic modeling analysis of The Calcutta Chromosome by Amitav Ghosh reveals a rich interplay between science, mystery, and fragmented personal narratives, reflecting the novel's postmodern structure and themes. Central to each topic are key figures like Antar, Murugan, Urmila, and Ross, pointing to the interconnected roles of observation, research, and identity across time and place. Topic 1 focuses on man, hand, face, and eyes, suggesting themes of physical embodiment and surveillance, reinforcing how bodies become sites of knowledge and experimentation. Topic 2, with terms like going, way, tell, and think, reflects a cognitive and narrative journey—Murugan’s obsessive search for truth and the epistemological quest that drives the novel forward. Topic 3’s inclusion of Phulboni, door, and room emphasizes spatial transitions and hidden knowledge, possibly tied to mystical or alternative histories. Topic 4 integrates malaria, Ross, and Cunningham, directly referencing the historical scientific backdrop of the novel, and brings out the tension between colonial research and indigenous knowledge systems. Topic 5 consolidates the presence of figures like Mrs. Aratounian and Ronnie, with words like old, head, and began, anchoring the narrative in institutional memory and generational shifts. Overall, these topics point to a text that negotiates between gendered voices, colonial legacies, and subaltern agency, using fragmented storytelling and layered character perspectives to deconstruct scientific authority and emphasize hidden, often feminized, modes of knowledge transmission.

