In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/dushyantgoel/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/dushyantgoel/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/dushyantgoel/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /Users/dushyantgoel/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/dushyantgoel/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru

True

In [3]:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer

In [4]:
import glob


corpus_files = glob.glob("../data/corpus_txt/mahatma-gandhi-collected-works-volume-*.txt")
print(f'Found {len(corpus_files)} files in the corpus directory.')
documents = []

for file_path in corpus_files:
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
        documents.append(text)

print(f'Loaded {len(documents)} documents from {len(corpus_files)} files.')

Found 98 files in the corpus directory.
Loaded 98 documents from 98 files.


In [5]:
# create preprocess_text function
def preprocess_text(text):
    
    # tokenize the text
    tokens = word_tokenize(text)

    # remove stop words
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word.lower() not in stop_words]

    # lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # join the tokens back into a single string
    preprocessed_text = " ".join(tokens)

    return preprocessed_text

# preprocess the documents
preprocessed_documents = [preprocess_text(doc) for doc in documents]
    

In [6]:
print(preprocessed_documents[0:2])  # show the first two preprocessed documents



In [7]:
# Divide the preprocessed documents into phases
# Phase 1: 1-15
# Phase 2 : 15-62
# Phase 3: 62-99

# Dictionary to hold the phase numbers and corresponding documents  
phase_dict = {
    "phase_1": [documents[i] for i in range(0, 15)],
    "phase_2": [documents[i] for i in range(15, 62)],
    "phase_3": [documents[i] for i in range(62, 98)]
}



Final Model Training

In [None]:
# Train a word2vec model
from gensim.models import Word2Vec

# Train models by phase
for phase, docs in phase_dict.items():
    print(f"Training Word2Vec model for {phase}...")
    
    # Tokenize the preprocessed documents
    tokenized_docs = [doc.split() for doc in docs]
    
    # Create the Word2Vec model using skip-gram model
    word2vec_model = Word2Vec(sentences=tokenized_docs, vector_size=100, window=5, min_count=1, sg=1)
    
    print(f"Model for {phase} trained.")

    # Save the model
    model_path = f"../model/word2vec_model_{phase}.model"
    word2vec_model.save(model_path)

    print(f"Model for {phase} saved to {model_path}.")

Training Word2Vec model for phase_1...
Model for phase_1 trained.
Training Word2Vec model for phase_2...
Model for phase_2 trained.
Training Word2Vec model for phase_3...
Model for phase_3 trained.


Hyperparameter tuning - to get stable embeddings.

In [None]:
# from gensim.models import Word2Vec
# import numpy as np
# from collections import defaultdict 

# # Hyperparameters
# # vector_size : Larger sizes can capture more nuanced patterns but may introduce more noise / instability on small data
# # window: Smaller windows (2–5) capture syntactic/contextual similarity, larger windows (5–10) capture topical similarity
# # min_count: Removes rare words, improving model reliability but reducing vocabulary — higher values often improve stability
# # epochs (epochs or iter): More passes over the corpus increase stability and convergence
# # negative: Number of negative samples — increasing improves model robustness on sparse data
# # seed: Random seed affects initial weight assignment — important for testing stability across runs
# # workers: Not affecting stability, just speed (but can introduce non-determinism in multi-threaded training if seed is not fixed)

# def train_model(tokenized_docs, vector_size, window, min_count, epochs, seed):
    
#     model = Word2Vec(
#         sentences=tokenized_docs,
#         vector_size=vector_size, # embedding size
#         window=window, # context window size
#         min_count=min_count, # rare words
#         epochs=epochs, # training epochs
#         sg=1, # skip-gram
#         negative=5,
#         seed=seed, # for deterministic training
#         workers=1  # for deterministic training
#     )
#     return model

# word = "swaraj"
# top_n = 10

# hyperparameters = []
# # Enumerate over vector sizes
# for vector_size in range(150, 500, 50):
#     hyperparameters.append({
#         'vector_size': vector_size,
#         'window': 5,
#         'min_count': 2,
#         'epochs': 10
#     })

# for i, setting in enumerate(hyperparameters):
#     model_dict = {}
#     for phase, docs in phase_dict.items():
    
#         print(f"Training Word2Vec model for {phase}...")
#         # Tokenize the preprocessed documents
#         tokenized_docs = [doc.split() for doc in docs]
    
#         model = train_model(
#             tokenized_docs,
#             vector_size=setting['vector_size'],
#             window=setting['window'],
#             min_count=setting['min_count'],
#             epochs=setting['epochs'],
#             seed=42
#         )

#         # Save the model (in - memory)
#         model_dict[phase] = model
    
#     # Now load the models and find the jaccard similarity
    
#     neighbors = defaultdict(dict)
#     jaccard_results = defaultdict(dict)

#     phases = ["phase_1", "phase_2", "phase_3"]
    
#     for i in range(len(phases) - 1):
        
#         p1, p2 = phases[i], phases[i + 1]
#         set1, set2 = neighbors[word].get(p1, set()), neighbors[word].get(p2, set())
        
#         if set1 and set2:
#             intersection = len(set1 & set2)
#             union = len(set1 | set2)

#             jaccard_sim = intersection / union
#             jaccard_results[word][f"{p1}_vs_{p2}"] = jaccard_sim
#         else:
#             jaccard_results[word][f"{p1}_vs_{p2}"] = None

#     # Print results
#     print(f"\nJaccard Similarity Results for '{setting}':")
#     for word, results in jaccard_results.items():
#         print(f"\nKeyword: '{word}'")
#         for pair, score in results.items():
#             print(f"{pair}: {score:.3f}" if score is not None else f"{pair}: N/A")

    
#     # if keyword in model.wv:
#     #     neighbors = model.wv.most_similar(keyword, topn=top_n)
#     #     print(f"\nRun {i+1} | Settings: {setting}")
#     #     for word, sim in neighbors:
#     #         print(f"{word}: {sim:.3f}")

#     #     # Compare with baseline
#     #     jaccard_score = jaccard_similarity(baseline_neighbors, neighbors)
#     #     print(f"Jaccard Similarity with baseline: {jaccard_score:.3f}")
#     # else:
#     #     print(f"\nRun {i+1} | Settings: {setting} | '{keyword}' not in vocab")



Training Word2Vec model for phase_1...


KeyboardInterrupt: 