In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\lc24801\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\lc24801\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\lc24801\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\lc24801\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\lc24801\AppData\Roaming\nltk

True

In [4]:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer

In [5]:
import glob


corpus_files = glob.glob("../data/corpus_txt/mahatma-gandhi-collected-works-volume-*.txt")
print(f'Found {len(corpus_files)} files in the corpus directory.')
documents = []

for file_path in corpus_files:
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
        documents.append(text)

print(f'Loaded {len(documents)} documents from {len(corpus_files)} files.')

Found 98 files in the corpus directory.
Loaded 98 documents from 98 files.


In [6]:
# create preprocess_text function
def preprocess_text(text):
    
    # tokenize the text
    tokens = word_tokenize(text)

    # remove stop words
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word.lower() not in stop_words]

    # lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # join the tokens back into a single string
    preprocessed_text = " ".join(tokens)

    return preprocessed_text

# preprocess the documents
preprocessed_documents = [preprocess_text(doc) for doc in documents]
    

In [7]:
print(preprocessed_documents[0:2])  # show the first two preprocessed documents

# create a DataFrame to hold the preprocessed documents 
df = pd.DataFrame(preprocessed_documents, columns=["text"])
# add a column for the original text
df["original_text"] = documents
# add a column for the file name
df["file_name"] = corpus_files



In [16]:
# Divide the preprocessed documents into phases
# Phase 1: 1-15
# Phase 2 : 15-62
# Phase 3: 62-99

# Dictionary to hold the phase numbers and corresponding documents  
phase_dict = {
    "Phase 1": [documents[i] for i in range(0, 15)],
    "Phase 2": [documents[i] for i in range(15, 62)],
    "Phase 3": [documents[i] for i in range(62, 98)]
}



In [17]:
# Train a word2vec model
from gensim.models import Word2Vec

# Train models by phase
for phase, docs in phase_dict.items():
    print(f"Training Word2Vec model for {phase}...")
    
    # Tokenize the preprocessed documents
    tokenized_docs = [doc.split() for doc in docs]
    
    # Create the Word2Vec model using skip-gram model
    word2vec_model = Word2Vec(sentences=tokenized_docs, vector_size=100, window=5, min_count=1, sg=1)
    
    print(f"Model for {phase} trained.")
    
    # Save the model
    model_path = f"word2vec_model_{phase.replace(' ', '_')}.model"
    word2vec_model.save(model_path)

Training Word2Vec model for Phase 1...
Model for Phase 1 trained.
Training Word2Vec model for Phase 2...
Model for Phase 2 trained.
Training Word2Vec model for Phase 3...
Model for Phase 3 trained.


In [None]:
# Save the model (just in case)
word2vec_model.save("../model/word2vec.model")