In [1]:
import nltk
from tqdm import tqdm

In [2]:
#specify the path to the Java-based MALLET software
path_to_mallet = 'C:/mallet-2.0.8/bin/mallet'

#import the above packages and libraries for working with files and the file system
import little_mallet_wrapper
import seaborn
import glob
from pathlib import Path

In [3]:
#assign to the "directory" variable the path to the directory containing our documents
m_directory = "C:/Users/media/Desktop/gobbykid/balanced_corpus/m"
f_directory = "C:/Users/media/Desktop/gobbykid/balanced_corpus/f"

#use `glob.gob()` function to make a list of all the `.txt` files in that directory.
files = glob.glob(f"{m_directory}/*.txt") + glob.glob(f"{f_directory}/*.txt")


Import MALLET default stopwords.

In [4]:
#import LMW stop words list (stored in the variable "STOPS"), in order to extend it
from little_mallet_wrapper.little_mallet_wrapper import STOPS as lmw_stopwords #!important: import from little_mallet_wrapper.little_mallet_wrapper, otherwise it will look for STOPS inside __init__.py (inside little_mallet_wrapper) and it will not find it!

Retrieve characters names from the characters extraction phase, in order to add them to the stopwords list. 

In [5]:
import csv
import ast
csv_path = "characters_from_full_corpus.csv"

retrieved_characters =[]
with open(csv_path, 'r', encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file, delimiter=',')
        for row in csv_reader:
                retrieved_characters.extend(ast.literal_eval(row["male_characters_names"]) + ast.literal_eval(row["female_characters_names"]) + ast.literal_eval(row['unknown_gender_names']))

Define a function to split multi-word characters names 

In [6]:
def get_characters_names (characters_list):
    # empty honorifics list if you don't want to include in the model character-related words
    honorifics = []
    #honorifics = ["mr","master","mrs","miss","lady","sir","dame","lord","sister","mother","aunt","uncle","doctor","captain","father","count","professor","major","little","big","old","king","don","dr","queen","boy","girl","green","black","grey","gray","golden","silver","blue", "long"]
    output_set = set()
    #add to the previous list also the first names and last names taken by themselves (eg both "john smith" and "john" get in the list), excluding all the words in "honorifics"
    

    for item in characters_list:
        if " " in item:
            minilist = item.split()
            for w in minilist:
                if w not in honorifics:
                    output_set.add(w)
        else:
            output_set.add(item)
    
    output_list = sorted(output_set) #convert the set to a list
    return output_list

Extend the default MALLET stopwords list with characters' names and extra stopwords, to produce a custom stopwords list.

In [7]:
custom_stopwords = lmw_stopwords #initialize final list of custom stop words as containing the default stop words of lmw

more_stopwords = ["one","two","ones","say","says","said","think","thinks","thought","thing","things","go","goes","went","come","comes","came","coming","much","get","gets","got","ask","asks","asked","didn","would","could","three","other","another","until","till","upon","shall","make","made","might","must","going","way", "thou", "thee", "seems", "seem", "seemed", "never", "tell", "told", "tells", "wouldn", "tha", "like", "however","let","rather","yes", "no","little", "old", "well", "always", "never", "time", "long", "see", "saw", "sees", "illustration"]

# list of gender-related words to remove
gendered_stopwords = ['guy','dr','spokesman','chairman',"men's",'men','boy',
'boys','brother','brothers','dad','dads','dude','father',
'fathers','gentleman','gentlemen','god','grandfather','grandpa',
'grandson','groom','he','himself','his','husband','pastor','husbands','king','male','man',
'mr','nephew','nephews','priest','prince','son','sons','uncle','uncles',
'waiter','widower','widowers','lord','master',"mrs","miss","lady","sir","dame","lord","sister","mother","aunt","uncle","doctor","father","count","professor", 'heroine','drss','spokeswoman','chairwoman',"women's",'actress','women',"she's",'her','aunt','aunts','bride','daughter','daughters','female','fiancee','girl',
'girls','goddess','granddaughter','grandma','grandmother',
'herself','ladies','lady','mom','moms','mother','mothers','mrs','ms','niece',
'nieces','priestess','princess','queens','sister','sisters','waitress',
'widow','widows','wife','wives','woman','lady','mistress','queen']

characters_names = get_characters_names(retrieved_characters) #the function returns a list of strings that are (part of) a character's name

custom_stopwords.extend(more_stopwords)
custom_stopwords.extend(characters_names)
#removes words like "mr", "mrs", "mother", "father", etc.
#???????

#custom_stopwords.extend(gendered_stopwords) 

Master list of book titles in the dataset.

In [8]:
book_titles = [Path(file).stem for file in files]

**Preprocess the texts in the dataset.**

In [9]:
training_data = []

for file in tqdm(files):

    whole_text = open(file, 'r', encoding='utf-8').read() 

    # create list of pos tagged sentences of the text
    pos_tagged_sentences = [nltk.pos_tag(nltk.word_tokenize(sent)) for sent in nltk.sent_tokenize(whole_text)]
    # return all the nouns in the book, except custom stopwords

    #ATTENZIONE:
    nouns_in_book = []
    for l in pos_tagged_sentences:
        for (w, pos) in l:
            word = w.lower() # <-- re-define word as lowercase word!!
            if pos[0] == 'N' and word not in custom_stopwords and word.isascii() and not word.isnumeric():
                nouns_in_book.append(word)
    
    #questo funziona ma non è molto leggibile (vedi sopra per codice più verboso)
    # nouns_in_book = [w.lower() for l in pos_tagged_sentences for (w, pos) in l if pos[0] == 'N' and w.lower() not in custom_stopwords and (w.lower()).isascii() and not (w.lower()).isnumeric()]


    n = 1000 # number of nouns to include in each chunk
    # chunks the list of nouns for each book into x sublists with n items
    sublists_of_nouns = [nouns_in_book[i:i + n] for i in range(0, len(nouns_in_book), n)]

    chunk_pointer = 0
    for l in sublists_of_nouns:
        book_chunk = " ".join(l) # transform list to string
        training_data.append(book_chunk) #append the chunk to training data
    
        #write the book chunk to a file
        with open(f"training_data/{Path(file).stem}_chunk_{chunk_pointer}.txt", 'w', encoding='utf-8') as f:
            f.write(book_chunk)
        chunk_pointer += 1

100%|██████████| 266/266 [32:24<00:00,  7.31s/it] 


Given that processing the texts is a time-consuming operation and we might want to train our model several times in order to chose the best parameters, we stored the output of the text processing in files inside training_data. We can use the following function to re-create a variable for each chunk, retrieving the ready-to-use texts directly, i.e. skipping the processing phase, the next time we use this program.

In [10]:
#re-create the training data variable from saved files

training_data = []
processed_files_paths = glob.glob("training_data/*.txt") #creates a list with the file paths of the processed texts
for processed_text in processed_files_paths:
    ready_text = open(processed_text, 'r', encoding='utf-8').read()
    training_data.append(ready_text)

## Train the topic model

In [11]:
#specify the number of topics we want returned
num_topics = 40

#set desired output directory/ies (will be created inside current directory)
output_directory_path = 'topic-model-output/all_results'


#set topic model output files! No need to change anything below here
Path(f"{output_directory_path}").mkdir(parents=True, exist_ok=True)

path_to_training_data           = f"{output_directory_path}/training.txt"
path_to_formatted_training_data = f"{output_directory_path}/mallet.training"
path_to_model                   = f"{output_directory_path}/mallet.model.{str(num_topics)}"
path_to_topic_keys              = f"{output_directory_path}/mallet.topic_keys.{str(num_topics)}"
path_to_topic_distributions     = f"{output_directory_path}/mallet.topic_distributions.{str(num_topics)}"



Train the model with `little_mallet_wrapper.quick_train_topic_model()`.

In [12]:

little_mallet_wrapper.quick_train_topic_model(path_to_mallet,
                                             output_directory_path,
                                             num_topics,
                                             training_data)

Importing data...
Complete
Training topic model...
Complete


([['course',
   'children',
   'others',
   'something',
   'anything',
   'book',
   'voice',
   'sort',
   'gentleman',
   'ring',
   'right',
   'books',
   'everyone',
   'tea',
   'kind',
   'look',
   'girls',
   'everything',
   'police',
   'silence'],
  ['door',
   'house',
   'room',
   'night',
   'window',
   'bed',
   'floor',
   'morning',
   'kitchen',
   'chair',
   'windows',
   'stairs',
   'wall',
   'passage',
   'clothes',
   'anything',
   'dinner',
   'face',
   'servants',
   'steps'],
  ['men',
   'oxford',
   'tutor',
   'town',
   'friend',
   'crowd',
   'rooms',
   'friends',
   'street',
   'year',
   'sort',
   'streets',
   'party',
   'university',
   'fellow',
   'days',
   'clerks',
   'company',
   'hero',
   'river'],
  ['wife',
   'years',
   'house',
   'husband',
   'mind',
   'anything',
   'dear',
   'life',
   'evening',
   'nothing',
   'room',
   'ladies',
   'deal',
   'marriage',
   'gentleman',
   'sisters',
   'year',
   'pleasure',
   '

Let's train the model again, this time setting 20 as the number of topics.

In [13]:
num_topics = 20

little_mallet_wrapper.quick_train_topic_model(path_to_mallet,
                                             output_directory_path,
                                             num_topics,
                                             training_data)

Importing data...
Complete
Training topic model...
Complete


([['friend',
   'letter',
   'years',
   'life',
   'friends',
   'nothing',
   'gentleman',
   'anything',
   'mind',
   'subject',
   'pleasure',
   'manner',
   'year',
   'part',
   'deal',
   'men',
   'interest',
   'ladies',
   'town',
   'company'],
  ['mrs',
   'money',
   'face',
   'house',
   'letter',
   'room',
   'day',
   'child',
   'dear',
   'nothing',
   'heart',
   'morning',
   'street',
   'children',
   'course',
   'kind',
   'anything',
   'moment',
   'door',
   'voice'],
  ['men',
   'house',
   'day',
   'face',
   'days',
   'life',
   'heart',
   'brothers',
   'none',
   'england',
   'peril',
   'forth',
   'nay',
   'youth',
   'court',
   'walls',
   'country',
   'hast',
   'years',
   'news'],
  ['heart',
   'child',
   'face',
   'day',
   'life',
   'tears',
   'mind',
   'voice',
   'nothing',
   'children',
   'wife',
   'church',
   'room',
   'days',
   'work',
   'word',
   'soul',
   'truth',
   'thoughts',
   'years'],
  ['deck',
   'men',
