In [72]:
import nltk
from tqdm import tqdm
import pandas as pd
from pprint import pprint
import csv

In [54]:
# Path to the Java-based MALLET software; later cells hand it to little_mallet_wrapper.
# NOTE(review): absolute, machine-specific path - adjust it to the local MALLET installation.
path_to_mallet = 'C:/mallet-2.0.8/bin/mallet'

# Import the packages below: the MALLET wrapper, plus libraries for working with files and the file system.
import little_mallet_wrapper
import seaborn
import glob
from pathlib import Path

In [55]:
# Paths to the two directories containing our documents (the "m" and "f" halves of the balanced corpus).
# NOTE(review): absolute, machine-specific paths - adjust them to where the corpus lives locally.
m_directory = "C:/Users/media/Desktop/gobbykid/balanced_corpus/m"
f_directory = "C:/Users/media/Desktop/gobbykid/balanced_corpus/f"

# Use the `glob.glob()` function to list every `.txt` file, first from the "m" directory and then from the "f" one.
files = [
    text_path
    for corpus_directory in (m_directory, f_directory)
    for text_path in glob.glob(f"{corpus_directory}/*.txt")
]


Import MALLET default stopwords.

In [56]:
#import LMW stop words list (stored in the variable "STOPS"), in order to extend it
from little_mallet_wrapper.little_mallet_wrapper import STOPS as lmw_stopwords #!important: import from little_mallet_wrapper.little_mallet_wrapper, otherwise it will look for STOPS inside __init__.py (inside little_mallet_wrapper) and it will not find it!

Retrieve the characters' names produced by the character-extraction phase, in order to add them to the stopwords list.

In [57]:
import csv
import ast

# CSV produced by the character-extraction phase. Each row is expected to hold three
# stringified Python sequences of names (male, female and unknown-gender characters).
csv_path = "characters_from_full_corpus.csv"

# Flat list of every character name found in the CSV (all three columns together).
retrieved_characters = []

# newline='' is what the csv module asks for when it is given a file object, so that
# line endings inside quoted fields are parsed correctly.
with open(csv_path, 'r', encoding='utf-8', newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file, delimiter=',')
    for row in csv_reader:
        # ast.literal_eval safely turns each stringified sequence back into a Python object
        for names_column in ("male_characters_names", "female_characters_names", "unknown_gender_names"):
            retrieved_characters.extend(ast.literal_eval(row[names_column]))

Define a function to split multi-word character names into their individual words.

In [58]:
def get_characters_names(characters_list, honorifics=()):
    """Return the sorted, de-duplicated words that make up the given character names.

    Names containing a space are split on whitespace, so that first names and
    last names also get in the result by themselves (e.g. "john smith" yields
    both "john" and "smith"). Names without a space are added unchanged.

    Parameters
    ----------
    characters_list : iterable of str
        Character names, possibly made of several words.
    honorifics : iterable of str, optional
        Words to leave out when a multi-word name is split. Defaults to no
        exclusions, i.e. every word of every name is returned. Pass a list
        such as the one suggested below to keep character-related words out
        of the result. Single-word names are never filtered.

    Returns
    -------
    list of str
        The unique name words, in ascending order.
    """
    # Suggested value for `honorifics`, if character-related words should not be returned:
    # ["mr","master","mrs","miss","lady","sir","dame","lord","sister","mother","aunt","uncle","doctor","captain","father","count","professor","major","little","big","old","king","don","dr","queen","boy","girl","green","black","grey","gray","golden","silver","blue", "long"]
    excluded_words = set(honorifics)  # set: constant-time membership tests
    output_set = set()

    for item in characters_list:
        if " " in item:
            # multi-word name: keep every component that is not an excluded word
            output_set.update(w for w in item.split() if w not in excluded_words)
        else:
            output_set.add(item)

    return sorted(output_set)  # sorted() converts the set to an ordered list

Extend the default MALLET stopwords list with characters' names and extra stopwords, to produce a custom stopwords list.

In [59]:
# Initialize the final list of custom stop words with a *copy* of the default LMW stop words.
# Binding the imported list directly would make `.extend()` below modify
# little_mallet_wrapper's own STOPS list in place, and every re-run of this cell
# would append the extra words once more.
custom_stopwords = list(lmw_stopwords)

more_stopwords = ["one","two","ones","say","says","said","think","thinks","thought","thing","things","go","goes","went","come","comes","came","coming","much","get","gets","got","ask","asks","asked","didn","would","could","three","other","another","until","till","upon","shall","make","made","might","must","going","way", "thou", "thee", "seems", "seem", "seemed", "never", "tell", "told", "tells", "wouldn", "tha", "like", "however","let","rather","yes", "no","little", "old", "well", "always", "never", "time", "long", "see", "saw", "sees", "illustration"]

# list of gender-related words to remove
gendered_stopwords = ['guy','dr','spokesman','chairman',"men's",'men','boy',
'boys','brother','brothers','dad','dads','dude','father',
'fathers','gentleman','gentlemen','god','grandfather','grandpa',
'grandson','groom','he','himself','his','husband','pastor','husbands','king','male','man',
'mr','nephew','nephews','priest','prince','son','sons','uncle','uncles',
'waiter','widower','widowers','lord','master',"mrs","miss","lady","sir","dame","lord","sister","mother","aunt","uncle","doctor","father","count","professor", 'heroine','drss','spokeswoman','chairwoman',"women's",'actress','women',"she's",'her','aunt','aunts','bride','daughter','daughters','female','fiancee','girl',
'girls','goddess','granddaughter','grandma','grandmother',
'herself','ladies','lady','mom','moms','mother','mothers','mrs','ms','niece',
'nieces','priestess','princess','queens','sister','sisters','waitress',
'widow','widows','wife','wives','woman','lady','mistress','queen']

characters_names = get_characters_names(retrieved_characters) #the function returns a list of strings that are (part of) a character's name

custom_stopwords.extend(more_stopwords)
custom_stopwords.extend(characters_names)

# Uncomment the next line to also remove words like "mr", "mrs", "mother", "father", etc.
#custom_stopwords.extend(gendered_stopwords)

Master list of book titles in the dataset.

In [60]:
# A book's title is its file name without the directory and the ".txt" extension.
book_titles = [Path(text_path).stem for text_path in files]

**Preprocess the texts in the dataset.**

```
training_data = []

# Set membership is a constant-time test; checking every token against the
# stop-word *list* scans the whole list each time.
stopwords_set = set(custom_stopwords)

# Make sure the folder for the processed chunks exists before writing into it.
chunks_directory = Path("training_data")
chunks_directory.mkdir(parents=True, exist_ok=True)

n = 1000 # number of nouns to include in each chunk

for file in tqdm(files):

    # read the whole book; the `with` block closes the file as soon as the text is loaded
    with open(file, 'r', encoding='utf-8') as book_file:
        whole_text = book_file.read()

    # create list of pos tagged sentences of the text
    pos_tagged_sentences = [nltk.pos_tag(nltk.word_tokenize(sent)) for sent in nltk.sent_tokenize(whole_text)]

    # return all the nouns in the book, except custom stopwords
    nouns_in_book = []
    for tagged_sentence in pos_tagged_sentences:
        for (w, pos) in tagged_sentence:
            word = w.lower() # <-- re-define word as lowercase word!!
            if pos[0] == 'N' and word not in stopwords_set and word.isascii() and not word.isnumeric():
                nouns_in_book.append(word)

    # chunks the list of nouns for each book into x sublists with n items
    sublists_of_nouns = [nouns_in_book[i:i + n] for i in range(0, len(nouns_in_book), n)]

    for chunk_pointer, chunk_nouns in enumerate(sublists_of_nouns):
        book_chunk = " ".join(chunk_nouns) # transform list to string
        training_data.append(book_chunk) #append the chunk to training data

        #write the book chunk to a file
        chunk_path = chunks_directory / f"{Path(file).stem}_chunk_{chunk_pointer}.txt"
        with open(chunk_path, 'w', encoding='utf-8') as chunk_file:
            chunk_file.write(book_chunk)
```

Given that processing the texts is a time-consuming operation and we might want to train our model several times in order to choose the best parameters, we stored the output of the text processing in files inside `training_data`. We can use the following cell to re-create the `training_data` list from those files, retrieving the ready-to-use texts directly, i.e. skipping the processing phase, the next time we use this program.

In [61]:
#re-create the training data variable from saved files
# NOTE(review): glob returns the paths in an arbitrary, platform-dependent order.

training_data = []
processed_files_paths = glob.glob("training_data/*.txt") #creates a list with the file paths of the processed texts
for processed_text in processed_files_paths:
    # the `with` block closes each file right after it has been read
    with open(processed_text, 'r', encoding='utf-8') as processed_file:
        ready_text = processed_file.read()
    training_data.append(ready_text)

## Training the topic model

We set the parameters for training the model with `little_mallet_wrapper.quick_train_topic_model()`. We must specify the path where the output files will be saved. Most importantly, we must choose the number of topics (`num_topics`) to be "discovered" inside our corpus: this is the only parameter we have control over in the training phase. Since there is no automatic way to choose this parameter, trial and error seems the best strategy. We start by setting `num_topics = 40`, and later we will try increasing and decreasing the value of this parameter, to see which of these values gives the best, i.e. most interpretable, results.

In [62]:

# Number of topics we want the model to return
num_topics = 40

# Desired output directory (will be created inside the current directory, parents included)
output_directory_path = 'topic-model-output/all_results'
Path(output_directory_path).mkdir(parents=True, exist_ok=True)

# Topic model output files, all derived from the two settings above - no need to change anything below here
path_to_training_data = output_directory_path + "/training.txt"
path_to_formatted_training_data = output_directory_path + "/mallet.training"
path_to_model = f"{output_directory_path}/mallet.model.{num_topics}"
path_to_topic_keys = f"{output_directory_path}/mallet.topic_keys.{num_topics}"
path_to_topic_distributions = f"{output_directory_path}/mallet.topic_distributions.{num_topics}"



Train the model with `little_mallet_wrapper.quick_train_topic_model()`.

In [63]:
"""
little_mallet_wrapper.quick_train_topic_model(path_to_mallet,
                                             output_directory_path,
                                             num_topics,
                                             training_data)

"""

'\nlittle_mallet_wrapper.quick_train_topic_model(path_to_mallet,\n                                             output_directory_path,\n                                             num_topics,\n                                             training_data)\n\n'

Let's train the model again, this time setting 20, 30, 50, 80 and 100 as the number of topics to get.

In [64]:
"""
num_topics = 20
little_mallet_wrapper.quick_train_topic_model(path_to_mallet,
                                             output_directory_path,
                                             20,
                                             training_data)


#30 topics
num_topics = 30
little_mallet_wrapper.quick_train_topic_model(path_to_mallet,
                                             output_directory_path,
                                             num_topics,
                                             training_data)
#50 topics
num_topics = 50
little_mallet_wrapper.quick_train_topic_model(path_to_mallet,
                                             output_directory_path,
                                             num_topics,
                                             training_data)

#80 topics
num_topics = 80
little_mallet_wrapper.quick_train_topic_model(path_to_mallet,
                                             output_directory_path,
                                             num_topics,
                                             training_data)


#100 topics
num_topics = 100
little_mallet_wrapper.quick_train_topic_model(path_to_mallet,
                                             output_directory_path,
                                             num_topics,
                                             training_data)

"""


'\nnum_topics = 20\nlittle_mallet_wrapper.quick_train_topic_model(path_to_mallet,\n                                             output_directory_path,\n                                             20,\n                                             training_data)\n\n\n#30 topics\nnum_topics = 30\nlittle_mallet_wrapper.quick_train_topic_model(path_to_mallet,\n                                             output_directory_path,\n                                             num_topics,\n                                             training_data)\n#50 topics\nnum_topics = 50\nlittle_mallet_wrapper.quick_train_topic_model(path_to_mallet,\n                                             output_directory_path,\n                                             num_topics,\n                                             training_data)\n\n#80 topics\nnum_topics = 80\nlittle_mallet_wrapper.quick_train_topic_model(path_to_mallet,\n                                             output_directory_path,\n          

## Analysis of the results

After having trained the model with 20, 30, 40 and 50 as the number of topics to return, we analyze the resulting topics. Our aim is to get the most coherent model, i.e. the model whose topics are most interpretable.

After the analysis, we concluded that the best value for the `num_topics` parameter is likely to lie between 40 and 50. We therefore decided to re-train the model, setting it to 45.

In [65]:
"""
#45 topics
num_topics = 45
little_mallet_wrapper.quick_train_topic_model(path_to_mallet,
                                             output_directory_path,
                                             num_topics,
                                             training_data)
"""

'\n#45 topics\nnum_topics = 45\nlittle_mallet_wrapper.quick_train_topic_model(path_to_mallet,\n                                             output_directory_path,\n                                             num_topics,\n                                             training_data)\n'

### The 45 topics discovered in our corpus. 

In [187]:
# Point the model paths at the output of the 45-topic run
num_topics = 45
path_to_model = f"{output_directory_path}/mallet.model.{str(num_topics)}"
path_to_topic_keys = f"{output_directory_path}/mallet.topic_keys.{str(num_topics)}"
path_to_topic_distributions = f"{output_directory_path}/mallet.topic_distributions.{str(num_topics)}"

topic_keys = little_mallet_wrapper.load_topic_keys(path_to_topic_keys) # list of lists of strings
print(len(topic_keys))

# One record per topic: its position in `topic_keys` and its key words joined into a single string
topics_data = [
    {'topic_index': idx, 'topic_words': ", ".join(topic)}
    for idx, topic in enumerate(topic_keys)
]

# newline='' keeps the csv module's own line terminator from being translated a second
# time (which produces a blank row after every record on Windows); the explicit
# encoding makes the file independent of the platform's default encoding.
with open('45_topic_keys.csv', 'w', newline='', encoding='utf-8') as csv_file:
    fields = ['topic_index', 'topic_words']
    writer = csv.DictWriter(csv_file, fieldnames=fields)
    writer.writeheader()
    writer.writerows(topics_data)

topics_df = pd.DataFrame(topics_data)
topics_df

45


Unnamed: 0,topic_index,topic_words
0,0,"face, look, voice, something, word, mind, bit,..."
1,1,"men, oxford, town, rooms, year, life, work, ri..."
2,2,"horses, road, day, whip, men, gentleman, cours..."
3,3,"child, day, face, children, heart, life, tears..."
4,4,"board, men, guns, ships, vessel, shot, crew, b..."
5,5,"school, term, house, fellows, door, right, stu..."
6,6,"yer, money, street, shop, purse, face, door, b..."
7,7,"church, men, books, sunday, soul, minister, se..."
8,8,"boys, school, study, form, room, fellows, fell..."
9,9,"night, door, nothing, house, something, anythi..."


## The topics distribution in our corpus

To preserve significant contextual information, we have trained our topic model on the *chunked* books, where each book was divided into chunks of up to 1000 nouns, and each chunk represented a document.

Now we need to apply the model trained in this way to the very same content (the nouns in each book), but divided into units that are meaningful for the interpretation of the results: we need to map the topics to each of our books (not to each chunk).

In order to do so, we first pre-process the books, removing stopwords and extracting only the nouns, without chunking them. Then we apply our best model to the new training data (a list of strings where each string consists of the *whole*, cleaned book).


Preprocess the **whole** texts, repeating the criteria chosen above, but without chunking them into equally sized sections: each document in the new model will be a string of all the nouns - separated by one whitespace character - in each of the original books. Let's also save the output of this second preprocessing phase in files inside the dedicated folder (`new_training_data`), as we did earlier for the text chunks.

```
new_training_data = []

# Set membership is a constant-time test; checking every token against the
# stop-word *list* scans the whole list each time.
stopwords_set = set(custom_stopwords)

# Make sure the folder for the processed books exists before writing into it.
whole_books_directory = Path("new_training_data")
whole_books_directory.mkdir(parents=True, exist_ok=True)

for file in tqdm(files):

    # read the whole book; the `with` block closes the file as soon as the text is loaded
    with open(file, 'r', encoding='utf-8') as book_file:
        whole_text = book_file.read()

    # create list of pos tagged sentences of the text
    pos_tagged_sentences = [nltk.pos_tag(nltk.word_tokenize(sent)) for sent in nltk.sent_tokenize(whole_text)]

    # return all the nouns in the book, except custom stopwords
    nouns_in_book_list = []
    for tagged_sentence in pos_tagged_sentences:
        for (w, pos) in tagged_sentence:
            word = w.lower() # <-- re-define word as lowercase word!!
            if pos[0] == 'N' and word not in stopwords_set and word.isascii() and not word.isnumeric():
                nouns_in_book_list.append(word)

    nouns_in_book = " ".join(nouns_in_book_list) #transform list of nouns into string
    new_training_data.append(nouns_in_book) # append all the nouns in the current book to new_training_data

    #write the nouns in the current book to a file
    with open(whole_books_directory / f"{Path(file).stem}.txt", 'w', encoding='utf-8') as nouns_file:
        nouns_file.write(nouns_in_book)
```

In [79]:
#re-create the training data variable from saved files
# NOTE(review): glob returns the paths in an arbitrary, platform-dependent order, and the
# order of `new_training_data` fixes the row order of the topic distributions inferred
# from it - confirm it matches the row order of the corpus metadata merged in later.

new_training_data = []
processed_files_paths = glob.glob("new_training_data/*.txt") #creates a list with the file paths of the processed texts
for processed_text in processed_files_paths:
    # the `with` block closes each file right after it has been read
    with open(processed_text, 'r', encoding='utf-8') as processed_file:
        ready_text = processed_file.read()
    new_training_data.append(ready_text)

Set the output paths where to store the results of the inferred model.

In [80]:
num_topics = 45

path_to_topic_keys = f"{output_directory_path}/mallet.topic_keys.{str(num_topics)}" #topics keys are the same for both chunked texts and whole books

# Folder for everything produced on the whole books. The paths below are handed to
# little_mallet_wrapper as output locations, and up to here only `output_directory_path`
# itself has been created, so create the sub-folder explicitly.
whole_books_output_directory = f"{output_directory_path}/results_on_whole_books"
Path(whole_books_output_directory).mkdir(parents=True, exist_ok=True)

path_to_new_training_data = f"{whole_books_output_directory}/training.txt"
path_to_new_formatted_training_data = f"{whole_books_output_directory}/mallet.training"

path_to_new_topic_distributions = f"{whole_books_output_directory}/mallet.topic_distributions.{str(num_topics)}"


Import the training data to feed to the inferring function.

In [81]:
# Import the whole-book documents, reusing the pipe of the formatted chunk-level
# training data (`use_pipe_from`); no explicit document ids are supplied.
little_mallet_wrapper.import_data(
    path_to_mallet,
    path_to_new_training_data,
    path_to_new_formatted_training_data,
    new_training_data,
    training_ids=None,
    use_pipe_from=path_to_formatted_training_data,
)

Importing data using pipe...
Complete


Infer the new topic model: we are getting *topic distributions* for a new set of documents (the *whole* books) using a model that has been trained on another set of documents (the books' *chunks*); nevertheless, the overall content of these two datasets is the same.

In [82]:
# Apply the model trained on the chunked documents to the actual (whole) books;
# the resulting distributions go to `path_to_new_topic_distributions`.
little_mallet_wrapper.infer_topics(
    path_to_mallet,
    path_to_model,
    path_to_new_formatted_training_data,
    path_to_new_topic_distributions,
)

Inferring topics using pre-trained model...
Complete


## Topic distributions over the corpus: results

In [248]:
topic_distributions = little_mallet_wrapper.load_topic_distributions(path_to_new_topic_distributions)

# One column per topic, named after the topic's position in `topic_keys`.
# The position comes straight from range(); `topic_keys.index(l)` rescans the list for
# every topic and returns the first match, so two topics with identical key words
# would end up with the same column name.
#column_names = [f"topic {idx}: " + ', '.join(keys) for idx, keys in enumerate(topic_keys)]
column_names = [f"topic {idx}" for idx in range(len(topic_keys))]
topic_distributions_df = pd.DataFrame(topic_distributions, columns=column_names)


Let's import metadata of our corpus.

In [249]:
# Load the corpus metadata table from a CSV file in the working directory.
# NOTE(review): the plotting cells read 'year', 'decade', 'authors_sex' and 'book_title'
# from the merged frame - presumably those columns come from this file; confirm.
corpus_data = pd.read_csv('whole_balanced_corpus.csv')

Now merge the metadata and the topic distributions in one big single DataFrame object, in order to use it for the visualizations.

In [250]:
# Column-wise concat pairs the rows by index label and silently fills the shorter frame
# with NaN, so a mismatch in the number of books must be caught explicitly.
if len(corpus_data) != len(topic_distributions_df):
    raise ValueError(
        f"corpus metadata has {len(corpus_data)} rows but there are "
        f"{len(topic_distributions_df)} topic distributions"
    )

# NOTE(review): rows are paired purely by position - this assumes the metadata rows are in
# the same order as the documents the distributions were inferred from; confirm.
complete_data_df = pd.concat([corpus_data, topic_distributions_df], axis=1)

In [251]:
import plotly.express as px
import chart_studio
import chart_studio.plotly as csp

In [252]:
import os

# Chart Studio credentials are read from the environment so that no secret is stored in
# (or shared with) the notebook. Set PLOTLY_API_KEY - and optionally PLOTLY_USERNAME -
# before starting Jupyter. To get a key: Chart Studio profile > settings > regenerate key.
# NOTE(review): the API key that used to be hard-coded in this cell has been exposed and
# should be regenerated.
username = os.environ.get('PLOTLY_USERNAME', 'eliarizzetto') # your username
api_key = os.environ.get('PLOTLY_API_KEY') # your api key
if not api_key:
    raise RuntimeError("Set the PLOTLY_API_KEY environment variable to your Chart Studio API key.")
chart_studio.tools.set_credentials_file(username=username, api_key=api_key)

In [253]:
topic_to_visualize = 'topic 19'

# Area chart of the chosen topic column against 'year', one colour per value of 'authors_sex'
area_chart_options = dict(
    x='year',
    y=topic_to_visualize,
    color='authors_sex',
    markers=True,
    hover_name='book_title',
    labels={},
    title=topic_to_visualize,
    color_discrete_sequence=['MediumSeaGreen', 'Tomato'],
)
fig = px.area(complete_data_df, **area_chart_options)
fig.show()

# create a file on the plotly chart studio account and print its link in this notebook
chart_link = csp.plot(fig, filename=topic_to_visualize, auto_open=False)
print(chart_link)


https://plotly.com/~eliarizzetto/20/


In [245]:
topic_to_visualize = 'topic 31'

# Grouped bar chart of the chosen topic column per 'decade', one colour per value of 'authors_sex'
bar_chart_options = dict(
    x="decade",
    y=topic_to_visualize,
    color='authors_sex',
    barmode='group',
    hover_name='book_title',
    height=400,
    color_discrete_sequence=['MediumSeaGreen', 'Tomato'],
    title=topic_to_visualize,
)
fig = px.bar(complete_data_df, **bar_chart_options)
fig.show()

# create a file on the plotly chart studio account and print its link in this notebook
chart_link = csp.plot(fig, filename=topic_to_visualize, auto_open=False)
print(chart_link)


https://plotly.com/~eliarizzetto/22/


## Topic interpretation

I have tried to label as many topics as I could, based on my interpretation of the set of words in each topic and on the themes one would arguably expect to find in 19th-century children's literature.

In [256]:


# Load the manually assigned topic labels from a CSV file in the working directory
# and pretty-print the resulting table.
labelled_topics = pd.read_csv('labelled_topics.csv')
pprint(labelled_topics)

    Topic index                                              Label  \
0             2                                    Topic 2: Horses   
1             4                          Topic 4: Navy and pirates   
2             6                               Topic 6: Street life   
3             7                       Topic 7: Church and religion   
4             8                       Topic 8: School and studying   
5             9                        Topic 9: Night and sleeping   
6            16                       Topic 16: Sea and navigation   
7            17                       Topic 17: House and interior   
8            19                                 Topic 19: Open air   
9            20  Topic 20: Indian Americans and life in the wil...   
10           25                  Topic 25: Feelings and sentiments   
11           27                                   Topic 27: Nation   
12           31                           Topic 31: Armed conflict   
13           31     