In [None]:
# import the nltk package and download the stop words
import nltk
nltk.download('stopwords')

# import other useful language processing tools
import spacy
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases, Phraser
from gensim.parsing.preprocessing import strip_short
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

# import packages for visualization
!pip install seaborn==0.9.0
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
from sklearn.manifold import TSNE

# import other useful packages 
import re
import pandas as pd
import numpy as np

In [None]:
# load in data
# NOTE(review): 'poetry.csv' is resolved relative to the current working directory.
# Later cells read the 'content', 'author' and 'age' columns of this frame.
poetry_dataset = pd.read_csv('poetry.csv')
# show the first rows as a quick sanity check of the load
poetry_dataset.head()

In [None]:
def stop_word_extend(extension_list, extension_file):
    '''Function to extend the default nltk stopwords list
    
    Arguments
    ---------
    extension_list: a list of words to extend the list of stop words
    extension_file: path of a .txt file whose whitespace-separated words are added to the stop words
    
    Returns:
    --------
    en_stop_words: the final set of stop words (nltk english list + extension_list + file contents)
    '''
    # start from a set so duplicates across the three sources collapse automatically
    en_stop_words = set(stopwords.words('english'))
    en_stop_words.update(extension_list)
    # the context manager closes the file handle as soon as the words are read
    with open(extension_file) as stopword_file:
        en_stop_words.update(stopword_file.read().split())
    return en_stop_words

# the following domain specific stopwords are added to the stopwords list 
extension_list = ['thy', 'thou', 'let', 'thee', 'thine', 'thyself', 'tis', 'doth',
                  'upon', 'till', 'unto', 'hath', 'ye', '-PRON-', 'shalt']

# set up and print out the resulting stopwords list
# (stop_word_extend already returns a set, so no extra set() wrapper is needed)
stopword_list = stop_word_extend(extension_list, 'stopwords_extend.txt')

print("the total number of stopwords: {}".format(len(stopword_list)))
# a set of strings iterates in a different order on every kernel start;
# sorting makes the 10-word preview reproducible across runs
sorted(stopword_list)[:10]

In [None]:
def text_prepare(text, stop_word_extension_list, extension_list):
    '''Function to preprocess one poem from the content column of the dataset
    
    Relies on the module-level spaCy pipeline `nlp`, which must be loaded before
    this function is called.
    
    Arguments
    ---------
    text: a string that represents an individual poem
    stop_word_extension_list: a list of words to extend the list of stop words
    extension_list: despite its name, the path of a .txt file that contains words to
        extend the list of stop words (forwarded to stop_word_extend as extension_file)
    
    Returns
    -------
    text: preprocessed text of the poem (lower-cased, lemmatized, punctuation and
        stop words removed, short words stripped), tokens separated by single spaces
    '''
    # build the stop word set once per call; doing it inside the filter below would
    # re-read the extension file for every single token
    stop_words = stop_word_extend(stop_word_extension_list, extension_list)

    text = text.lower().strip() # strip spaces before and after the text
    # extract the lemma of words so that words can be reduced to their basic forms
    text = " ".join(token.lemma_ for token in nlp(text))
    # simple_preprocess tokenizes, drops punctuation and (deacc=True) accents
    tokens = simple_preprocess(text, deacc=True)
    tokens = [word for word in tokens if word not in stop_words] # remove stopwords
    text = " ".join(tokens)
    # repair the truncated lemma 'lov' -> 'love'; word boundaries keep the spaces
    # around it intact (consuming them would glue neighbouring words together)
    text = re.sub(r'\blov\b', 'love', text)
    text = strip_short(text) # remove words that are too short
    return text

# function to preprocess all the poetries and get rid of problematic rows that 
# contain copyright information 
def get_text_prepared(poetry_dataset, stop_word_extension_file, extension_list):
    '''Function to preprocess all the poetries and get rid of problematic rows that contain copyright information
    
    Arguments
    ---------
    poetry_dataset: the poetry dataset as a DataFrame with a 'content' column holding the poem text
    stop_word_extension_file: a .txt file that contains a list of words to extend the list of stop words
    extension_list: a list of words to extend the list of stop words
    
    Returns
    -------
    text_data: a list containing list of words from each poetry after the preprocessing
    poetry_dataset_reduced: an independent copy of the rows of poetry_dataset that were kept
        (same order as text_data), i.e. without the rows containing copyright
        information or without usable text
    '''
    flagged_words = {'copyright', 'permission', 'published'}
    text_data = []
    good_index = []
    for i, poetry in enumerate(poetry_dataset['content']):
        # missing entries (e.g. NaN from an empty csv cell) are not strings and
        # cannot be preprocessed, so they are dropped like the flagged rows
        if not isinstance(poetry, str):
            continue
        # drop the poem when any whitespace-separated token is a flagged word
        if not flagged_words.isdisjoint(poetry.lower().split()):
            continue
        # note the argument order: text_prepare takes the word list first, the file second
        prepared = text_prepare(poetry, extension_list, stop_word_extension_file)
        text_data.append(prepared.split())
        good_index.append(i)
    # return a real copy so callers can add columns without pandas'
    # SettingWithCopyWarning and without touching the original frame
    poetry_dataset_reduced = poetry_dataset.iloc[good_index, :].copy()
    return text_data, poetry_dataset_reduced

In [None]:
# preprocess all poetries
# NOTE(review): the 'en' shortcut name is believed to exist only in spaCy 2.x
# environments; confirm the installed version before changing it.
nlp = spacy.load('en', disable=['parser', 'ner'])
text_data, poetry_dataset_reduced = get_text_prepared(poetry_dataset,
                                                      'stopwords_extend.txt',
                                                      extension_list)

# print out a comparison between the preprocessed and raw text 
pprint.pprint(' '.join(text_data[0]))
# text_data is aligned with poetry_dataset_reduced (filtered rows are gone), so the
# matching raw poem is the first row of the reduced frame, not of the full dataset
print(poetry_dataset_reduced['content'].iloc[0])

In [None]:
def make_ngrams(text_data, min_count, threshold):
    '''Function to combine words together to form phrases so that the modeling afterwards can learn some context.
    
    Arguments
    ---------
    text_data: list of documents each of which is a list of words
    min_count: minimum total count a word / word pair needs to be considered by the
        phrase detector (applied to both the bigram and the trigram pass)
    threshold: score threshold for merging tokens into a phrase; higher values
        produce fewer phrases (applied to both passes)
    
    Returns
    -------
    text_data: list of documents each of which is a list of words, bigrams and trigrams
    '''
    # first pass: learn and apply bigrams
    bigram = Phrases(text_data, min_count=min_count, threshold=threshold)
    bigram_mod = Phraser(bigram)
    bigram_docs = [bigram_mod[doc] for doc in text_data]

    # second pass on the bigram-merged documents; min_count is passed explicitly so
    # both passes use the caller's setting instead of the library default
    trigram = Phrases(bigram_docs, min_count=min_count, threshold=threshold)
    trigram_mod = Phraser(trigram)

    return [trigram_mod[doc] for doc in bigram_docs]

In [None]:
def get_gensim_corpus(text_data, no_below=5, no_above=0.2):
    '''Function to get the dictionary and corpus for modeling. 
    
    Arguments
    ---------
    text_data: list of documents each of which is a list of words
    no_below: drop tokens that appear in fewer than this many documents (default 5)
    no_above: drop tokens that appear in more than this fraction of all documents (default 0.2)
    
    Returns
    -------
    corpus: list of documents each of which consists of tuple of word in their integer representation and their counts in the document
    dictionary: the mapping between words and their integer ids
    '''
    dictionary = corpora.Dictionary(text_data)
    # prune very rare and very common tokens before building the bag-of-words
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    corpus = [dictionary.doc2bow(text) for text in text_data]
    return corpus, dictionary

In [None]:
# merge frequent collocations into single tokens, then build the LDA inputs from them
text_data_grams = make_ngrams(text_data, min_count=5, threshold=70)
corpus, dictionary = get_gensim_corpus(text_data=text_data_grams)

In [None]:
n_topics_search_range = np.arange(2, 8, 1) # the number of topics to try
coherence_scores = [] # list to store the coherence scores
models = [] # list to store the LDA models (models[i] uses n_topics_search_range[i] topics)

# loop to fit LDA models of different number of topics
for n_topics in n_topics_search_range:
    print("training LDA model with %d topics" % (n_topics))
    # fit the model
    topic_model_LDA = LdaModel(corpus=corpus, id2word=dictionary,
                               num_topics=n_topics, random_state=10,
                               update_every=1, chunksize=80,
                               passes=20, alpha='auto',
                               per_word_topics=True)
    
    # compute the coherence scores 
    # the reference texts must be the n-gram documents the dictionary and corpus
    # were built from; the pre-n-gram text_data never contains the merged tokens
    coherence_score_LDA = CoherenceModel(model=topic_model_LDA, 
                                         texts=text_data_grams, coherence='c_v', 
                                         dictionary=dictionary).get_coherence()
    
    # store the coherence scores and models 
    coherence_scores.append(coherence_score_LDA)
    models.append(topic_model_LDA)

# convert the coherence scores into a dataframe for plotting 
# (indexed by the number of topics so it becomes the x axis of the plot)
coherence_scores_data = pd.DataFrame(coherence_scores, index=n_topics_search_range)


In [None]:
# coherence score as a function of the number of topics
plt.figure(figsize=(11, 9))
sns.set_style('white')
coherence_ax = sns.lineplot(data=coherence_scores_data)
coherence_ax.set_xlabel("the number topics trained")
coherence_ax.set_ylabel("coherence score")
plt.show()

In [None]:
# models[i] is the LdaModel trained with n_topics_search_range[i] topics, so index 1
# selects the 3-topic model. Indexing that model a second time does not pick a
# model: it is treated as a document lookup on the LdaModel itself.
model_used = models[1]
pprint.pprint(model_used.print_topics())

In [None]:
# !pip install pyLDAvis
# NOTE(review): this import sits outside the notebook's main import cell, so a fresh
# run depends on pyLDAvis already being installed (see the commented-out pip line).
import pyLDAvis.gensim


# switch pyLDAvis to notebook display mode before preparing the visualization
pyLDAvis.enable_notebook()
# build the visualization from the selected model, the bag-of-words corpus and the dictionary
vis = pyLDAvis.gensim.prepare(model_used, corpus, dictionary)
# bare last expression so the notebook renders the visualization as the cell output
vis

The above visualization contains the following elements:
* The bubbles on the left show the marginal distribution of each topic: the size of a bubble is related to the percentage of tokens in the corpus that the topic accounts for. 
* The further apart two bubbles are, the more their topics differ in meaning. Therefore, from the above visualization we can see that topics 1 and 2 have similar meanings. 
* The bar graph on the right shows information about each word. The red bar indicates the frequency of the token within the selected topic, and the light blue bar its overall frequency within the corpus. 
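* Furthermore, we can adjust the relevance metric (the λ slider) to get more information about a certain topic: tokens are then ranked not only by their frequency within the topic but also by their lift, i.e. how much more frequent they are within the topic than in the corpus overall.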

In [None]:
# dominant topic and full topic distribution of every document
doc_topic = []
doc_dist_list = []
for document in corpus:
    # minimum_probability=0 asks gensim not to drop low-probability topics, so every
    # row is expected to hold one probability per topic (needed for t-SNE)
    doc_dist = model_used.get_document_topics(document, minimum_probability=0)
    doc_dist_list.append([probability for _, probability in doc_dist])
    # id of the most probable topic (first one wins on ties)
    doc_topic.append(max(doc_dist, key=lambda pair: pair[1])[0])

# build a new frame instead of writing a column into the frame returned by
# get_text_prepared: this avoids SettingWithCopyWarning and keeps the cell re-runnable
poetry_dataset_reduced = (poetry_dataset_reduced
                          .reset_index(drop=True)
                          .assign(topics=doc_topic))

# project the topic distributions to 2D for plotting
tsne_result = TSNE(random_state=10).fit_transform(doc_dist_list)
tsne_result = pd.DataFrame(tsne_result, columns=['coordinate1', 'coordinate2'])

# both frames have a fresh 0..n-1 index, so the column-wise concat aligns row by row
poetry_dataset_prepared = pd.concat([poetry_dataset_reduced, tsne_result],
                                    axis=1)
# astype returns a new frame, so the result has to be assigned to take effect
poetry_dataset_prepared = poetry_dataset_prepared.astype({'topics': 'category'})

author_list = ['WILLIAM SHAKESPEARE', 'EDMUND SPENSER', 'WILLIAM BUTLER YEATS',
              'D. H. LAWRENCE', 'JOHN DONNE', 'WALLACE STEVENS']
author_mask = poetry_dataset_prepared.author.isin(author_list)
# drop topic categories that do not occur for the selected authors so that the
# palette below has exactly one colour per plotted topic
poetry_dataset_prepared_reduced = (
    poetry_dataset_prepared.loc[author_mask, :]
    .assign(topics=lambda frame: frame['topics'].cat.remove_unused_categories())
)

# number of poems per topic, split by age
plt.figure(figsize=(9,7))
sns.countplot(x='topics', hue='age', data=poetry_dataset_prepared)

# t-SNE map of the selected authors' poems, coloured by dominant topic
plt.figure(figsize=(9,7))
# count the topics of the frame that is actually plotted; counting them on the full
# dataset can give a palette whose length does not match the plotted hue levels
num_topics = len(poetry_dataset_prepared_reduced['topics'].cat.categories)
sns.scatterplot(x='coordinate1', y='coordinate2', style='author',
                hue='topics', data=poetry_dataset_prepared_reduced,
               palette=sns.color_palette("husl", num_topics))

plt.show()