In [219]:
import pandas as pd
import nltk
import pycountry_convert as pc
import numpy as np
import re
import string
import spacy
from gensim import corpora, models, similarities, matutils
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text
from sklearn.decomposition import TruncatedSVD
import pickle

In [224]:
df = pd.read_csv('fairytales.csv')
df.drop('Unnamed: 0', axis = 1, inplace = True)

In [225]:
# Divide dataframe into content and information of stories

story = df.body
story_info = df.drop('body', axis = 1)

In [231]:
story_info.head(5)

Unnamed: 0,title,author,region,country
0,A Clever Thief,Hindu Tales from the Sanskrit,Indian,India
1,A Lac of Rupees for a Piece of Advice,Joseph Jacobs,Indian,India
2,A Leaf from the Sky,Hans Christian Andersen,Danish Nordic Scandinavian,Scandinavia
3,A Legend of Confucius,The Chinese Fairy Book,Chinese,China
4,A Lesson for Kings,Joseph Jacobs,Indian,India


# Create continent column

The website provides the regions of the fairy tales, and I need to convert regions to continents.

## Nationality to country name

In [228]:
# use this file to find corresponding country name of each fairy tale

demonyms = pd.read_csv('demonyms.csv', header = None, names = ['nationality','country'])
story_info = story_info.merge(demonyms.rename({'nationality': 'region'},axis=1),on = 'region', how='left')

In [229]:
story_info.loc[(story_info.region == 'Native American North American'), 'country'] = 'North America'
story_info.loc[(story_info.region == 'Canadian Native American North American'), 'country'] = 'North America'
story_info.loc[(story_info.region == 'Native American'), 'country'] = 'North America'

story_info.loc[(story_info.region == 'Danish Nordic Scandinavian'), 'country'] = 'Scandinavia'
story_info.loc[(story_info.region == 'Nordic Scandinavian'), 'country'] = 'Scandinavia'
story_info.loc[(story_info.region == 'Danish Scandinavian'), 'country'] = 'Scandinavia'
story_info.loc[(story_info.region == 'Norwegian Scandinavian'), 'country'] = 'Scandinavia'
story_info.loc[(story_info.region == 'Nordic Norwegian Scandinavian'), 'country'] = 'Scandinavia'
story_info.loc[(story_info.region == 'English Nordic Scandinavian'), 'country'] = 'Scandinavia'

story_info.loc[(story_info.region == 'Korean'), 'country'] = 'Korea'

story_info.loc[(story_info.region == 'Catalan Spanish'), 'country'] = 'Spain'

story_info.loc[(story_info.region == 'Czechoslovak Finnish'), 'country'] = 'Finland'

story_info.loc[(story_info.region == 'Indian Pakistani'), 'country'] = 'Pakistan'

In [230]:
story_info.head(5)

Unnamed: 0,title,author,region,country
0,A Clever Thief,Hindu Tales from the Sanskrit,Indian,India
1,A Lac of Rupees for a Piece of Advice,Joseph Jacobs,Indian,India
2,A Leaf from the Sky,Hans Christian Andersen,Danish Nordic Scandinavian,Scandinavia
3,A Legend of Confucius,The Chinese Fairy Book,Chinese,China
4,A Lesson for Kings,Joseph Jacobs,Indian,India


## Country name to continent

In [77]:
def country_to_continent(country_name):
    """"
    A function that converts country names to continent names.
    """
    try:
        country_alpha2 = pc.country_name_to_country_alpha2(country_name)
        country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
        return country_continent_name
    except:
        return np.nan

In [78]:
story_info['continent_name'] = story_info['country'].apply(lambda x: country_to_continent(x))

In [79]:
story_info.loc[(story_info.country == 'North America'), 'continent_name'] = 'North America'
story_info.loc[(story_info.country == 'Hawaii'), 'continent_name'] = 'North America'
story_info.loc[(story_info.country == 'England'), 'continent_name'] = 'Europe'
story_info.loc[(story_info.country == 'Scotland'), 'continent_name'] = 'Europe'
story_info.loc[(story_info.country == 'Scandinavia'), 'continent_name'] = 'Europe'
story_info.loc[(story_info.country == 'Czechoslovakia'), 'continent_name'] = 'Europe'
story_info.loc[(story_info.country == 'Wales'), 'continent_name'] = 'Europe'
story_info.loc[(story_info.country == 'Cornwall'), 'continent_name'] = 'Europe'
story_info.loc[(story_info.country == 'Africa'), 'continent_name'] = 'Africa'
story_info.loc[(story_info.country == 'Rhodesia'), 'continent_name'] = 'Africa'
story_info.loc[(story_info.country == 'Korea'), 'continent_name'] = 'Asia'

In [80]:
story_info.loc[(story_info.region == 'Arabic'), 'continent_name'] = 'Asia'  # mostly Asia
story_info.loc[(story_info.region == 'Slavic'), 'continent_name'] = 'Europe'  # mostly Europe
story_info.loc[(story_info.region == 'Nordic'), 'continent_name'] = 'Europe'
story_info.loc[(story_info.region == 'Celtic'), 'continent_name'] = 'Asia'
story_info.loc[(story_info.region == 'Sami'), 'continent_name'] = 'Europe'
story_info.loc[(story_info.region == 'Maori'), 'continent_name'] = 'Oceania'
story_info.loc[(story_info.region == 'Bukovinian'), 'continent_name'] = 'Europe'
story_info.loc[(story_info.region == 'Danish Nordic'), 'continent_name'] = 'Europe'
story_info.loc[(story_info.region == 'Serbian Slavic'), 'continent_name'] = 'Europe'
story_info.loc[(story_info.region == 'Unknown'), 'continent_name'] = 'Unknown'

In [81]:
story_info.continent_name.value_counts()

Europe           1919
Asia              584
North America     432
Africa            110
Oceania            60
South America      31
Unknown             4
Name: continent_name, dtype: int64

In [221]:
with open('pickle_files/all_story.pickle', 'wb') as to_write:
    pickle.dump(story_all, to_write)

# Text Preprocessing

## Build a pipeline

In [83]:
class nlp_pipeline:
   

    def __init__(self, vectorizer=CountVectorizer(), sp = None, model = None):
        """
        A class for pipelining NLP cleaning and preprocessing. The user provides a series of 
        tools, and this class manages all of the training, transforming, and modification
        of the text data.
        ---
        Inputs:
        vectorizer: the model to use for vectorization of text data
        sp: Spacy model
        """
        
        self.model = model
        if not sp:
            self.sp = spacy.load('en')
        self.vectorizer = vectorizer
        self._is_fit = False

        
    def clean_text(self, text):
        """
        A function to clean text: remove notes within text, numbers, punctuations, 
        get words to lower case and lemmatize the text.
        It uses spaCy for text preprocessing.
        """

        # remove the notes within text
        remove_note = lambda x: re.sub('{.*}', ' ', x)

        # remove numbers
        alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)

        # remove punctuation, get lower case
        punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation),' ', x.lower())

        # apply the lambda functions above
        text = text.map(remove_note).map(alphanumeric).map(punc_lower)
        
        # lemmatize
        clean_text = []
        for indiv_text in text:
            indiv_text = self.sp(indiv_text) # automatically tokenized
            lemmatized = ' '.join([word.lemma_ for word in indiv_text])
            lemmatized = re.sub('(-PRON-)', '', lemmatized)
            clean_text.append(lemmatized) 

        
        return clean_text
    
    
    def fit(self, text):
        """
        Cleans the data and then fits the vectorizer with
        the user provided text
        """
        clean_text = self.clean_text(text)
        self.vectorizer.fit(clean_text)
        self._is_fit = True
              
        
    def get_feature_names(self):
        """
        Gets the feature names from the vectorizer.
        """
        
        if not self._is_fit:
            raise ValueError("Must fit the models before getting feature names!")
        
        return self.vectorizer.get_feature_names()
    
    def transform(self, text):
        """
        Cleans any provided data and then transforms the data into
        a vectorized format based on the fit function.
        """
        if not self._is_fit:
            raise ValueError("Must fit the models before transforming!")
        clean_text = self.clean_text(text)

        return self.vectorizer.transform(clean_text)
    
    
    def save_pipe(self, filename):
        """
        Writes the attributes of the pipeline to a file
        allowing a pipeline to be loaded later with the
        pre-trained pieces in place.
        """
        if type(filename) != str:
            raise TypeError("filename must be a string")
        pickle.dump(self.__dict__, open(filename+".mdl", 'wb'))
        
        
    def load_pipe(self, filename):
        """
        Writes the attributes of the pipeline to a file
        allowing a pipeline to be loaded later with the
        pre-trained pieces in place.
        """
        if type(filename) != str:
            raise TypeError("filename must be a string")
        if filename[-4:] != '.mdl':
            filename += '.mdl'
        self.__dict__ = pickle.load(open(filename, 'rb'))

The text is cleaned every time I train the pipeline. Cleaning the text takes a really long time for my dataset, so I'm using the two individual functions below when I'm still trying different parameters for CV and TFIDF. After I decide on the parameters of CV or TFIDF, I can train the whole pipeline and save it for future use.

In [84]:
sp = spacy.load('en')

def clean_text(text):
    """
    A function to clean text: remove notes within text, numbers, punctuations, 
    get words to lower case and lemmatize the text.
    It uses spaCy for text preprocessing.
    """
    
    # remove the notes within text
    remove_note = lambda x: re.sub('{.*}', ' ', x)
    
    # remove numbers
    alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)

    # remove punctuation, get lower case
    punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation),' ', x.lower())

    # apply the lambda functions above
    text = text.map(remove_note).map(alphanumeric).map(punc_lower)

    # lemmatize
    clean_text = []
    for indiv_text in text:
        indiv_text = sp(indiv_text) # automatically tokenized
        lemmatized = ' '.join([word.lemma_ for word in indiv_text])
        lemmatized = re.sub('(-PRON-)', '', lemmatized)
        clean_text.append(lemmatized) 


    return clean_text

def get_doc_term(vectorizer, doc):
    """
    A function that returns the fitted vectorizer, the doc-term matrix and feature names.
    """
    vec = vectorizer.fit(doc)
    matrix = vec.transform(doc)
    term = vec.get_feature_names()
    return vec, matrix, term

In [85]:
clean_story = clean_text(story)

In [220]:
with open('pickle_files/clean_story.pickle', 'wb') as to_write:
    pickle.dump(clean_story, to_write)
with open('pickle_files/story_info.pickle', 'wb') as to_write:
    pickle.dump(story_info, to_write)

## CountVectorizer

Bigram

In [87]:
# This cell trains the pipeline.

# cv1 = nlp_pipeline(CountVectorizer(ngram_range=(2,2), stop_words = 'english', min_df = 3))
# cv1.fit(story)
# cv1_matrix = cv1.transform(story)

In [88]:
cv1, cv1_matrix, cv1_terms = get_doc_term(CountVectorizer(ngram_range=(2,2), stop_words = 'english', min_df = 3), clean_story)

In [89]:
cv1_matrix.shape

(3140, 118016)

Trigram

In [92]:
cv2, cv2_matrix, cv2_terms = get_doc_term(CountVectorizer(ngram_range=(3,3),stop_words = 'english', min_df = 3), clean_story)

In [93]:
cv2_matrix.shape

(3140, 16770)

Bigram and trigram

In [96]:
cv3, cv3_matrix, cv3_terms = get_doc_term(CountVectorizer(ngram_range=(2,3),stop_words = 'english', min_df = 3), clean_story)

In [97]:
cv3_matrix.shape

(3140, 134786)

## TFIDF

Bigram  
Setting min_df=3 can remove really rare words, especially words from other languages

In [99]:
tf1, tf1_matrix, tf1_terms = get_doc_term(TfidfVectorizer(ngram_range=(2,2),stop_words = 'english', min_df = 3), clean_story)

In [100]:
tf1_matrix.shape

(3140, 118016)

Trigram

In [103]:
tf2, tf2_matrix, tf2_terms = get_doc_term(TfidfVectorizer(ngram_range=(3,3),stop_words = 'english', min_df = 3), clean_story)

In [104]:
tf2_matrix.shape

(3140, 16770)

Bigram and trigram

In [107]:
tf3, tf3_matrix, tf3_terms = get_doc_term(TfidfVectorizer(ngram_range=(2,3),stop_words = 'english', min_df = 3), clean_story)

In [108]:
tf3_matrix.shape

(3140, 134786)

Bigram. Also set max_df = 20 to exclude most frequent words.

In [109]:
tf4, tf4_matrix, tf4_terms = get_doc_term(TfidfVectorizer(ngram_range=(2,2),stop_words = 'english', min_df = 3, max_df = 20), clean_story)

In [110]:
tf4_matrix.shape

(3140, 111868)

Bigram. Try max_df = 10 too

In [111]:
tf5, tf5_matrix, tf5_terms = get_doc_term(TfidfVectorizer(ngram_range=(2,2),stop_words = 'english', min_df = 3, max_df = 10), clean_story)

In [112]:
tf5_matrix.shape

(3140, 101287)

Bigram and trigram. max_df = 10

In [113]:
tf6, tf6_matrix, tf6_terms = get_doc_term(TfidfVectorizer(ngram_range=(2,3),stop_words = 'english', min_df = 3, max_df = 10), clean_story)

In [114]:
tf6_matrix.shape

(3140, 117396)

Unigram. max_df = 10

In [115]:
tf7, tf7_matrix, tf7_terms = get_doc_term(TfidfVectorizer(ngram_range=(1,1),stop_words = 'english', min_df = 3, max_df = 20), clean_story)

In [116]:
tf7_matrix.shape

(3140, 8941)

Unigram and bigram

In [174]:
tf8, tf8_matrix, tf8_terms = get_doc_term(TfidfVectorizer(ngram_range=(1,2),stop_words = 'english', min_df = 3), clean_story)

In [175]:
tf8_matrix.shape

(3140, 131774)

# Topic Modeling

## LDA

LDA is too slow. Use LDAMulticore instead.

In [117]:
def LDA_model(vectorizer, doc_term, num_topics, passes, text = story):
    """
    A function for LDA topic modeling.
    Inputs: pipeline of vectorizer, text, number of topics, number of passes
    Output: LDA model and a list of generated topics.
    """
    
    doc_word = doc_term.transpose()
    corpus = matutils.Sparse2Corpus(doc_word)
    id2word = dict((v, k) for k, v in vectorizer.vocabulary_.items())
    lda = models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=id2word, passes=passes)
    topic_list = lda.print_topics()
    return lda, topic_list

## LSA

Only using tfidf as recommended

In [120]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """
    Displays the top words in each topic.
    """
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [121]:
def LSA_model(doc_term, num_topics):
    """
    A function that trains LSA model. 
    Input: document-term matrix and number of topics.
    Output: LSA model, document topic matrix, explained variance, topic term matrix.
    """
    lsa = TruncatedSVD(num_topics)
    doc_topic = lsa.fit_transform(doc_term)
    variance = lsa.explained_variance_ratio_
    topic_term = lsa.components_
    return lsa, doc_topic, variance, topic_term

In [122]:
lsa_tf1_1, doc_topic_tf1_1, variance_tf1_1, topic_term_tf1_1 = LSA_model(doc_term = tf1_matrix, num_topics = 5)
display_topics(lsa_tf1_1, tf1.get_feature_names(), 5)


Topic  0
old man, old woman, young man, say old, king son

Topic  1
old man, say old, old woman, war eagle, man old

Topic  2
old woman, little old, woman say, woman come, say old

Topic  3
ou jackalse, ou wolf, se ou, old woman, dat se

Topic  4
young man, say young, man tell, young woman, handsome young


In [123]:
lsa_tf1_2, doc_topic_tf1_2, variance_tf1_2, topic_term_tf1_2 = LSA_model(doc_term = tf1_matrix, num_topics = 8)
display_topics(lsa_tf1_2, tf1.get_feature_names(), 8)


Topic  0
old man, old woman, young man, say old, king son, say king, king daughter, man say

Topic  1
old man, say old, old woman, war eagle, man old, good old, man tell, mountain lion

Topic  2
old woman, little old, woman say, woman come, say old, poor old, woman sit, woman tell

Topic  3
ou jackalse, ou wolf, se ou, old woman, dat se, ou baviyàan, old hendrik, dis time

Topic  4
young man, say young, man tell, young woman, son king, handsome young, son law, white beaver

Topic  5
king son, say king, king daughter, old man, thou hast, say giant, thou art, thou wilt

Topic  6
st peter, peter say, young man, thou hast, say st, poor man, say lord, thou art

Topic  7
mr fox, mrs fox, king son, mr coyote, fox come, say mr, prairie dog, young man


In [124]:
lsa_tf1_3, doc_topic_tf1_3, variance_tf1_3, topic_term_tf1_3 = LSA_model(doc_term = tf1_matrix, num_topics = 10)
display_topics(lsa_tf1_3, tf1.get_feature_names(), 10)


Topic  0
old man, old woman, young man, say old, king son, say king, king daughter, man say, long time, come home

Topic  1
old man, say old, old woman, war eagle, man old, good old, man tell, mountain lion, time old, man make

Topic  2
old woman, little old, woman say, woman come, say old, poor old, woman sit, woman tell, man old, woman beg

Topic  3
ou jackalse, ou wolf, se ou, old woman, dat se, old hendrik, ou baviyàan, dis time, yust dat, like dat

Topic  4
young man, say young, man tell, son law, young woman, son king, handsome young, ju ju, white beaver, snuff box

Topic  5
king son, king daughter, say king, old man, say giant, thou art, thou hast, thou wilt, son king, king say

Topic  6
st peter, peter say, young man, say st, say lord, thou hast, poor man, lord say, thou wilt, thou art

Topic  7
mr fox, king son, mrs fox, mr coyote, say mr, fox come, prairie dog, bold bold, poor mr, fox jump

Topic  8
thou hast, thou art, poor man, thou wilt, hast thou, dost thou, thou shalt, 

In [125]:
################  The best LSA Model  ################

lsa_tf1_4, doc_topic_tf1_4, variance_tf1_4, topic_term_tf1_4 = LSA_model(doc_term = tf1_matrix, num_topics = 15)
display_topics(lsa_tf1_4, tf1.get_feature_names(), 15)

# Identify topics:
# 0:human kingdom; 1: people&animals; 2: poverty; 3:people&animals; 4: war; 5.kingdom; 6: religion; 7: animal characters;
# 8: rich vs. poor; 9: family; 10: rich vs. poor; 11: rich vs. poor; 12: people&animals; 13: kingdom; 14:norse mythology


Topic  0
old man, old woman, young man, say old, king son, say king, king daughter, man say, long time, come home, man come, king say, run away, come say, little girl

Topic  1
old man, say old, old woman, war eagle, man old, good old, man tell, mountain lion, time old, man make, tell old, duck people, man say, day old, bad old

Topic  2
old woman, little old, woman say, woman come, say old, poor old, woman sit, woman tell, man old, head cook, woman beg, woman little, woman look, woman live, woman ask

Topic  3
ou jackalse, ou wolf, se ou, old woman, dat se, old hendrik, ou baviyàan, dis time, yust dat, like dat, king lion, jackalse yust, darie ou, dat ou, se dat

Topic  4
young man, say young, man tell, ju ju, young woman, son law, handsome young, son king, white beaver, snuff box, man kill, man reply, tell young, bow arrow, man wife

Topic  5
king son, say king, king daughter, old man, say giant, thou hast, thou art, son king, thou wilt, king say, yellow lily, son say, say thou, tho

In [214]:
lsa_model = [lsa_tf1_4, doc_topic_tf1_4, variance_tf1_4, topic_term_tf1_4]

with open('pickle_files/lsa_bestmodel.pickle', 'wb') as to_write:
    pickle.dump(lsa_model, to_write)

In [215]:
with open('pickle_files/lsa_doc_term.pickle', 'wb') as to_write:
    pickle.dump(tf1_matrix, to_write)

In [232]:
with open('pickle_files/tfidf.pickle', 'wb') as to_write:
    pickle.dump(tf1, to_write)

In [126]:
lsa_tf2_1, doc_topic_tf2_1, variance_tf2_1, topic_term_tf2_1 = LSA_model(doc_term = tf2_matrix, num_topics = 5)
display_topics(lsa_tf2_1, tf2.get_feature_names(), 5)


Topic  0
say old man, old man say, say old woman, little old woman, old woman say

Topic  1
little old woman, old woman come, say little old, old woman say, monkey think trick

Topic  2
se ou jackalse, se ou wolf, ou jackalse yust, ou jackalse dat, ou jackalse ou

Topic  3
face blind man, potato face blind, little old woman, say potato face, village liver onion

Topic  4
say old man, little old woman, old man say, old man tell, old man come


In [127]:
lsa_tf2_2, doc_topic_tf2_2, variance_tf2_2, topic_term_tf2_2 = LSA_model(doc_term = tf2_matrix, num_topics = 8)
display_topics(lsa_tf2_2, tf2.get_feature_names(), 8)


Topic  0
say old man, old man say, say old woman, little old woman, old woman say, old man tell, old man come, old woman come

Topic  1
little old woman, say little old, old woman come, monkey think trick, old woman say, old woman make, old woman stand, old woman angry

Topic  2
se ou jackalse, se ou wolf, ou jackalse yust, ou jackalse dat, ou jackalse ou, dat se ou, darie ou jackalse, ou wolf dat

Topic  3
potato face blind, face blind man, little old woman, say potato face, village liver onion, near post office, corner near post, blind man sit

Topic  4
say old man, little old woman, old man say, old man tell, old man come, old man make, tell old man, time old man

Topic  5
draw nigh unto, thou hast spoken, spoken word ssarwala, sack air ssidi, hast spoken word, burst sack air, word ssarwala missdood, destiny thou hast

Topic  6
say old woman, old woman say, old man daughter, old woman come, say old man, old man old, poor old woman, old woman daughter

Topic  7
ha ha ha, ha ha laugh

In [128]:
lsa_tf2_3, doc_topic_tf2_3, variance_tf2_3, topic_term_tf2_3 = LSA_model(doc_term = tf2_matrix, num_topics = 10)
display_topics(lsa_tf2_3, tf2.get_feature_names(), 10)


Topic  0
say old man, old man say, say old woman, little old woman, old woman say, old man tell, old man come, old woman come, old man old, say young man

Topic  1
little old woman, old woman come, say little old, old woman say, monkey think trick, old woman make, old woman stand, old woman angry, old woman know, hard hard blow

Topic  2
se ou jackalse, se ou wolf, ou jackalse yust, ou jackalse dat, ou jackalse ou, dat se ou, darie ou jackalse, ou wolf dat, dat ou wolf, se se ou

Topic  3
face blind man, potato face blind, little old woman, say potato face, village liver onion, corner near post, near post office, blind man sit, ask potato face, blind man begin

Topic  4
say old man, little old woman, old man say, old man tell, old man come, old man make, tell old man, time old man, ask old man, old man begin

Topic  5
draw nigh unto, thou hast spoken, spoken word ssarwala, sack air ssidi, hast spoken word, burst sack air, word ssarwala missdood, ruler destiny thou, destiny thou hast, 

In [129]:
lsa_tf3_1, doc_topic_tf3_1, variance_tf3_1, topic_term_tf3_1 = LSA_model(doc_term = tf3_matrix, num_topics = 5)
display_topics(lsa_tf3_1, tf3.get_feature_names(), 5)


Topic  0
old man, old woman, young man, say old, king son

Topic  1
old man, say old man, say old, old man say, war eagle

Topic  2
ou jackalse, ou wolf, se ou, se ou jackalse, se ou wolf

Topic  3
old woman, ou jackalse, ou wolf, se ou, say old woman

Topic  4
young man, say young man, say young, young man say, young man tell


In [130]:
lsa_tf3_2, doc_topic_tf3_2, variance_tf3_2, topic_term_tf3_2 = LSA_model(doc_term = tf3_matrix, num_topics = 8)
display_topics(lsa_tf3_2, tf3.get_feature_names(), 8)


Topic  0
old man, old woman, young man, say old, king son, say king, king daughter, man say

Topic  1
old man, say old man, say old, old man say, war eagle, old woman, old man tell, man old

Topic  2
ou jackalse, ou wolf, se ou, se ou jackalse, se ou wolf, old man, dat se, old hendrik

Topic  3
old woman, ou jackalse, ou wolf, se ou, say old woman, little old woman, old woman say, se ou jackalse

Topic  4
young man, say young man, say young, young man say, young man tell, young man come, ju ju, son law

Topic  5
king son, say king, king daughter, say king son, old man, say giant, young man, son king

Topic  6
st peter, st peter say, peter say, young man, say st peter, say st, say lord, king son

Topic  7
mr fox, king son, mrs fox, mr coyote, say mr fox, young man, say mr, mr fox come


In [131]:
lsa_tf3_3, doc_topic_tf3_3, variance_tf3_3, topic_term_tf3_1 = LSA_model(doc_term = tf3_matrix, num_topics = 10)
display_topics(lsa_tf3_3, tf3.get_feature_names(), 10)


Topic  0
old man, old woman, young man, say old, king son, say king, king daughter, man say, long time, come home

Topic  1
old man, say old man, say old, old man say, war eagle, old woman, old man tell, man old, old man come, old man old

Topic  2
ou jackalse, ou wolf, se ou, se ou jackalse, se ou wolf, old man, dat se, old hendrik, ou baviyàan, dis time

Topic  3
old woman, ou jackalse, ou wolf, se ou, say old woman, little old woman, old woman say, se ou jackalse, little old, woman say

Topic  4
young man, say young man, say young, young man say, young man tell, young man come, man tell, young man reply, handsome young, young woman

Topic  5
king son, say king, king daughter, say king son, old man, say giant, thou hast, thou art, young man, son king

Topic  6
st peter, st peter say, peter say, young man, say st peter, say st, say lord, thou hast, lord say, poor man

Topic  7
mr fox, mrs fox, mr coyote, king son, say mr fox, say mr, fox come, mr fox come, prairie dog, young man

Top

In [132]:
lsa_tf4_1, doc_topic_tf4_1, variance_tf4_1, topic_term_tf4_1 = LSA_model(doc_term = tf4_matrix, num_topics = 10)
display_topics(lsa_tf4_1, tf4.get_feature_names(), 10)


Topic  0
ou jackalse, ou wolf, se ou, dat se, old hendrik, ou baviyàan, dis time, yust dat, king lion, like dat

Topic  1
mr fox, mrs brien, black fellow, say fin, prince ivan, man eater, charcoal burner, ju ju, rich brother, wild man

Topic  2
mr fox, mrs fox, mr coyote, prairie dog, bold bold, poor mr, fox jump, dance hard, old mr, soon mr

Topic  3
black fellow, eagle hawk, drive spear, fellow live, spear black, dead emu, place black, piece bark, sit bush, dog bite

Topic  4
bonne biche, beau minon, biche beau, dear blondine, say blondine, forest lilacs, say fin, blondine say, blondine know, blondine enter

Topic  5
ma ui, bonne biche, son chan, beau minon, wife chan, ui make, fish hook, prince ivan, say unto, lift sky

Topic  6
son chan, wife chan, say fin, say unto, chan say, draw nigh, nigh unto, fenians erin, mrs brien, fin maccumhail

Topic  7
ju ju, say fin, fenians erin, fin maccumhail, ju man, water ju, raja rasâlu, man eater, fin say, fin man

Topic  8
ju ju, ju man, water

In [133]:
lsa_tf5_1, doc_topic_tf5_1, variance_tf5_1, topic_term_tf5_1 = LSA_model(doc_term = tf5_matrix, num_topics = 10)
display_topics(lsa_tf5_1, tf5.get_feature_names(), 10)


Topic  0
ou jackalse, ou wolf, ou baviyàan, jackalse yust, king lion, darie ou, dat ou, se dat, look ou, den ou

Topic  1
say fin, prince ivan, bonne biche, golden horse, ma ui, beau minon, glass mountain, fenians erin, white bear, fin maccumhail

Topic  2
bonne biche, beau minon, biche beau, dear blondine, say blondine, forest lilacs, blondine say, blondine know, blondine enter, evil genius

Topic  3
ma ui, ui make, fish hook, lift sky, ui say, let line, great island, ui let, say ma, sacred bird

Topic  4
say fin, fenians erin, fin maccumhail, fin say, fin man, chew thumb, ask fin, conan maol, fin castle, come fin

Topic  5
mr coyote, prairie dog, water jar, little hen, end stick, fruit water, note san, hopi mesa, note hopi, home river

Topic  6
father horrigan, say dermod, dermod leary, save day, fairy scamper, priest supper, fine salmon, civil question, hear priest, horrigan tell

Topic  7
prince ivan, little hen, little cock, mrs fox, linden tree, rise red, baba yaga, spring drop,

In [134]:
lsa_tf6_1, doc_topic_tf6_1, variance_tf6_1, topic_term_tf6_1 = LSA_model(doc_term = tf6_matrix, num_topics = 10)
display_topics(lsa_tf6_1, tf6.get_feature_names(), 10)


Topic  0
ou jackalse, ou wolf, se ou jackalse, se ou wolf, ou baviyàan, king lion, ou jackalse yust, jackalse yust, darie ou, se dat

Topic  1
say fin, prince ivan, bonne biche, ma ui, beau minon, glass mountain, fenians erin, golden horse, fin maccumhail, ivan tsarevich

Topic  2
bonne biche, beau minon, bonne biche beau, biche beau, biche beau minon, dear blondine, say blondine, forest lilacs, blondine say, blondine know

Topic  3
ma ui, ui make, ma ui make, fish hook, lift sky, ma ui say, ui say, let line, great island, ui let

Topic  4
say fin, fenians erin, fin maccumhail, fin say, fin man, chew thumb, ask fin, conan maol, fin castle, come fin

Topic  5
mr coyote, father horrigan, prairie dog, say dermod, say father horrigan, water jar, ivan tsarevich, dermod leary, mrs fox, save day

Topic  6
father horrigan, say dermod, say father horrigan, dermod leary, save day, fairy scamper, priest supper, fine salmon, civil question, hear priest

Topic  7
raja rasâlu, play chaupur, king sa

In [135]:
lsa_tf7_1, doc_topic_tf7_1, variance_tf7_1, topic_term_tf7_1 = LSA_model(doc_term = tf7_matrix, num_topics = 10)
display_topics(lsa_tf7_1, tf7.get_feature_names(), 10)


Topic  0
coyote, iktomi, antler, slime, gum, bunny, skate, curlew, taos, mesa

Topic  1
dat, ou, se, jackalse, yust, dere, ole, wid, dey, dis

Topic  2
chan, ssidi, tângâri, ssarwala, pincer, carver, spoken, pagoda, missdood, jakzang

Topic  3
violette, ourson, agnella, passerose, drolette, aimee, venom, superintendent, nonchalante, indolent

Topic  4
tsarevich, tsarevna, squire, heifer, tsarina, ju, sigurd, pood, kirtle, tsaritsa

Topic  5
perseus, gorgon, medusa, andromeda, danaë, argos, athené, sigurd, polydecte, argo

Topic  6
sigurd, brynhild, fafnir, gunnar, regin, sigmund, grani, gudrun, volsung, blondine

Topic  7
blondine, biche, bonne, minon, beau, gourmandinet, lilacs, brunette, benin, fourbette

Topic  8
ju, calabar, foo, alligator, iktomi, effiong, hedgehog, eyo, outa, palaver

Topic  9
raja, rasâlu, sarkap, brahman, vizier, fakir, jôgi, rupee, chaupur, sâlbâhan


# LDAMulticore

In [136]:
def LDAMulticore_model(vectorizer, doc_term, num_topics, passes, chunksize, workers, text = story):
    """
    A function for LDA topic modeling.
    Inputs: vectorizer, document-term matrix,, number of topics, number of passes, chucksize, number of workers.
    Output: LDA model and a list of generated topics.
    """
    
    doc_word = doc_term.transpose()
    corpus = matutils.Sparse2Corpus(doc_word)
    id2word = dict((v, k) for k, v in vectorizer.vocabulary_.items())
    ldaMulticore = models.LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=id2word, 
                                       passes=passes, chunksize = chunksize, workers = workers)
    topic_list = ldaMulticore.print_topics()
    return ldaMulticore, topic_list

In [137]:
ldaMulticore_tf1_1, ldaMulticore_tf1_1_topics = LDAMulticore_model(vectorizer = tf1, doc_term = tf1_matrix, 
                                                               num_topics = 5, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf1_1, ldaMulticore_tf1_1_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6bb41b8fd0>,
 [(0,
   '0.000*"old woman" + 0.000*"bush rat" + 0.000*"main street" + 0.000*"white corn" + 0.000*"blue corn" + 0.000*"corn blue" + 0.000*"food cow" + 0.000*"little lamb" + 0.000*"yellow dog" + 0.000*"great spirit"'),
  (1,
   '0.001*"old man" + 0.001*"old woman" + 0.000*"young man" + 0.000*"long time" + 0.000*"say king" + 0.000*"run away" + 0.000*"little house" + 0.000*"water demon" + 0.000*"king daughter" + 0.000*"little boy"'),
  (2,
   '0.001*"old man" + 0.000*"hoo hoo" + 0.000*"say musician" + 0.000*"birch tree" + 0.000*"head servant" + 0.000*"mountain lion" + 0.000*"bend break" + 0.000*"lady moon" + 0.000*"war eagle" + 0.000*"little brother"'),
  (3,
   '0.001*"ou jackalse" + 0.001*"ou wolf" + 0.001*"se ou" + 0.000*"little hahsie" + 0.000*"old man" + 0.000*"king lion" + 0.000*"white feather" + 0.000*"old hendrik" + 0.000*"ou sculpat" + 0.000*"sea serpent"'),
  (4,
   '0.000*"oo oo" + 0.000*"oom jakhal" + 0.000*"little 

In [138]:
ldaMulticore_tf2_1, ldaMulticore_tf2_1_topics = LDAMulticore_model(vectorizer = tf2, doc_term = tf2_matrix, 
                                                               num_topics = 5, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf2_1, ldaMulticore_tf2_1_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6c44793290>,
 [(0,
   '0.001*"make sign cross" + 0.000*"old man leave" + 0.000*"ragnarök twilight god" + 0.000*"say old man" + 0.000*"exclaim foolish creature" + 0.000*"bear touch dead" + 0.000*"long time ago" + 0.000*"earth open eye" + 0.000*"year ago live" + 0.000*"tell wonderful story"'),
  (1,
   '0.001*"little old woman" + 0.001*"say old woman" + 0.001*"say old man" + 0.001*"old woman say" + 0.001*"say king son" + 0.001*"old woman come" + 0.001*"little old man" + 0.001*"old man say" + 0.001*"say young man" + 0.001*"great deal money"'),
  (2,
   '0.001*"say old man" + 0.001*"se ou jackalse" + 0.001*"old man say" + 0.001*"long long ago" + 0.001*"old man tell" + 0.001*"se ou wolf" + 0.001*"say old woman" + 0.001*"time old man" + 0.001*"old man make" + 0.001*"tell old man"'),
  (3,
   '0.001*"village cream puff" + 0.001*"fling open door" + 0.001*"happy hunting ground" + 0.001*"long time ago" + 0.001*"mount horse ride" + 0.001*"gold buck

In [139]:
ldaMulticore_tf3_1, ldaMulticore_tf3_1_topics = LDAMulticore_model(vectorizer = tf3, doc_term = tf3_matrix, 
                                                               num_topics = 5, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf3_1, ldaMulticore_tf3_1_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6bdbbb1450>,
 [(0,
   '0.001*"old man" + 0.001*"old woman" + 0.000*"young man" + 0.000*"long time" + 0.000*"say king" + 0.000*"hoo hoo" + 0.000*"water demon" + 0.000*"king daughter" + 0.000*"yellow lily" + 0.000*"poor man"'),
  (1,
   '0.001*"ou jackalse" + 0.001*"ou wolf" + 0.000*"se ou" + 0.000*"little hahsie" + 0.000*"se ou jackalse" + 0.000*"white feather" + 0.000*"king lion" + 0.000*"ou sculpat" + 0.000*"old hendrik" + 0.000*"se hahsie"'),
  (2,
   '0.001*"old man" + 0.000*"war eagle" + 0.000*"birch tree" + 0.000*"mountain lion" + 0.000*"black fellow" + 0.000*"white corn" + 0.000*"bend break" + 0.000*"blue corn" + 0.000*"corn blue" + 0.000*"yellow dog"'),
  (3,
   '0.000*"bush rat" + 0.000*"young man" + 0.000*"big brother" + 0.000*"little bird" + 0.000*"old farmer" + 0.000*"golden haired" + 0.000*"thou lt" + 0.000*"dead body" + 0.000*"beautiful field" + 0.000*"little brother"'),
  (4,
   '0.000*"old woman" + 0.000*"main street" + 0.

In [164]:
ldaMulticore_tf1_2, ldaMulticore_tf1_2_topics = LDAMulticore_model(vectorizer = tf1, doc_term = tf1_matrix, 
                                                               num_topics = 8, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf1_2, ldaMulticore_tf1_2_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6bb217add0>,
 [(0,
   '0.001*"yellow lily" + 0.000*"old man" + 0.000*"white corn" + 0.000*"say musician" + 0.000*"grandmother spider" + 0.000*"rich brother" + 0.000*"thou lt" + 0.000*"say peasant" + 0.000*"ruler heaven" + 0.000*"time cat"'),
  (1,
   '0.002*"ou jackalse" + 0.001*"ou wolf" + 0.001*"se ou" + 0.000*"black fellow" + 0.000*"young man" + 0.000*"old man" + 0.000*"little bird" + 0.000*"head servant" + 0.000*"year ago" + 0.000*"thou art"'),
  (2,
   '0.001*"water demon" + 0.000*"old man" + 0.000*"white corn" + 0.000*"blue corn" + 0.000*"corn blue" + 0.000*"yellow dog" + 0.000*"mrs brien" + 0.000*"little brother" + 0.000*"christmas eve" + 0.000*"golden basin"'),
  (3,
   '0.001*"hoo hoo" + 0.000*"little lamb" + 0.000*"se ole" + 0.000*"animal bird" + 0.000*"dat honey" + 0.000*"left foot" + 0.000*"pull sea" + 0.000*"honey se" + 0.000*"gum tree" + 0.000*"kill eat"'),
  (4,
   '0.001*"old man" + 0.001*"old woman" + 0.001*"young man" +

In [141]:
ldaMulticore_tf2_2, ldaMulticore_tf2_2_topics = LDAMulticore_model(vectorizer = tf2, doc_term = tf2_matrix, 
                                                               num_topics = 8, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf2_2, ldaMulticore_tf2_2_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6c18a29990>,
 [(0,
   '0.001*"fling open door" + 0.001*"ragnarök twilight god" + 0.001*"head rest hand" + 0.001*"little bird say" + 0.001*"long long ago" + 0.001*"mr fox come" + 0.000*"think good way" + 0.000*"word come blow" + 0.000*"open door let" + 0.000*"morning poor man"'),
  (1,
   '0.001*"die broken heart" + 0.001*"cow cow milk" + 0.001*"tell young woman" + 0.001*"say mrs brien" + 0.001*"hear sweet voice" + 0.001*"bear touch dead" + 0.001*"mrs brien say" + 0.001*"old woman house" + 0.001*"say old man" + 0.001*"seven year old"'),
  (2,
   '0.003*"se ou jackalse" + 0.001*"se ou wolf" + 0.001*"ou jackalse yust" + 0.001*"ju ju man" + 0.001*"tell year ago" + 0.001*"village cream puff" + 0.001*"exclaim foolish creature" + 0.001*"cahnt har ly" + 0.001*"say old woman" + 0.000*"policeman village cream"'),
  (3,
   '0.001*"say old man" + 0.001*"old man say" + 0.001*"old man tell" + 0.001*"rip van winkle" + 0.001*"old man leave" + 0.001*"say

In [142]:
ldaMulticore_tf3_2, ldaMulticore_tf3_2_topics = LDAMulticore_model(vectorizer = tf3, doc_term = tf3_matrix, 
                                                               num_topics = 8, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf3_2, ldaMulticore_tf3_2_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6beecf8050>,
 [(0,
   '0.000*"white corn" + 0.000*"food cow" + 0.000*"blue corn" + 0.000*"corn blue" + 0.000*"old man" + 0.000*"old dragon" + 0.000*"time cat" + 0.000*"manhattan island" + 0.000*"cow cow" + 0.000*"cat tail"'),
  (1,
   '0.001*"old man" + 0.001*"old woman" + 0.001*"young man" + 0.000*"say king" + 0.000*"little hahsie" + 0.000*"long time" + 0.000*"say old" + 0.000*"king say" + 0.000*"say prince" + 0.000*"till come"'),
  (2,
   '0.000*"white corn" + 0.000*"say musician" + 0.000*"grandmother spider" + 0.000*"beautiful field" + 0.000*"christmas tree" + 0.000*"little goat" + 0.000*"old man" + 0.000*"dive bring" + 0.000*"kill eat" + 0.000*"house boy"'),
  (3,
   '0.001*"hoo hoo" + 0.000*"bush rat" + 0.000*"sea serpent" + 0.000*"little bird" + 0.000*"holy virgin" + 0.000*"say virgin" + 0.000*"come pursuit" + 0.000*"ruler heaven" + 0.000*"king sheep" + 0.000*"animal bird"'),
  (4,
   '0.001*"old man" + 0.000*"water demon" + 0.000*

In [172]:
ldaMulticore_tf1_3, ldaMulticore_tf1_3_topics = LDAMulticore_model(vectorizer = tf1, doc_term = tf1_matrix, 
                                                               num_topics = 10, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf1_3, ldaMulticore_tf1_3_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6bb3f3d390>,
 [(0,
   '0.000*"say musician" + 0.000*"lady moon" + 0.000*"little hare" + 0.000*"little brother" + 0.000*"great spirit" + 0.000*"say moon" + 0.000*"die die" + 0.000*"christ child" + 0.000*"die live" + 0.000*"old man"'),
  (1,
   '0.000*"food cow" + 0.000*"little bird" + 0.000*"old dragon" + 0.000*"point eye" + 0.000*"fly cow" + 0.000*"st peter" + 0.000*"stroke cut" + 0.000*"look balloon" + 0.000*"wintry day" + 0.000*"stoop stroke"'),
  (2,
   '0.002*"old man" + 0.001*"old woman" + 0.001*"young man" + 0.001*"say king" + 0.000*"long time" + 0.000*"run away" + 0.000*"say old" + 0.000*"king daughter" + 0.000*"little girl" + 0.000*"come home"'),
  (3,
   '0.000*"oo oo" + 0.000*"big brother" + 0.000*"rich brother" + 0.000*"oom jakhal" + 0.000*"manhattan island" + 0.000*"little brother" + 0.000*"poor brother" + 0.000*"head chief" + 0.000*"dead body" + 0.000*"wirreenun say"'),
  (4,
   '0.001*"black fellow" + 0.001*"sea serpent" + 

In [144]:
ldaMulticore_tf2_3, ldaMulticore_tf2_3_topics = LDAMulticore_model(vectorizer = tf2, doc_term = tf2_matrix, 
                                                               num_topics = 10, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf2_3, ldaMulticore_tf2_3_topics


(<gensim.models.ldamulticore.LdaMulticore at 0x7f6c4646b0d0>,
 [(0,
   '0.001*"say old man" + 0.001*"door fling open" + 0.001*"say old woman" + 0.001*"seven year day" + 0.001*"cow cow milk" + 0.001*"poor old man" + 0.001*"run away master" + 0.001*"live far away" + 0.001*"wait long time" + 0.001*"oh yes say"'),
  (1,
   '0.001*"little old woman" + 0.001*"old woman say" + 0.001*"mount horse ride" + 0.001*"say old man" + 0.001*"water ju ju" + 0.001*"exclaim foolish creature" + 0.001*"come gather honey" + 0.001*"draw nigh unto" + 0.001*"poor man say" + 0.001*"say poor man"'),
  (2,
   '0.001*"happy hunting ground" + 0.001*"say old woman" + 0.001*"rip van winkle" + 0.001*"old man say" + 0.001*"say old man" + 0.001*"say old peter" + 0.001*"tell wonderful story" + 0.001*"hear voice look" + 0.001*"say mrs brien" + 0.001*"long long ago"'),
  (3,
   '0.002*"face blind man" + 0.002*"potato face blind" + 0.001*"old man daughter" + 0.001*"say old man" + 0.001*"say potato face" + 0.001*"old woman sa

In [173]:
ldaMulticore_tf3_3, ldaMulticore_tf3_3_topics = LDAMulticore_model(vectorizer = tf3, doc_term = tf3_matrix, 
                                                               num_topics = 10, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf3_3, ldaMulticore_tf3_3_topics


(<gensim.models.ldamulticore.LdaMulticore at 0x7f6c186479d0>,
 [(0,
   '0.000*"little lamb" + 0.000*"food cow" + 0.000*"cow cow" + 0.000*"shall walk" + 0.000*"cat tail" + 0.000*"point eye" + 0.000*"fly cow" + 0.000*"love slay" + 0.000*"way quiet" + 0.000*"holy virgin"'),
  (1,
   '0.001*"hoo hoo" + 0.000*"overtake throw" + 0.000*"pocket handkerchief" + 0.000*"christmas tree" + 0.000*"time cat" + 0.000*"great spirit" + 0.000*"dead body" + 0.000*"serpent king" + 0.000*"seize axe" + 0.000*"stroke cut"'),
  (2,
   '0.001*"old man" + 0.001*"yellow lily" + 0.001*"war eagle" + 0.000*"head servant" + 0.000*"mice people" + 0.000*"brown sister" + 0.000*"king sheep" + 0.000*"young man" + 0.000*"little boy" + 0.000*"captain guard"'),
  (3,
   '0.000*"golden ducat" + 0.000*"wood chopper" + 0.000*"make axe" + 0.000*"certain wood" + 0.000*"cool sweet" + 0.000*"eye bit" + 0.000*"oak forest" + 0.000*"pride joy" + 0.000*"fine tree" + 0.000*"blow away"'),
  (4,
   '0.000*"black fellow" + 0.000*"white cor

In [146]:
ldaMulticore_cv1_1, ldaMulticore_cv1_1_topics = LDAMulticore_model(vectorizer = cv1, doc_term = cv1_matrix, 
                                                               num_topics = 5, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_cv1_1, ldaMulticore_cv1_1_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6bdecb6fd0>,
 [(0,
   '0.004*"young man" + 0.002*"old woman" + 0.002*"old man" + 0.001*"long time" + 0.001*"king daughter" + 0.001*"return home" + 0.001*"come home" + 0.001*"far away" + 0.001*"father mother" + 0.001*"tell story"'),
  (1,
   '0.003*"old woman" + 0.002*"old man" + 0.001*"say king" + 0.001*"poor man" + 0.001*"young man" + 0.001*"come home" + 0.001*"run away" + 0.001*"man say" + 0.001*"long time" + 0.001*"open door"'),
  (2,
   '0.003*"old man" + 0.002*"old woman" + 0.001*"young man" + 0.001*"little girl" + 0.001*"little boy" + 0.001*"look like" + 0.001*"long time" + 0.001*"say old" + 0.001*"far away" + 0.001*"fly away"'),
  (3,
   '0.003*"ou jackalse" + 0.003*"old woman" + 0.002*"king son" + 0.002*"ou wolf" + 0.002*"young man" + 0.001*"say king" + 0.001*"old man" + 0.001*"se ou" + 0.001*"king daughter" + 0.001*"thou hast"'),
  (4,
   '0.008*"old man" + 0.003*"old woman" + 0.002*"say old" + 0.001*"little girl" + 0.001*"long 

In [147]:
ldaMulticore_cv1_2, ldaMulticore_cv1_2_topics = LDAMulticore_model(vectorizer = cv1, doc_term = cv1_matrix, 
                                                               num_topics = 8, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_cv1_2, ldaMulticore_cv1_2_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6bdf07d810>,
 [(0,
   '0.010*"old man" + 0.006*"old woman" + 0.003*"young man" + 0.002*"say old" + 0.001*"man say" + 0.001*"long time" + 0.001*"come home" + 0.001*"little girl" + 0.001*"man come" + 0.001*"little boy"'),
  (1,
   '0.001*"young man" + 0.001*"dweller asgard" + 0.001*"old woman" + 0.001*"tsarevich ivan" + 0.001*"say king" + 0.001*"far away" + 0.001*"old man" + 0.001*"long time" + 0.001*"ju ju" + 0.001*"look like"'),
  (2,
   '0.010*"ou jackalse" + 0.007*"ou wolf" + 0.004*"se ou" + 0.002*"little hahsie" + 0.001*"say lad" + 0.001*"ou sculpat" + 0.001*"mr fox" + 0.001*"king lion" + 0.001*"old hendrik" + 0.001*"dat se"'),
  (3,
   '0.002*"poor man" + 0.002*"old woman" + 0.001*"mrs brien" + 0.001*"good people" + 0.001*"say king" + 0.001*"rich man" + 0.001*"rich brother" + 0.001*"st peter" + 0.001*"say man" + 0.001*"young man"'),
  (4,
   '0.002*"old woman" + 0.001*"king son" + 0.001*"king daughter" + 0.001*"old man" + 0.001*"say 

In [148]:
ldaMulticore_tf1_4, ldaMulticore_tf1_4_topics = LDAMulticore_model(vectorizer = tf1, doc_term = tf1_matrix, 
                                                               num_topics = 15, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf1_4, ldaMulticore_tf1_4_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6bb2104250>,
 [(0,
   '0.000*"dry grass" + 0.000*"say stag" + 0.000*"holy place" + 0.000*"jack knife" + 0.000*"away tree" + 0.000*"ring centre" + 0.000*"black river" + 0.000*"green mountain" + 0.000*"pot hole" + 0.000*"look cave"'),
  (1,
   '0.001*"oo oo" + 0.001*"oom jakhal" + 0.000*"little bird" + 0.000*"oom leeuw" + 0.000*"drinking horn" + 0.000*"mother sheep" + 0.000*"animal bird" + 0.000*"dead body" + 0.000*"eagle hawk" + 0.000*"just reach"'),
  (2,
   '0.001*"food cow" + 0.001*"ruler heaven" + 0.000*"point eye" + 0.000*"fly cow" + 0.000*"water melon" + 0.000*"yang oerlang" + 0.000*"game hunter" + 0.000*"mud bank" + 0.000*"use big" + 0.000*"share food"'),
  (3,
   '0.001*"water demon" + 0.001*"sea serpent" + 0.001*"rich brother" + 0.001*"head servant" + 0.001*"poor brother" + 0.001*"golden ducat" + 0.000*"manhattan island" + 0.000*"great spirit" + 0.000*"water salt" + 0.000*"old woman"'),
  (4,
   '0.001*"yellow lily" + 0.000*"st a

In [165]:
ldaMulticore_tf1_5, ldaMulticore_tf1_5_topics = LDAMulticore_model(vectorizer = tf1, doc_term = tf1_matrix, 
                                                               num_topics = 12, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf1_5, ldaMulticore_tf1_5_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6bb1ebcb50>,
 [(0,
   '0.001*"yellow lily" + 0.001*"white corn" + 0.001*"little house" + 0.000*"grandmother spider" + 0.000*"yellow dog" + 0.000*"croak frog" + 0.000*"dead body" + 0.000*"little goat" + 0.000*"hide hill" + 0.000*"time dog"'),
  (1,
   '0.000*"lady moon" + 0.000*"white corn" + 0.000*"blue corn" + 0.000*"corn blue" + 0.000*"great spirit" + 0.000*"thou lt" + 0.000*"dat honey" + 0.000*"blow away" + 0.000*"long island" + 0.000*"ole missis"'),
  (2,
   '0.000*"bend break" + 0.000*"birch tree" + 0.000*"drinking horn" + 0.000*"mother sheep" + 0.000*"wirreenun say" + 0.000*"live earth" + 0.000*"blow hard" + 0.000*"sun moon" + 0.000*"water come" + 0.000*"sun ask"'),
  (3,
   '0.003*"ou jackalse" + 0.002*"ou wolf" + 0.001*"se ou" + 0.001*"little hahsie" + 0.001*"king lion" + 0.001*"old hendrik" + 0.001*"ou sculpat" + 0.001*"se hahsie" + 0.000*"big brother" + 0.000*"se ole"'),
  (4,
   '0.001*"main street" + 0.000*"little lamb" + 0.0

In [150]:
ldaMulticore_tf4_1, ldaMulticore_tf4_1_topics = LDAMulticore_model(vectorizer = tf4, doc_term = tf4_matrix, 
                                                               num_topics = 8, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf4_1, ldaMulticore_tf4_1_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6c447ba090>,
 [(0,
   '0.000*"water demon" + 0.000*"sea serpent" + 0.000*"white corn" + 0.000*"grandmother spider" + 0.000*"thou lt" + 0.000*"brown sister" + 0.000*"wirreenun say" + 0.000*"st anthony" + 0.000*"hold bone" + 0.000*"day feast"'),
  (1,
   '0.000*"say musician" + 0.000*"white corn" + 0.000*"christmas tree" + 0.000*"blue corn" + 0.000*"christ child" + 0.000*"corn blue" + 0.000*"king sheep" + 0.000*"serpent king" + 0.000*"little goat" + 0.000*"fine tree"'),
  (2,
   '0.000*"hoo hoo" + 0.000*"yellow lily" + 0.000*"bush rat" + 0.000*"bend break" + 0.000*"ou sculpat" + 0.000*"croak frog" + 0.000*"brown sister" + 0.000*"hide hill" + 0.000*"ruler heaven" + 0.000*"blow hard"'),
  (3,
   '0.000*"big brother" + 0.000*"oo oo" + 0.000*"golden haired" + 0.000*"food cow" + 0.000*"oom jakhal" + 0.000*"king gold" + 0.000*"church yard" + 0.000*"se ole" + 0.000*"water death" + 0.000*"overtake throw"'),
  (4,
   '0.000*"little lamb" + 0.000*"o

In [151]:
################  The best LDA Model  ################

ldaMulticore_tf4_2, ldaMulticore_tf4_2_topics = LDAMulticore_model(vectorizer = tf4, doc_term = tf4_matrix, 
                                                               num_topics = 10, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf4_2, ldaMulticore_tf4_2_topics

# 0: poor vs. rich; 1: holiday; 2:family; 3: farm; 4: human&animals; 5: human&animals; 6: nature; 7: animal characters;
# 8: animal characters; 9:love

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6bb210c910>,
 [(0,
   '0.001*"main street" + 0.001*"white corn" + 0.000*"rich brother" + 0.000*"old farmer" + 0.000*"grandmother spider" + 0.000*"old dragon" + 0.000*"poor brother" + 0.000*"man moon" + 0.000*"think peasant" + 0.000*"st anthony"'),
  (1,
   '0.000*"white corn" + 0.000*"mrs brien" + 0.000*"blue corn" + 0.000*"corn blue" + 0.000*"christmas tree" + 0.000*"christ child" + 0.000*"spin spin" + 0.000*"tir na" + 0.000*"magic ring" + 0.000*"climb sky"'),
  (2,
   '0.001*"oo oo" + 0.001*"bend break" + 0.001*"big brother" + 0.001*"ou sculpat" + 0.000*"little hahsie" + 0.000*"croak frog" + 0.000*"brown sister" + 0.000*"oom jakhal" + 0.000*"hide hill" + 0.000*"blow hard"'),
  (3,
   '0.000*"food cow" + 0.000*"drinking horn" + 0.000*"time cat" + 0.000*"old chief" + 0.000*"mother sheep" + 0.000*"say moon" + 0.000*"steal food" + 0.000*"wirreenun say" + 0.000*"point eye" + 0.000*"fly cow"'),
  (4,
   '0.001*"bush rat" + 0.001*"head servan

In [217]:
lda_model = [ldaMulticore_tf4_2, ldaMulticore_tf4_2_topics]

with open('pickle_files/lda_bestmodel.pickle', 'wb') as to_write:
    pickle.dump(lda_model, to_write)

with open('pickle_files/lda_doc_term.pickle', 'wb') as to_write:
    pickle.dump(tf4_matrix, to_write)

In [153]:
ldaMulticore_tf4_3, ldaMulticore_tf4_3_topics = LDAMulticore_model(vectorizer = tf4, doc_term = tf4_matrix, 
                                                               num_topics = 12, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf4_3, ldaMulticore_tf4_3_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6bb1fc0090>,
 [(0,
   '0.001*"little hahsie" + 0.001*"king lion" + 0.001*"ou sculpat" + 0.001*"white corn" + 0.001*"oo oo" + 0.001*"se hahsie" + 0.000*"old farmer" + 0.000*"grandmother spider" + 0.000*"rich brother" + 0.000*"oom jakhal"'),
  (1,
   '0.001*"old dragon" + 0.000*"mango fruit" + 0.000*"poor brâhmaṇ" + 0.000*"mango tree" + 0.000*"yangtze kiang" + 0.000*"mice people" + 0.000*"milky way" + 0.000*"bring fruit" + 0.000*"nether world" + 0.000*"magic wand"'),
  (2,
   '0.004*"ou jackalse" + 0.002*"ou wolf" + 0.001*"se ou" + 0.001*"water demon" + 0.001*"sea serpent" + 0.000*"dat se" + 0.000*"holy mother" + 0.000*"ho old" + 0.000*"dusty road" + 0.000*"ou baviyàan"'),
  (3,
   '0.001*"lady moon" + 0.000*"white corn" + 0.000*"mrs brien" + 0.000*"blue corn" + 0.000*"corn blue" + 0.000*"old chief" + 0.000*"christmas tree" + 0.000*"christ child" + 0.000*"hold bone" + 0.000*"farmer greatly"'),
  (4,
   '0.001*"black fellow" + 0.001*"say mu

In [154]:
ldaMulticore_tf5_1, ldaMulticore_tf5_1_topics = LDAMulticore_model(vectorizer = tf5, doc_term = tf5_matrix, 
                                                               num_topics = 8, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf5_1, ldaMulticore_tf5_1_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6bac670c90>,
 [(0,
   '0.001*"ou wolf" + 0.001*"ou jackalse" + 0.000*"croak frog" + 0.000*"hide hill" + 0.000*"food cow" + 0.000*"old dragon" + 0.000*"ruler heaven" + 0.000*"brown sister" + 0.000*"wirreenun say" + 0.000*"wood chopper"'),
  (1,
   '0.000*"ou sculpat" + 0.000*"little hahsie" + 0.000*"brown sister" + 0.000*"beautiful field" + 0.000*"yum yum" + 0.000*"spin spin" + 0.000*"st anthony" + 0.000*"poor brâhmaṇ" + 0.000*"mango fruit" + 0.000*"se hahsie"'),
  (2,
   '0.001*"yellow lily" + 0.001*"water demon" + 0.001*"bush rat" + 0.000*"say musician" + 0.000*"lady moon" + 0.000*"white corn" + 0.000*"blue corn" + 0.000*"corn blue" + 0.000*"man moon" + 0.000*"look balloon"'),
  (3,
   '0.000*"oo oo" + 0.000*"king lion" + 0.000*"oom jakhal" + 0.000*"sun ask" + 0.000*"oom leeuw" + 0.000*"pull sea" + 0.000*"little hahsie" + 0.000*"let monkey" + 0.000*"hold bone" + 0.000*"drink blood"'),
  (4,
   '0.001*"hoo hoo" + 0.000*"think peasant" + 

In [155]:
ldaMulticore_tf5_2, ldaMulticore_tf5_2_topics = LDAMulticore_model(vectorizer = tf5, doc_term = tf5_matrix, 
                                                               num_topics = 10, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf5_2, ldaMulticore_tf5_2_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6bd4c5ad10>,
 [(0,
   '0.001*"white corn" + 0.000*"grandmother spider" + 0.000*"thou lt" + 0.000*"time cat" + 0.000*"beautiful field" + 0.000*"church yard" + 0.000*"climb sky" + 0.000*"man wolf" + 0.000*"hen home" + 0.000*"work thou"'),
  (1,
   '0.001*"bend break" + 0.001*"sea serpent" + 0.001*"big brother" + 0.001*"food cow" + 0.000*"stroke cut" + 0.000*"wintry day" + 0.000*"stoop stroke" + 0.000*"point eye" + 0.000*"fly cow" + 0.000*"say gratitude"'),
  (2,
   '0.003*"ou jackalse" + 0.002*"ou wolf" + 0.001*"king lion" + 0.001*"little hahsie" + 0.001*"ou sculpat" + 0.001*"say musician" + 0.001*"oo oo" + 0.001*"se hahsie" + 0.000*"oom jakhal" + 0.000*"yellow dog"'),
  (3,
   '0.001*"hoo hoo" + 0.001*"mountain lion" + 0.001*"head servant" + 0.000*"think peasant" + 0.000*"manhattan island" + 0.000*"oom leeuw" + 0.000*"overtake throw" + 0.000*"sit bundle" + 0.000*"money belt" + 0.000*"old stick"'),
  (4,
   '0.000*"old dragon" + 0.000*"se 

In [156]:
ldaMulticore_tf6_1, ldaMulticore_tf6_1_topics = LDAMulticore_model(vectorizer = tf6, doc_term = tf6_matrix, 
                                                               num_topics = 8, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf6_1, ldaMulticore_tf6_1_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6bb1e8c510>,
 [(0,
   '0.000*"white corn" + 0.000*"blue corn" + 0.000*"corn blue" + 0.000*"st anthony" + 0.000*"wirreenun say" + 0.000*"water jar" + 0.000*"poor brâhmaṇ" + 0.000*"mango fruit" + 0.000*"mango tree" + 0.000*"man wolf"'),
  (1,
   '0.000*"big brother" + 0.000*"brown sister" + 0.000*"little goat" + 0.000*"manhattan island" + 0.000*"long island" + 0.000*"yum yum" + 0.000*"golden basin" + 0.000*"st george" + 0.000*"end island" + 0.000*"white fat"'),
  (2,
   '0.002*"ou jackalse" + 0.001*"ou wolf" + 0.000*"hoo hoo" + 0.000*"se ou jackalse" + 0.000*"sea serpent" + 0.000*"head servant" + 0.000*"yellow dog" + 0.000*"old dragon" + 0.000*"food cow" + 0.000*"captain guard"'),
  (3,
   '0.000*"yellow lily" + 0.000*"bend break" + 0.000*"white corn" + 0.000*"grandmother spider" + 0.000*"shall walk" + 0.000*"brown sister" + 0.000*"church yard" + 0.000*"say grasshopper" + 0.000*"look balloon" + 0.000*"yes yes yes"'),
  (4,
   '0.000*"ruler

In [157]:
ldaMulticore_tf6_2, ldaMulticore_tf6_2_topics = LDAMulticore_model(vectorizer = tf6, doc_term = tf6_matrix, 
                                                               num_topics = 10, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf6_2, ldaMulticore_tf6_2_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6bb1f1cf90>,
 [(0,
   '0.001*"yellow lily" + 0.000*"old dragon" + 0.000*"stroke cut" + 0.000*"stoop stroke" + 0.000*"wintry day" + 0.000*"cut serpent" + 0.000*"say gratitude" + 0.000*"slowly come" + 0.000*"child death" + 0.000*"seize axe"'),
  (1,
   '0.001*"water demon" + 0.000*"lady moon" + 0.000*"brown sister" + 0.000*"yellow dog" + 0.000*"think peasant" + 0.000*"yum yum" + 0.000*"say grasshopper" + 0.000*"white fat" + 0.000*"say deer" + 0.000*"time moon"'),
  (2,
   '0.001*"hoo hoo" + 0.000*"food cow" + 0.000*"brown sister" + 0.000*"beautiful field" + 0.000*"certain wood" + 0.000*"wood chopper" + 0.000*"make axe" + 0.000*"eye bit" + 0.000*"cool sweet" + 0.000*"oak forest"'),
  (3,
   '0.001*"mountain lion" + 0.001*"head servant" + 0.000*"white corn" + 0.000*"blue corn" + 0.000*"corn blue" + 0.000*"time cat" + 0.000*"eagle hawk" + 0.000*"manhattan island" + 0.000*"flask water" + 0.000*"say sparrow"'),
  (4,
   '0.000*"big brother" + 0

In [160]:
ldaMulticore_tf1_6, ldaMulticore_tf1_6_topics = LDAMulticore_model(vectorizer = tf1, doc_term = tf1_matrix, 
                                                               num_topics = 20, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf1_6, ldaMulticore_tf1_6_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6bb2103510>,
 [(0,
   '0.001*"brown sister" + 0.000*"yes baasje" + 0.000*"kill eat" + 0.000*"say farmer" + 0.000*"fat tail" + 0.000*"old brown" + 0.000*"fat say" + 0.000*"eat rat" + 0.000*"lot corn" + 0.000*"girl cat"'),
  (1,
   '0.001*"birch tree" + 0.001*"bend break" + 0.001*"lady moon" + 0.000*"little goat" + 0.000*"rich brother" + 0.000*"blow hard" + 0.000*"spin spin" + 0.000*"die die" + 0.000*"poor brother" + 0.000*"tree bend"'),
  (2,
   '0.005*"ou jackalse" + 0.003*"ou wolf" + 0.002*"se ou" + 0.001*"little hahsie" + 0.001*"king lion" + 0.001*"ou sculpat" + 0.001*"se hahsie" + 0.001*"old hendrik" + 0.001*"dat se" + 0.000*"eat fruit"'),
  (3,
   '0.005*"old man" + 0.003*"old woman" + 0.003*"young man" + 0.001*"long time" + 0.001*"say king" + 0.001*"poor man" + 0.001*"little boy" + 0.001*"run away" + 0.001*"say old" + 0.001*"little girl"'),
  (4,
   '0.000*"dead body" + 0.000*"climb sky" + 0.000*"remember message" + 0.000*"holy plac

In [161]:
ldaMulticore_tf7_1, ldaMulticore_tf7_1_topics = LDAMulticore_model(vectorizer = tf7, doc_term = tf7_matrix, 
                                                               num_topics = 8, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf7_1, ldaMulticore_tf7_1_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6bddb7f1d0>,
 [(0,
   '0.005*"coyote" + 0.004*"chan" + 0.003*"violette" + 0.003*"ourson" + 0.002*"arthur" + 0.002*"rosalie" + 0.002*"seer" + 0.002*"li" + 0.002*"ma" + 0.001*"ellen"'),
  (1,
   '0.004*"godfather" + 0.003*"iktomi" + 0.003*"brahman" + 0.002*"outa" + 0.002*"tablecloth" + 0.002*"carabao" + 0.002*"jakhal" + 0.002*"oom" + 0.002*"kangaroo" + 0.002*"sculptor"'),
  (2,
   '0.003*"rosette" + 0.002*"balloon" + 0.002*"banana" + 0.002*"peddler" + 0.002*"hoo" + 0.002*"merman" + 0.002*"simon" + 0.001*"security" + 0.001*"skyscraper" + 0.001*"redbreast"'),
  (3,
   '0.003*"dat" + 0.003*"sexton" + 0.003*"ou" + 0.003*"se" + 0.002*"johnny" + 0.002*"juan" + 0.002*"piccaninny" + 0.002*"giufà" + 0.002*"ole" + 0.002*"brâhmiṇ"'),
  (4,
   '0.004*"sigurd" + 0.003*"hedgehog" + 0.003*"caliph" + 0.002*"pope" + 0.002*"sindbad" + 0.002*"bailiff" + 0.002*"grethel" + 0.002*"anthony" + 0.002*"sequin" + 0.002*"florin"'),
  (5,
   '0.004*"tsarevich" + 0.004

In [222]:
ldaMulticore_tf8_1, ldaMulticore_tf8_1_topics = LDAMulticore_model(vectorizer = tf8, doc_term = tf8_matrix, 
                                                               num_topics = 10, passes = 100, chunksize = 500, workers = 6)
ldaMulticore_tf8_1, ldaMulticore_tf8_1_topics

(<gensim.models.ldamulticore.LdaMulticore at 0x7f6c106b9a90>,
 [(0,
   '0.001*"ou wolf" + 0.000*"yang" + 0.000*"sculptor" + 0.000*"old farmer" + 0.000*"buttonhole" + 0.000*"scarf" + 0.000*"kashim" + 0.000*"ane" + 0.000*"climb sky" + 0.000*"spin spin"'),
  (1,
   '0.000*"oerlang" + 0.000*"cow" + 0.000*"jakhal" + 0.000*"little house" + 0.000*"oo" + 0.000*"food cow" + 0.000*"jupiter" + 0.000*"croak frog" + 0.000*"tante" + 0.000*"jove"'),
  (2,
   '0.001*"weedah" + 0.000*"virgin" + 0.000*"lady moon" + 0.000*"pepper" + 0.000*"outa" + 0.000*"yellow dog" + 0.000*"fiend" + 0.000*"little lamb" + 0.000*"compound" + 0.000*"perseus"'),
  (3,
   '0.001*"dick" + 0.001*"mohawk" + 0.000*"yellow lily" + 0.000*"sprite" + 0.000*"humpie" + 0.000*"steal food" + 0.000*"pope" + 0.000*"tall tall" + 0.000*"wood chopper" + 0.000*"certain wood"'),
  (4,
   '0.001*"anthony" + 0.000*"jakhal" + 0.000*"bush rat" + 0.000*"pedro" + 0.000*"head servant" + 0.000*"curlew" + 0.000*"cheiron" + 0.000*"pottage" + 0.000*"pock

### Some observations from LDA and LSA models
- CountVectorizer is not good at differentiating topics in general.
- TFIDF with bigram is better than unigram, trigram or bigram&trigram.
- Topics make more sense then topic_num >= 8

**The best selected LDA model is used for recommender system**