In [2]:
import csv
import pickle
from pprint import pprint
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


def train_lda_model(path, num_topics=11,
                    output_dir="/Users/cmeaton/Documents/code/ds/METIS/sea19_ds7_workingdir/project_4/models"):
    '''Train an LDA topic model on preprocessed text and pickle the results.

    The default of 11 topics was selected to optimize the coherence score
    and the subjective interpretability of the topics.

    Args:
        path: Path to a CSV file; every row is read as one tokenized
            document (a list of strings).
        num_topics: Number of topics to fit. It is also used as the suffix
            of the pickled file names (e.g. ``lda_model_file_11.sav``).
        output_dir: Existing directory that receives the three pickle files
            (model, dictionary and bag-of-words corpus).

    Returns:
        None. The topics and the coherence score are printed, and the
        model, dictionary and corpus are written to ``output_dir``.
    '''
    import os  # local import: only needed to build the output paths

    # newline='' is what the csv module requires so that newlines embedded
    # in quoted fields are parsed correctly.
    with open(path, 'r', newline='') as f:
        data_words_nostops = list(csv.reader(f))

    # Map every unique token to an integer id.
    id2word = corpora.Dictionary(data_words_nostops)
    # Bag-of-words corpus: one list of (token_id, count) pairs per document.
    corpus = [id2word.doc2bow(text) for text in data_words_nostops]

    # Build LDA model (random_state is fixed so that runs are reproducible).
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)

    # Pickle model, dictionary and corpus for later use. Each file is opened
    # in a `with` block so the handle is flushed and closed deterministically
    # (the previous bare open() calls leaked the handles).
    artifacts = (
        ("lda_model_file_{}.sav", lda_model),
        ("id2word_{}.sav", id2word),
        ("corpus_{}.sav", corpus),
    )
    for name_template, artifact in artifacts:
        target = os.path.join(output_dir, name_template.format(num_topics))
        with open(target, 'wb') as out:
            pickle.dump(artifact, out)

    # Print the top keywords of each topic.
    pprint(lda_model.print_topics())
    # Compute the c_v coherence score of the fitted model.
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=data_words_nostops,
                                         dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
    

    
    
# Run the training on the preprocessed dreams CSV.
train_lda_model(
    '/Users/cmeaton/Documents/code/ds/METIS/sea19_ds7_workingdir/project_4/data/processed_data/processed_dreams.csv'
)


[(0,
  '0.029*"game" + 0.021*"school" + 0.015*"several" + 0.014*"park" + '
  '0.013*"class" + 0.010*"study" + 0.010*"teacher" + 0.009*"object" + '
  '0.009*"vividly" + 0.009*"character"'),
 (1,
  '0.034*"always" + 0.029*"sleep" + 0.021*"time" + 0.020*"life" + '
  '0.017*"nightmare" + 0.017*"night" + 0.016*"experience" + 0.015*"year" + '
  '0.015*"usually" + 0.011*"often"'),
 (2,
  '0.028*"hurt" + 0.025*"boy" + 0.020*"father" + 0.019*"leg" + 0.015*"noise" + '
  '0.012*"shower" + 0.011*"stomach" + 0.011*"ring" + 0.010*"god" + '
  '0.010*"toilet"'),
 (3,
  '0.059*"car" + 0.030*"drive" + 0.020*"road" + 0.012*"husband" + 0.011*"hill" '
  '+ 0.011*"bus" + 0.011*"chair" + 0.008*"shop" + 0.008*"driver" + '
  '0.008*"attention"'),
 (4,
  '0.017*"doctor" + 0.013*"hospital" + 0.012*"wind" + 0.012*"cheat" + '
  '0.010*"allow" + 0.009*"psoriasis" + 0.008*"horror" + 0.008*"touch" + '
  '0.007*"message" + 0.007*"river"'),
 (5,
  '0.045*"room" + 0.032*"door" + 0.018*"open" + 0.017*"house" + 0.016*"wal