In [1]:
import pandas as pd
import numpy as np
import os
import time
# Switch the working directory so that relative paths used later resolve against this folder;
# presumably this is where the ContextEdge11 module and the input CSV live -- TODO confirm.
# NOTE(review): this is a machine-specific absolute path and must be edited on any other machine.
os.chdir('C:/Users/aacraig/Documents/ContextEdge')

In [2]:
from ContextEdge11 import ContextEdgePreprocessor

In [103]:
# Wall-clock timestamp taken immediately before the preprocessing call.
t0 = time.time()

# -------------------------------------------------------------------------------------------------
# ContextEdgePreprocessor returns three objects, unpacked below as:
#   - 'df': the dream dataset
#   - 'tfidf_dict': a dictionary of sparse tfidf matrices, keyed by text feature
#   - 'tfidf_features_dict': a dictionary of feature lists, keyed by text feature
#
# option notes:
# - 'min_proportion': sets value of 'min_df' in TfidfVectorizer
#                     NOTE(review): 200 is an integer; TfidfVectorizer treats an integer 'min_df'
#                     as an absolute document count rather than a proportion -- confirm intent.
# - 'max_proportion': sets value of 'max_df' in TfidfVectorizer
# - 'ngram_span': sets value of 'ngram_range' in TfidfVectorizer
# - 'perform_tfidf_ops': if set to False, only data cleaning and feature manipulations are performed.
#                        ** The returned objects 'tfidf_dict' and 'tfidf_features_dict' are empty
# - 'mode': either 'lemmatize' or 'stem'
# - 'text_features': list of the text features to include
# -------------------------------------------------------------------------------------------------

df, tfidf_dict, tfidf_features_dict = ContextEdgePreprocessor(
    filename='processed_contextedge_input_file.csv',
    raw_input_file=False,
    min_proportion=200,
    max_proportion=0.2,
    text_features=['all_scenes_text'],
    ngram_span=(1, 2),
    perform_tfidf_ops=True,
    mode='stem',
)

# Wall-clock timestamp taken right after the call; report the difference rounded to 0.1 s.
t1 = time.time()
elapsed_seconds = np.round(t1 - t0, decimals=1)
print('The function took {} seconds to run.'.format(elapsed_seconds))

No data cleaning was performed, since "raw_input_file" was set to False.


The tfidf fit for the stemmed text feature: all_scenes_text is being generated.
The tfidf fit for the stemmed text feature: all_scenes_text is complete.
There are: 1650 token features in the tfidf model.
The tfidf operations for the feature: all_scenes_text took 26.0 seconds to complete.
The function took 35.6 seconds to run.


#### Looking at the returned dataframe

In [104]:
# Preview the first two rows of the dataframe returned by the preprocessor.
df.head(2)

Unnamed: 0,id,name,date,scene_one,scene_two,scene_three,scene_four,location,context,year,month,day,country,state_region,all_scenes_text
0,Journal-000342,The Blood Moon,1980-05-15,Many years into my future I am driving a car t...,This dream happened many years ago but has nev...,,,"Ohio, United States",,1980,5,15,USA,Ohio,Many years into my future I am driving a car t...
1,Journal-000516,Vison of Jesus Prior to me getting saved,1981-01-10,I was living with my boyfriend at his mothers ...,,,,,,1981,1,10,,,I was living with my boyfriend at his mothers ...


# NMF Topic Model

#### Getting the tfidf matrix for 'all_scenes_text'

In [105]:
# Pull out the tfidf matrix stored under the 'all_scenes_text' key.
tfidf_matrix = tfidf_dict['all_scenes_text']
# Bare expression as the last line so the notebook displays the matrix summary.
tfidf_matrix

<50620x1650 sparse matrix of type '<class 'numpy.float64'>'
	with 1400800 stored elements in Compressed Sparse Row format>

#### Getting the list of feature names for 'all_scenes_text'

In [106]:
# Feature names stored under the 'all_scenes_text' key of the features dictionary.
tfidf_feature_names = tfidf_features_dict['all_scenes_text']

# Report how many features the list contains.
feature_count_message = 'There are: {} features in the tfidf model for "all_scenes_text".'
print(feature_count_message.format(len(tfidf_feature_names)))

There are: 1650 features in the tfidf model for "all_scenes_text".


#### Importing NMF method

In [107]:
from sklearn.decomposition import NMF

#### Setting the number of topics (components) that the NMF model will extract from the tfidf matrix

In [108]:
# Number of topics to extract; passed to NMF as 'n_components' when the estimator is defined.
no_topics = 15

#### Defining the NMF method

In [109]:
# Configure the NMF estimator: one component per topic, NNDSVD initialisation, and the
# coordinate-descent solver with a fixed random seed.
# NOTE(review): 'alpha' was deprecated in scikit-learn 1.0 in favour of 'alpha_W' / 'alpha_H';
# confirm the installed scikit-learn version before re-running this cell.
nmf = NMF(
    n_components=no_topics,
    init='nndsvd',
    solver='cd',
    alpha=0.1,
    l1_ratio=0.2,
    max_iter=500,
    shuffle=True,
    random_state=42,
    verbose=False,
)

#### Generating the NMF topic model for the tfidf matrix

In [110]:
# Fit the NMF estimator defined above on the tfidf matrix; the result is used below to display topics.
nmf_model = nmf.fit(tfidf_matrix)

#### Defining a helper function to display topics

In [111]:
def display_nmf_topics(model, feature_names, no_top_words):
    """Print the highest-weighted feature names for every topic of a fitted model.

    For each row of ``model.components_`` a header line ``Topic <index>:`` is
    printed, followed by one line with the selected feature names separated by
    single spaces, ordered from highest to lowest weight.

    Parameters
    ----------
    model : object
        Any object exposing ``components_``; iterating it must yield one
        array of per-feature weights per topic, each supporting ``argsort()``.
    feature_names : sequence of str
        Names indexed by the feature positions used in ``components_``.
    no_top_words : int
        How many feature names to print per topic.

    Returns
    -------
    None
        The function only writes to standard output.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic {:d}:".format(topic_idx))
        # argsort() is ascending, so reverse it and keep the leading positions.
        top_positions = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[position] for position in top_positions]
        print(" ".join(top_terms))

#### Setting the number of 'top words' to be displayed for each topic

In [112]:
# How many of the highest-weighted terms to print for each topic.
no_top_words = 20

#### Displaying the topics constructed by the NMF model

In [113]:
# Print the top 'no_top_words' terms of every topic in the fitted NMF model.
display_nmf_topics(nmf_model, tfidf_feature_names, no_top_words)

Topic 0:
people looked white man walking black looking hand standing right sky looked like started hair beautiful big look vision light face
Topic 1:
does mean does mean mean dream dreamt dream mean does know know mean dreaming does dream upwhat upwhat does woke upwhat person dreamed vision seeing night mean thank numbers
Topic 2:
god spirit lord jesus holy word pray holy spirit bless praying god bless love christ say vision life time things prayer heard
Topic 3:
baby pregnant birth boy girl baby girl baby boy holding babies child hospital born little having months gave care old arms children
Topic 4:
car driving road seat driving car drive got drove got car cars truck parked dream driving parking going hit passenger stop driver street
Topic 5:
room door bed went open opened living room living floor window bedroom woke walked outside opened door open door sleep bathroom heard doors
Topic 6:
water fish swimming clear pool boat river ocean swim beach deep coming clean waves lake wave sea

# Analyzing the topic model

#### Defining a way to assign each dream to one or more topics in the NMF model