In [1]:
import pandas as pd
import numpy as np
import os
import time
# Switch to the project directory before the local import and the relative
# input path used further down (presumably where ContextEdge10 and the input
# CSV live -- confirm). Set the CONTEXTEDGE_DIR environment variable to run
# the notebook on another machine; the original location is the default.
os.chdir(os.environ.get('CONTEXTEDGE_DIR', 'C:/Users/aacraig/Documents/ContextEdge'))

In [2]:
from ContextEdge10 import ContextEdgePreprocessor

In [293]:
# Wall-clock timestamp taken before preprocessing starts.
t0 = time.time()

# -------------------------------------------------------------------------------------------------
# ContextEdgePreprocessor returns three objects, unpacked below:
#   df                  : the dream dataset
#   tfidf_dict          : dictionary of sparse tfidf matrices, keyed by text feature
#   tfidf_features_dict : dictionary of feature lists, keyed by text feature
#
# Option notes:
#   min_proportion    -> value of 'min_df' in TfidfVectorizer
#   max_proportion    -> value of 'max_df' in TfidfVectorizer
#   ngram_span        -> value of 'ngram_range' in TfidfVectorizer
#   perform_tfidf_ops -> if False, only data cleaning and feature manipulations are performed,
#                        and 'tfidf_dict' / 'tfidf_features_dict' come back empty
#   mode              -> either 'lemmatize' or 'stem'
#   text_features     -> list of the text features to include
#
# NOTE(review): min_proportion is the integer 200 here; TfidfVectorizer reads an integer
# 'min_df' as an absolute document count, not a proportion -- confirm that the
# preprocessor forwards the value unchanged.
# -------------------------------------------------------------------------------------------------
df, tfidf_dict, tfidf_features_dict = ContextEdgePreprocessor(
    filename='processed_contextedge_input_file.csv',
    raw_input_file=False,
    min_proportion=200,
    max_proportion=0.35,
    text_features=['all_scenes_text'],
    ngram_span=(1, 3),
    perform_tfidf_ops=True,
    mode='stem',
)

# Report the elapsed time, rounded to one decimal place.
t1 = time.time()
print('The function took {} seconds to run.'.format(np.round(t1 - t0, decimals=1)))

No data cleaning was performed, since "raw_input_file" was set to False.


The tfidf fit for the stemmed text feature: all_scenes_text is being generated.
The tfidf fit for the stemmed text feature: all_scenes_text is complete.
There are: 1660 token features in the tfidf model.
The tfidf operations for the feature: all_scenes_text took 36.9 seconds to complete.
The function took 49.0 seconds to run.


#### Looking at the output dataframe

In [294]:
df.head()

Unnamed: 0,id,name,date,scene_one,scene_two,scene_three,scene_four,location,context,year,month,day,country,state_region,all_scenes_text
0,Journal-000342,The Blood Moon,1980-05-15,Many years into my future I am driving a car t...,This dream happened many years ago but has nev...,,,"Ohio, United States",,1980,5,15,USA,Ohio,Many years into my future I am driving a car t...
1,Journal-000516,Vison of Jesus Prior to me getting saved,1981-01-10,I was living with my boyfriend at his mothers ...,,,,,,1981,1,10,,,I was living with my boyfriend at his mothers ...
2,Journal-001517,Losing control and fear,1992-05-19,The first dream I remember was of me knocking ...,,,,"Pennsylvania, United States",,1992,5,19,USA,Pennsylvania,The first dream I remember was of me knocking ...
3,Journal-001554,Dream Refresh Edit,1993-07-23,Dasdasdadadasdsad,,,,"Cherkas, Ukr",,1993,7,23,Ukraine,Cherkas,Dasdasdadadasdsad
4,Journal-000149,Dream,1996-08-01,Im sitting in the church school building Someo...,,,,,Joy,1996,8,1,,,Im sitting in the church school building Someo...


#### Looking at the features for 'all_scenes_text'

In [295]:
tfidf_features_dict['all_scenes_text']

['able',
 'accident',
 'acting',
 'actual',
 'actually',
 'add',
 'adult',
 'adv',
 'advance',
 'afraid',
 'afternoon',
 'age',
 'ago',
 'ago dream',
 'agreed',
 'ahead',
 'air',
 'airport',
 'al',
 'alive',
 'allow',
 'allowed',
 'alot',
 'amazed',
 'amazing',
 'amen',
 'american',
 'angel',
 'angels',
 'angry',
 'animal',
 'animals',
 'anointing',
 'answer',
 'answered',
 'answers',
 'anybody',
 'anymore',
 'apart',
 'apartment',
 'apostle',
 'apparently',
 'appear',
 'appeared',
 'appears',
 'appreci',
 'appreciate',
 'appreciated',
 'approached',
 'area',
 'arm',
 'arms',
 'army',
 'arrived',
 'ask',
 'asked',
 'asked god',
 'asking',
 'asks',
 'asleep',
 'ate',
 'attached',
 'attack',
 'attacked',
 'attacking',
 'attend',
 'attention',
 'aunt',
 'authority',
 'awake',
 'aware',
 'away',
 'awhile',
 'awoke',
 'babies',
 'baby',
 'baby boy',
 'baby girl',
 'background',
 'backyard',
 'bad',
 'bag',
 'bags',
 'ball',
 'bank',
 'barely',
 'basement',
 'basically',
 'bathroom',
 'battl

#### Looking at the sparse tfidf matrix

In [296]:
tfidf_dict['all_scenes_text']

<50620x1660 sparse matrix of type '<class 'numpy.float64'>'
	with 1463487 stored elements in Compressed Sparse Row format>

# NMF

In [297]:
from sklearn.decomposition import NMF

#### Extract tfidf matrix for feature: 'all_scenes_text'

In [298]:
tfidf_matrix = tfidf_dict['all_scenes_text']

#### Extract the tfidf feature names for the feature: 'all_scenes_text'

In [299]:
tfidf_feature_names = tfidf_features_dict['all_scenes_text']

In [300]:
len(tfidf_feature_names)

1660

#### Define base NMF method

In [301]:
# Base NMF estimator with 8 components (one per topic shown further down).
# random_state is pinned so repeated runs give the same factorisation.
# NOTE(review): `alpha` was deprecated in scikit-learn 1.0 in favour of
# `alpha_W` / `alpha_H` -- confirm the installed version still accepts it.
nmf = NMF(
    n_components=8,
    init='nndsvd',
    solver='cd',
    max_iter=1000,
    random_state=42,
    alpha=0.1,
    l1_ratio=.2,
    shuffle=True,
    verbose=False,
)

In [302]:
# Fit the NMF model on the tfidf matrix for 'all_scenes_text'; the result is
# what display_topics reads `components_` from below.
nmf_fit = nmf.fit(tfidf_matrix)

#### Looking at the topics generated by the NMF method

In [303]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted features of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterating ``components_`` must yield one weight vector per topic;
        each vector must support ``argsort()``.
    feature_names : sequence of str
        Feature labels, indexed by position in a topic's weight vector.
    no_top_words : int
        Number of features to print per topic.

    Returns
    -------
    None
        For each topic, a ``Topic <index>:`` header line is printed, followed
        by one line of space-separated feature names, heaviest weight first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_features = [feature_names[pos] for pos in ranked]
        print("Topic %d:" % topic_idx)
        print(" ".join(top_features))

In [304]:
# set the number of 'top words' for each topic
no_top_words = 15

In [305]:
display_topics(nmf_fit, tfidf_feature_names, no_top_words)

Topic 0:
said like saw looked people went man came told woke got didnt started room just
Topic 1:
mean does does mean mean dream does mean dream dreamt saw dream mean does know know mean vision does dream dreaming dreamed know
Topic 2:
god church spirit lord jesus holy word bless pray people holy spirit praying god bless thank vision
Topic 3:
dreams im having know dont ive remember just having dreams help dont know night time like years
Topic 4:
baby pregnant birth girl boy baby girl baby boy holding babies child hospital born little gave husband
Topic 5:
water fish swimming clear pool river boat ocean like swim coming saw beach deep big
Topic 6:
car driving road got seat driving car drive drove got car cars truck going parked dream driving parking
Topic 7:
house door room home outside went inside husband living window old dream house family came kitchen
