# Wordclouds - Rudimentary visualizations

How to create simple word clouds.

In [2]:
# pre-configure: settings most likely to change from one run to the next

# root directory of the library, and the name of the carrel (sub-directory)
# inside it whose text will be visualized
LIBRARY = '/Users/eric/Documents/reader-library'
CARREL = 'homer'

# size and background color handed to each WordCloud that gets rendered
WIDTH, HEIGHT = 1280, 960
COLOR = 'white'


In [None]:
# configure: fixed names used to locate files inside a carrel

# sub-directory of the carrel that holds the two files below
ETC = 'etc'

# file containing the text to analyze
TEXT = 'reader.txt'

# file listing the stop words, one per line
STOPWORDS = 'stopwords.txt'


In [None]:
# require
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk import *
from pathlib import Path


In [None]:
# initialize: build the full paths to the carrel's text and stop-word files;
# both names are rebound to the files' contents once the files are read
library = Path(LIBRARY)
text = library.joinpath(CARREL, ETC, TEXT)
stopwords = library.joinpath(CARREL, ETC, STOPWORDS)


In [None]:
# read text and stopwords; each name starts out as the path of a file and
# is rebound to that file's contents

# the encoding is given explicitly so decoding does not depend on the
# platform's locale default
# NOTE(review): assumes both files are UTF-8 encoded -- confirm
with open( text, encoding='utf-8' ) as handle :
    text = handle.read()

# one stop word per line; a trailing newline leaves an empty entry in the
# list, which is harmless because only alphabetic tokens are compared to it
with open( stopwords, encoding='utf-8' ) as handle :
    stopwords = handle.read().split( '\n' )


In [None]:
# create a list of normalized tokens -- alphabetic words only (punctuation
# and numbers are dropped), lower-cased, stop words removed -- and then
# count how many times each one occurs

# build the set once so each membership test below is O(1) instead of a
# scan through the whole stop-word list
stopword_set = set( stopwords )

tokens      = [ token.lower() for token in word_tokenize( text ) if token.isalpha() ]
tokens      = [ token for token in tokens if token not in stopword_set ]
frequencies = FreqDist( tokens )


In [None]:
# initialize a word cloud sized and colored per the configuration
wordcloud = WordCloud(width=WIDTH, height=HEIGHT, background_color=COLOR)

# lay the cloud out from the token frequencies, then display it without axes
image = wordcloud.generate_from_frequencies(frequencies)
plt.imshow(image)
plt.axis("off")
plt.show()


In [None]:
# denote a part-of-speech tag, and visualize all words with that tag; initialize
P    = 'NN'
tags = pos_tag( tokens )

# count every word whose tag equals P; each item of tags is a (word, tag)
# pair. The tally gets its own name so that `tokens` remains the list of
# tokens and this cell can safely be run more than once
selected = FreqDist( word for word, tag in tags if tag == P )

# initialize a word cloud, render it, and display it
wordcloud = WordCloud( width = WIDTH, height = HEIGHT, background_color = COLOR )
plt.imshow( wordcloud.generate_from_frequencies( selected ) )
plt.axis( "off" )
plt.show()


In [None]:
# tokenize the previously read text again and normalize the words
# (alphabetic only, lower-cased, stop words removed); this time keep only
# the per-word counts, not the words themselves

# build the set once so each membership test below is O(1) instead of a
# scan through the whole stop-word list
stopword_set = set( stopwords )

tokens      = [ token.lower() for token in word_tokenize( text ) if token.isalpha() ]
tokens      = [ token for token in tokens if token not in stopword_set ]
frequencies = FreqDist( tokens ).values()


In [None]:
# calculate mean (average), variance, and standard deviation of the word
# counts; the variance divides by the number of counts (population form)
observations = len(frequencies)
mean = sum(frequencies) / observations

squares = [(frequency - mean) ** 2 for frequency in frequencies]
variance = sum(squares) / observations
deviation = variance ** 0.5

# report all three values
summary = "mean: %f; variance: %f; deviation: %f" % (mean, variance, deviation)
print(summary)

In [None]:
# create a list of second-tier "interesting" words: words that occur at
# least as often as the mean, but no more often than the mean plus one
# standard deviation (rounded to a whole number)

# initialize; recount the tokens so words can be looked up with their counts
frequencies = FreqDist(tokens)
n = round(mean + deviation)

# keep each token, along with its count, when the count is in range
interesting = {
    token: count
    for token, count in frequencies.items()
    if mean <= count <= n
}

# output
print(interesting)

In [None]:
# initialize a word cloud sized and colored per the configuration
wordcloud = WordCloud(width=WIDTH, height=HEIGHT, background_color=COLOR)

# lay the cloud out from the "interesting" words and their counts...
image = wordcloud.generate_from_frequencies(interesting)

# ...and display it without axes
plt.imshow(image)
plt.axis("off")
plt.show()
