# Rudimentary analysis: Using the NLTK to do some simple "reading"

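In this notebook, you configure the location of a text (a carrel within a library directory) along with one or more keywords, and the notebook outputs interesting features & characteristics of the text: collocations, keyword counts and relative weights, concordances, simple pattern matches, a dispersion plot, and similar words.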

In [None]:
# pre-configure; these are the values a reader is most likely to change
LIBRARY = "/Users/eric/Documents/reader-library"  # root directory; the text is read from LIBRARY/CARREL/ETC/TEXT
CARREL = "homer"  # name of the subdirectory of LIBRARY to analyze
KEYWORD = "love"  # the word used by the single-keyword cells (count, percentage, concordance)
KEYWORDS = ["love", "man", "war"]  # the words used by the multi-keyword cells



In [None]:
# configure; define the names of the directory and files found inside a carrel
ETC = "etc"  # subdirectory of the carrel that contains TEXT
TEXT = "reader.txt"  # name of the plain-text file to read
STOPWORDS = "stopwords.txt"  # presumably a stop word list; not referenced by the cells shown in this notebook


In [None]:
# require
from nltk import *
from pathlib import Path


In [None]:
# read the given file, tokenize it, normalize it, and create an NLTK "Text object" from the result;
# this cell defines `tokens` (a list of lower-cased strings) and `text` (the Text object) for the cells below
library     = Path( LIBRARY )
reader_file = library/CARREL/ETC/TEXT

# decode explicitly so the result does not depend on the platform's default encoding
# NOTE(review): assumes the file is UTF-8 encoded -- confirm
with open( reader_file, encoding='utf-8' ) as handle : raw_text = handle.read()

# tokenize, and lower-case every token; consequently, keywords must be given in lower case in order to match
tokens = word_tokenize( raw_text )
tokens = [ token.lower() for token in tokens ]
text   = Text( tokens )


In [None]:
# collocations; output the word pairings NLTK identifies as collocations in the text
text.collocations()

In [None]:
# count; how many times does the keyword occur among the (lower-cased) tokens of the text?
text.count( KEYWORD )

In [None]:
# compute relative weight percentage of a word (count/total words); to what degree is this word "significant"?
count = text.count( KEYWORD )
total = len( tokens )

# a text with no tokens has nothing to weigh; report zero instead of dividing by zero
percentage = 100 * count / total if total else 0.0
print( percentage )

In [None]:
# compare the relative weights (percentages) of many words; begin by (re-)initializing
total       = len( tokens )
percentages = {}

# process each of the given keywords
for keyword in KEYWORDS :

    # calculate; a text with no tokens weighs every keyword at zero instead of dividing by zero
    count      = text.count( keyword )
    percentage = 100 * count / total if total else 0.0

    # update the dictionary of weights
    percentages[ keyword ] = percentage

# sort the weights by value, largest first; a dictionary preserves the order in which its items are inserted
percentages = dict( sorted( percentages.items(), key=lambda item : item[ 1 ], reverse=True ) )

# output a tab-delimited table, one keyword per row
print( "\t".join( ( 'keyword', 'percentage' ) ) )
for keyword, percentage in percentages.items() :
    print( "\t".join( ( keyword, str( percentage ) ) ) )


In [None]:
# concordance; output occurrences of the keyword along with the words surrounding each one
text.concordance( KEYWORD )


In [None]:
# simple pattern matching; for each keyword, in alphabetical order, search the text for
# the three-token sequence "<keyword> is <any token>", and separate each set of results with a blank line
for keyword in sorted( KEYWORDS ) :
    pattern = '<%s> <is> <.*>' % keyword
    text.findall( pattern )
    print()


In [None]:
# dispersion plot; where do the keywords appear across the length of the text?
text.dispersion_plot( KEYWORDS )


In [None]:
# similar words; for each keyword, in alphabetical order, output the keyword itself,
# then the words NLTK reports as similar to it, and finally a blank line as a separator
for keyword in sorted( KEYWORDS ) :
    print( "%s" % keyword )
    text.similar( keyword )
    print()
