# Parts-Of-Speech

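In this notebook, you name a carrel (`CARREL`) and a keyword (`KEYWORD`) in the configuration cell, and the notebook then reads the carrel's text and outputs interesting features and characteristics of it, chiefly its parts-of-speech.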

In [None]:
# configuration; the values a reader is expected to change
# CARREL names the directory (under the library) whose text is analyzed,
# and KEYWORD is the string used to select sentences of interest
CARREL = "homer"
KEYWORD = "love"

# a list of additional keywords; not read by any cell visible in this notebook
KEYWORDS = ["love", "war", "man"]


In [None]:
# require
from nltk import *
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from pathlib import Path


In [None]:
# initialize; compute the location of the text to analyze
# NOTE(review): LIBRARY, ETC, and TEXT are not defined in any visible cell --
# confirm they are set (e.g. in the configuration cell) before this cell runs
library = Path(LIBRARY)
text = library.joinpath(CARREL, ETC, TEXT)


In [None]:
# read the given file, and do a bit of normalization against it
# note: text is rebound here from the file's path to the file's contents, so
# re-running this cell requires re-running the initialization cell first
with open ( text ) as handle : text = handle.read()

# turn tabs and newlines into spaces, not empty strings; deleting them
# outright fuses the last word of one line to the first word of the next,
# which corrupts the tokens and sentences computed from the text
text = text.replace( '\t', ' ' ).replace( '\n', ' ' )


In [None]:
# split the normalized text into sentences, and keep them all
sentences = sent_tokenize(text)


In [None]:
# denote a sentence by its position in the list, and output it; the list is
# indexed with S (not a repeated literal) so changing S changes the output
S = 4
sentences[ S ]


In [None]:
# narrow the list of sentences to those containing the given keyword; the
# check is a case-sensitive substring match, and the full list is replaced
sentences = [candidate for candidate in sentences if KEYWORD in candidate]

# echo each remaining sentence, followed by a blank line
for sentence in sentences:
    print(sentence, end="\n\n")
    

In [None]:
# choose one of the sentences by position, split it into tokens, and then
# label each token with its part-of-speech
S = 5
selected = word_tokenize(sentences[S])
tags = pos_tag(selected)

# display the tagged tokens
tags


In [None]:
# chunk the tagged sentence with a one-rule grammar; a noun phrase (NP) is an
# optional <DT>, followed by any number of <JJ>, followed by a single <NN>
grammar = "NP: {<DT>?<JJ>*<NN>}"

# build the parser, apply it to the tags, and print the result
parser = RegexpParser(grammar)
parse = parser.parse(tags)
print(parse)


In [None]:
# tally how often each part-of-speech label occurs in the current tags,
# starting from an empty tally
pos = {}
for word, label in tags:
    pos[label] = pos.get(label, 0) + 1

# re-order the tally from most to least frequent; the sort is stable, so
# labels with equal counts keep the order in which they were first seen
ranked = sorted(pos.items(), key=lambda pair: pair[1], reverse=True)
pos = dict(ranked)

# output the tally as a two-column, tab-delimited table
print("\t".join(("pos", "count")))
for tag, total in pos.items():
    print("\t".join((tag, str(total))))
    

In [None]:
# choose a part-of-speech label, and print every tagged word carrying exactly
# that label
P = "NN"
for word, label in tags:
    if label == P:
        print(word)
        

In [None]:
# count & tabulate all parts-of-speech from the given file (data)
tags = pos_tag( word_tokenize( text ) )

# begin with an empty tally; without this, counts left in pos by an earlier
# cell are carried into the totals, and re-running this cell would add the
# whole file's counts a second time
pos = {}

# process each tag
for tag in tags :
    
    # parse; each item is a (word, label) pair, and only the label is counted
    tag = tag[ 1 ]
    
    # update the list of pos tags
    if tag in pos : pos[ tag ] += 1
    else          : pos[ tag ] =  1

# sort the list, most frequent first; very Pythonic
pos = { key:value for key, value in sorted( pos.items(), key=lambda item:item[ 1 ], reverse=True ) }

# output as a two-column, tab-delimited table
print( "\t".join( ( 'pos', 'count') ) )
for tag in pos :
    count = str( pos[ tag ] )
    print( "\t".join( ( tag, count ) ) )
    

In [None]:
# choose a part-of-speech label, and count how often each word carrying
# exactly that label occurs in the current tags
P = "VB"
tokens = {}
for word, label in tags:

    # skip anything that is not the requested part-of-speech
    if label != P:
        continue

    # otherwise, add one to the word's count
    tokens[word] = tokens.get(word, 0) + 1

# re-order the counts from most to least frequent
tokens = dict(sorted(tokens.items(), key=lambda pair: pair[1], reverse=True))

# output the counts as a two-column, tab-delimited table
print("\t".join(("token", "count")))
for token, total in tokens.items():
    print("\t".join((token, str(total))))
            

In [None]:
# create a list of frequencies; one count per token
frequencies = list( tokens.values() )

# plot the result, but only if there is something to plot; max() and min()
# raise an error when given an empty list
if frequencies :

    # use one bin per unit of frequency, but never fewer than one; when every
    # token has the same count, max - min is 0, and hist() rejects zero bins
    bins = max( 1, max( frequencies ) - min( frequencies ) )
    plt.hist( frequencies, bins=bins )
    plt.show()

else : print( 'No frequencies to plot.' )

In [None]:
# summarize the frequencies: mean (average), variance (the sum of squared
# differences from the mean divided by the number of values), and standard
# deviation (the square root of the variance)
observations = len(frequencies)
mean = sum(frequencies) / observations
squares = [(frequency - mean) ** 2 for frequency in frequencies]
variance = sum(squares) / observations
deviation = variance ** 0.5

# output
print("mean: %f; variance: %f; deviation: %f" % (mean, variance, deviation))

In [None]:
# create a list of second tier "interesting" words; these are the tokens
# whose count is at least the mean, and at most the mean plus one standard
# deviation (rounded to an integer)

# compute the upper bound
n = round(mean + deviation)

# keep only the tokens whose count falls between the mean and the upper
# bound, inclusive; note that frequencies is rebound here to a dictionary
# mapping each kept token to its count
frequencies = {token: count for token, count in tokens.items() if mean <= count <= n}

# output
print(frequencies)

In [None]:
# initialize a word cloud with the configured width, height, and background color
# NOTE(review): WIDTH, HEIGHT, and COLOR are not defined in any visible cell --
# confirm they are set (e.g. in the configuration cell) before this cell runs
wordcloud = WordCloud(width=WIDTH, height=HEIGHT, background_color=COLOR)

# build the cloud from the token/count dictionary, and then display it
# with the axes hidden
rendering = wordcloud.generate_from_frequencies(frequencies)
plt.imshow(rendering)
plt.axis("off")
plt.show()