In [14]:
import pandas as pd

In [1]:
conda install -c conda-forge spacy

Retrieving notices: ...working... done
Channels:
 - conda-forge
 - defaults
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/anaconda3

  added / updated specs:
    - spacy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    catalogue-2.0.10           |  py311h267d04e_0          43 KB  conda-forge
    cloudpathlib-0.19.0        |     pyhd8ed1ab_0          40 KB  conda-forge
    confection-0.1.4           |  py311h8d5925d_0          88 KB  conda-forge
    cymem-2.0.8                |  py311ha891d26_1          45 KB  conda-forge
    cython-blis-0.7.10         |  py311hb49d859_2         610 KB  conda-forge
    langcodes-3.3.0            |     pyhd8ed1ab_0         156 KB  conda-forge
    murmurhash-1.0.10          |  py311ha891d26_1          31 KB  conda-forge
    preshed-3.0.9              |  py31

In [5]:
import spacy
nlp = spacy.load( 'en_core_web_sm' )
doc = nlp( "This is a sentence." )

In [6]:
txt = 'On October 23, Apple CEO Tim Cook Unveiled the new iPad Mini, fourth generation iPad with Retina display, new iMac, and the 13-inch MacBook Pro with Retina display.'
doc = nlp( txt )
for NE in doc.ents:
    print( NE.text, ': ', NE.label_ )

October 23 :  DATE
Apple :  ORG
Tim Cook :  PERSON
iPad :  ORG
fourth :  ORDINAL
iPad :  ORG
Retina :  PERSON
iMac :  ORG
13-inch :  QUANTITY
MacBook Pro :  PERSON
Retina :  PERSON


### Practice Problem #1

In [10]:
# The text is an excerpt from 'Of Mice and Men'

text = """ Two men, dressed in denim jackets and trousers and wearing "black, shapeless hats," walk single-file down a path near the pool. Both men carry blanket rolls — called bindles — on their shoulders. The smaller, wiry man is George Milton. Behind him is Lennie Small, a huge man with large eyes and sloping shoulders, walking at a gait that makes him resemble a huge bear.

When Lennie drops near the pool's edge and begins to drink like a hungry animal, George cautions him that the water may not be good. This advice is necessary because Lennie is retarded and doesn't realize the possible dangers. The two are on their way to a ranch where they can get temporary work, and George warns Lennie not to say anything when they arrive. Because Lennie forgets things very quickly, George must make him repeat even the simplest instructions.

Lennie also likes to pet soft things. In his pocket, he has a dead mouse which George confiscates and throws into the weeds beyond the pond. Lennie retrieves the dead mouse, and George once again catches him and gives Lennie a lecture about the trouble he causes when he wants to pet soft things (they were run out of the last town because Lennie touched a girl's soft dress, and she screamed). Lennie offers to leave and go live in a cave, causing George to soften his complaint and tell Lennie perhaps they can get him a puppy that can withstand Lennie's petting.

As they get ready to eat and sleep for the night, Lennie asks George to repeat their dream of having their own ranch where Lennie will be able to tend rabbits. George does so and then warns Lennie that, if anything bad happens, Lennie is to come back to this spot and hide in the brush. Before George falls asleep, Lennie tells him they must have many rabbits of various colors."""

#### Problem question: 

Convert this text into four separate text representations:

- character
- term
- bigram, pairs of adjacent terms
- term with appropriate part-of-speech disambiguation


You can use Python's nltk or spaCy libraries to assign part-of-speech tags to terms in a term list.

#### Characater Frequency

In [None]:
# import NLTK
import nltk
nltk.download('all')

In [None]:
nltk.download('punkt')  # Tokenizer for splitting text into words and sentences
nltk.download('stopwords')  # Common stopwords like 'the', 'is', etc.
nltk.download('averaged_perceptron_tagger')  # Part-of-speech tagger

In [11]:
# character level tokenization
from nltk.tokenize import RegexpTokenizer
char_tokenizer = RegexpTokenizer(r'.', gaps=False)

# call the function on the text block
char_tokens = char_tokenizer.tokenize(text)

print(char_tokens)

[' ', 'T', 'w', 'o', ' ', 'm', 'e', 'n', ',', ' ', 'd', 'r', 'e', 's', 's', 'e', 'd', ' ', 'i', 'n', ' ', 'd', 'e', 'n', 'i', 'm', ' ', 'j', 'a', 'c', 'k', 'e', 't', 's', ' ', 'a', 'n', 'd', ' ', 't', 'r', 'o', 'u', 's', 'e', 'r', 's', ' ', 'a', 'n', 'd', ' ', 'w', 'e', 'a', 'r', 'i', 'n', 'g', ' ', '"', 'b', 'l', 'a', 'c', 'k', ',', ' ', 's', 'h', 'a', 'p', 'e', 'l', 'e', 's', 's', ' ', 'h', 'a', 't', 's', ',', '"', ' ', 'w', 'a', 'l', 'k', ' ', 's', 'i', 'n', 'g', 'l', 'e', '-', 'f', 'i', 'l', 'e', ' ', 'd', 'o', 'w', 'n', ' ', 'a', ' ', 'p', 'a', 't', 'h', ' ', 'n', 'e', 'a', 'r', ' ', 't', 'h', 'e', ' ', 'p', 'o', 'o', 'l', '.', ' ', 'B', 'o', 't', 'h', ' ', 'm', 'e', 'n', ' ', 'c', 'a', 'r', 'r', 'y', ' ', 'b', 'l', 'a', 'n', 'k', 'e', 't', ' ', 'r', 'o', 'l', 'l', 's', ' ', '—', ' ', 'c', 'a', 'l', 'l', 'e', 'd', ' ', 'b', 'i', 'n', 'd', 'l', 'e', 's', ' ', '—', ' ', 'o', 'n', ' ', 't', 'h', 'e', 'i', 'r', ' ', 's', 'h', 'o', 'u', 'l', 'd', 'e', 'r', 's', '.', ' ', 'T', 'h', 'e',

In [13]:
char_count = {}

for char in char_tokens:
    if char not in char_count:
        char_count[char] = 0
    char_count[char] +=1

print(char_count)

{' ': 316, 'T': 4, 'w': 26, 'o': 85, 'm': 30, 'e': 214, 'n': 122, ',': 17, 'd': 50, 'r': 77, 's': 102, 'i': 91, 'j': 1, 'a': 117, 'c': 34, 'k': 15, 't': 111, 'u': 27, 'g': 40, '"': 2, 'b': 22, 'l': 51, 'h': 81, 'p': 27, '-': 1, 'f': 16, '.': 15, 'B': 4, 'y': 23, '—': 2, 'G': 10, 'M': 1, 'L': 17, 'S': 1, '\n': 6, 'W': 1, "'": 4, 'v': 12, 'z': 1, 'q': 1, 'I': 1, '(': 1, ')': 1, 'A': 1}


In [19]:
char_count_df = pd.DataFrame(list(char_count.items()), columns = ['Char', 'Count'] )
char_count_df

Unnamed: 0,Char,Count
0,,316
1,T,4
2,w,26
3,o,85
4,m,30
5,e,214
6,n,122
7,",",17
8,d,50
9,r,77


#### Word Frequency

In [22]:
from nltk.tokenize import word_tokenize

In [23]:
words = word_tokenize(text)

In [24]:
word_count = {}

for word in words:
    if word not in word_count:
        word_count[word] = 0
    word_count[word] += 1

print(word_count)

{'Two': 1, 'men': 2, ',': 17, 'dressed': 1, 'in': 3, 'denim': 1, 'jackets': 1, 'and': 15, 'trousers': 1, 'wearing': 1, '``': 1, 'black': 1, 'shapeless': 1, 'hats': 1, "''": 1, 'walk': 1, 'single-file': 1, 'down': 1, 'a': 11, 'path': 1, 'near': 2, 'the': 12, 'pool': 2, '.': 15, 'Both': 1, 'carry': 1, 'blanket': 1, 'rolls': 1, '—': 2, 'called': 1, 'bindles': 1, 'on': 2, 'their': 4, 'shoulders': 2, 'The': 2, 'smaller': 1, 'wiry': 1, 'man': 2, 'is': 5, 'George': 10, 'Milton': 1, 'Behind': 1, 'him': 7, 'Lennie': 17, 'Small': 1, 'huge': 2, 'with': 1, 'large': 1, 'eyes': 1, 'sloping': 1, 'walking': 1, 'at': 1, 'gait': 1, 'that': 4, 'makes': 1, 'resemble': 1, 'bear': 1, 'When': 1, 'drops': 1, "'s": 3, 'edge': 1, 'begins': 1, 'to': 12, 'drink': 1, 'like': 1, 'hungry': 1, 'animal': 1, 'cautions': 1, 'water': 1, 'may': 1, 'not': 2, 'be': 2, 'good': 1, 'This': 1, 'advice': 1, 'necessary': 1, 'because': 2, 'retarded': 1, 'does': 2, "n't": 1, 'realize': 1, 'possible': 1, 'dangers': 1, 'two': 1, 'are

In [25]:
word_count_df = pd.DataFrame(list(word_count.items()), columns=['word','count'])

word_count_df

Unnamed: 0,word,count
0,Two,1
1,men,2
2,",",17
3,dressed,1
4,in,3
...,...,...
191,tells,1
192,have,1
193,many,1
194,various,1


#### Bigram Frequency

In [26]:
from nltk import bigrams

bigrams_list = list(bigrams(words))

print(bigrams_list)

[('Two', 'men'), ('men', ','), (',', 'dressed'), ('dressed', 'in'), ('in', 'denim'), ('denim', 'jackets'), ('jackets', 'and'), ('and', 'trousers'), ('trousers', 'and'), ('and', 'wearing'), ('wearing', '``'), ('``', 'black'), ('black', ','), (',', 'shapeless'), ('shapeless', 'hats'), ('hats', ','), (',', "''"), ("''", 'walk'), ('walk', 'single-file'), ('single-file', 'down'), ('down', 'a'), ('a', 'path'), ('path', 'near'), ('near', 'the'), ('the', 'pool'), ('pool', '.'), ('.', 'Both'), ('Both', 'men'), ('men', 'carry'), ('carry', 'blanket'), ('blanket', 'rolls'), ('rolls', '—'), ('—', 'called'), ('called', 'bindles'), ('bindles', '—'), ('—', 'on'), ('on', 'their'), ('their', 'shoulders'), ('shoulders', '.'), ('.', 'The'), ('The', 'smaller'), ('smaller', ','), (',', 'wiry'), ('wiry', 'man'), ('man', 'is'), ('is', 'George'), ('George', 'Milton'), ('Milton', '.'), ('.', 'Behind'), ('Behind', 'him'), ('him', 'is'), ('is', 'Lennie'), ('Lennie', 'Small'), ('Small', ','), (',', 'a'), ('a', 'hu

In [28]:
bigrams_count = {}

for bigram in bigrams_list:
    if bigram not in bigrams_count:
        bigrams_count[bigram] = 0
    bigrams_count[bigram] += 1

bigrams_df = pd.DataFrame(list(bigrams_count.items()), columns = ['bigram', 'count'])

bigrams_df

Unnamed: 0,bigram,count
0,"(Two, men)",1
1,"(men, ,)",1
2,"(,, dressed)",1
3,"(dressed, in)",1
4,"(in, denim)",1
...,...,...
329,"(many, rabbits)",1
330,"(rabbits, of)",1
331,"(of, various)",1
332,"(various, colors)",1


In [32]:
bigrams_df.head(50)

Unnamed: 0,bigram,count
0,"(Two, men)",1
1,"(men, ,)",1
2,"(,, dressed)",1
3,"(dressed, in)",1
4,"(in, denim)",1
5,"(denim, jackets)",1
6,"(jackets, and)",1
7,"(and, trousers)",1
8,"(trousers, and)",1
9,"(and, wearing)",1


In [33]:
bigrams_df.sort_values(by='count', ascending=False)

Unnamed: 0,bigram,count
131,"(,, and)",3
274,"(,, Lennie)",3
159,"(., Lennie)",3
175,"(dead, mouse)",2
24,"(the, pool)",2
...,...,...
112,"(n't, realize)",1
111,"(does, n't)",1
110,"(and, does)",1
109,"(retarded, and)",1


### TF-IDF

In [34]:
# Convert term vectors into gensim dictionary

import gensim

term_vec = [
    ['far', 'far', 'better', 'thing', 'ever', 'done'],
    ['call', 'ishmael'],
    ['dagger', 'see'],
    ['o', 'happi', 'dagger']
]

dict = gensim.corpora.Dictionary( term_vec )

corp = [ ]
for i in range( 0, len( term_vec ) ):
    corp.append( dict.doc2bow( term_vec[ i ] ) )

#  Create TFIDF vectors based on term vectors bag-of-word corpora

tfidf_model = gensim.models.TfidfModel( corp )

tfidf = [ ]
for i in range( 0, len( corp ) ):
    tfidf.append( tfidf_model[ corp[ i ] ] )

#  Create pairwise document similarity index

n = len( dict )
index = gensim.similarities.SparseMatrixSimilarity( tfidf_model[ corp ], num_features = n )

#  Print TFIDF vectors and pairwise similarity per document

for i in range( 0, len( tfidf ) ):
    s = 'Doc ' + str( i + 1 ) + ' TFIDF:'
    
    for j in range( 0, len( tfidf[ i ] ) ):
        s = s + ' (' + dict.get( tfidf[ i ][ j ][ 0 ] ) + ','
        s = s + ( '%.3f' % tfidf[ i ][ j ][ 1 ] ) + ')'

    print( s )

for i in range( 0, len( corp ) ):
    print( 'Doc', ( i + 1 ), 'sim: [ ', end='' )
    
    sim = index[ tfidf_model[ corp[ i ] ] ]
    for j in range( 0, len( sim ) ):
        print( '%.3f ' % sim[ j ], end='' )

    print( ']' )



Doc 1 TFIDF: (better,0.354) (done,0.354) (ever,0.354) (far,0.707) (thing,0.354)
Doc 2 TFIDF: (call,0.707) (ishmael,0.707)
Doc 3 TFIDF: (dagger,0.447) (see,0.894)
Doc 4 TFIDF: (dagger,0.333) (happi,0.667) (o,0.667)
Doc 1 sim: [ 1.000 0.000 0.000 0.000 ]
Doc 2 sim: [ 0.000 1.000 0.000 0.000 ]
Doc 3 sim: [ 0.000 0.000 1.000 0.149 ]
Doc 4 sim: [ 0.000 0.000 0.149 1.000 ]


In [35]:
import nltk
import numpy
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer

text = [\
  "It is a far, far better thing I do, than I have every done before",\
  "Call me Ishmael",\
  "Is this a dagger I see before me?",\
  "O happy dagger"\
]

#  Remove punctuation

punc = re.compile( '[%s]' % re.escape( string.punctuation ) )
for i, doc in enumerate( text ):
    text[ i ] = punc.sub( '', doc.lower() )

#  TF-IDF vectorize documents w/sklearn, remove English stop words

vect = TfidfVectorizer( stop_words='english' )
xform = vect.fit_transform( text )

#  Grab remaining terms (keys), stem, if different replace w/stem

porter = nltk.stem.porter.PorterStemmer()
for term in list( vect.vocabulary_.keys() ):
    if term == porter.stem( term ):
        continue

    v = vect.vocabulary_[ term ]
    del vect.vocabulary_[ term ]
    vect.vocabulary_[ porter.stem( term ) ] = v

#  Get final key/value lists

key = list( vect.vocabulary_.keys() )
val = list( vect.vocabulary_.values() )

# Print out formatted TF-IDF scores per term per document

row, col = xform.nonzero()

cur_doc = 0
s = 'Doc 1 TFIDF: '

for i, c in enumerate( col ):
    term = key[ val.index( c ) ]
    tfidf = xform[ row[ i ], c ]
    
    if row[ i ] != cur_doc:        #  New document?
        print( s )                 #  Print prev doc's terms/TFIDF weights
        
        cur_doc = row[ i ]         #  Record new doc's ID
        s = 'Doc ' + str( cur_doc + 1 ) + ' TFIDF:'

    s = s + ' (' + term + ','      #  Add current term/TFIDF pair
    s = s + ( f'{tfidf:.03f}' + ')' )

print( s )                         #  Print final doc's terms/TFIDF weights

# Print document similarity matrix

dense = xform.todense()

for i in range( len( dense ) ):
    s = 'Doc ' + str( i + 1 ) + ' sim: '
    x = dense[ i ].tolist()[ 0 ]
    
    s = s + '[  '
    for j in range( len( dense ) ):
        y = dense[ j ].tolist()[ 0 ]
        prod = numpy.multiply( x, y ).sum()
        s = s + f'{prod:.03f}' + '  '
    print( s + ']' )



Doc 1 TFIDF:  (thing,0.408) (better,0.408) (far,0.816)
Doc 2 TFIDF: (ishmael,1.000)
Doc 3 TFIDF: (dagger,1.000)
Doc 4 TFIDF: (happi,0.785) (dagger,0.619)
Doc 1 sim: [  1.000  0.000  0.000  0.000  ]
Doc 2 sim: [  0.000  1.000  0.000  0.000  ]
Doc 3 sim: [  0.000  0.000  1.000  0.619  ]
Doc 4 sim: [  0.000  0.000  0.619  1.000  ]


### Document Probability Matrices with LDA

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def display_doc_2_topic(doc_2_topic, collect):
    for i in range(0, len(collect)):
        topic_wt = list(doc_2_topic[i])
        idx = topic_wt.index(max(topic_wt))
        
        print(collect[i] + ":")
        print(f"  Concept {idx}, {topic_wt[ idx ] * 100.0:.02f}%")

def display_topics(model, feat_nm, top_word_n):
    for i, topic in enumerate(model.components_):
        print(f"Concept {i}:")
        topic_len = sum(topic)
        
        term = " ".join(
            [
                f"{feat_nm[i]} ({topic[i] / topic_len * 100.0:.02f}%); "
                for i in topic.argsort()[: -top_word_n - 1 : -1]
            ]
        )
        print("   " + term)

#  Mainline

collection = [
    "Romeo and Juliet",
    "Juliet, O happy dagger!",
    "Romeo died by a dagger",
    "'Live free or die', that's the New Hampshire motto",
    "Did you know that New Hampshire is in New England?",
]

feat_n = 10

#  Raw term counts for LDA

tf_vectorizer = CountVectorizer(
    max_df=0.95, min_df=2, max_features=feat_n, stop_words="english"
)
tf = tf_vectorizer.fit_transform(collection)
tf_feat_nm = tf_vectorizer.get_feature_names_out()

topic_n = 5
lda = LatentDirichletAllocation(
    n_components=topic_n,
    max_iter=5,
    learning_method="online",
    learning_offset=50.0,
    random_state=0,
)

lda_topic = lda.fit(tf)
doc_2_topic = lda.transform(tf)

top_word_n = 10
display_topics(lda, tf_feat_nm, top_word_n)

print()
display_doc_2_topic(doc_2_topic, collection)

Concept 0:
   juliet (28.13%);  dagger (22.75%);  romeo (19.99%);  hampshire (15.51%);  new (13.63%); 
Concept 1:
   new (29.75%);  hampshire (25.04%);  dagger (15.24%);  romeo (15.14%);  juliet (14.84%); 
Concept 2:
   romeo (26.75%);  dagger (25.34%);  new (18.02%);  hampshire (16.64%);  juliet (13.26%); 
Concept 3:
   hampshire (24.08%);  juliet (22.70%);  new (19.50%);  dagger (17.29%);  romeo (16.43%); 
Concept 4:
   dagger (22.53%);  hampshire (22.47%);  romeo (19.19%);  juliet (18.22%);  new (17.59%); 

Romeo and Juliet:
  Concept 0, 72.77%
Juliet, O happy dagger!:
  Concept 0, 72.83%
Romeo died by a dagger:
  Concept 2, 72.83%
'Live free or die', that's the New Hampshire motto:
  Concept 1, 72.91%
Did you know that New Hampshire is in New England?:
  Concept 1, 79.72%


### Word Embeddings

In [37]:
import spacy

nlp = spacy.load( 'en_core_web_md' )
doc = nlp( "wolverine" )

print( doc[ 0 ].text )
print( doc[ 0 ].vector )


wolverine
[-2.6735e+00  1.1196e+00 -3.3309e+00 -2.2347e+00  1.7461e+00  2.0495e+00
 -2.9675e+00 -1.0149e-01  2.0264e+00 -4.3162e-01  2.5480e+00 -1.1983e+00
 -1.7306e+00  3.9947e+00  7.5663e-01 -2.2848e+00  2.0478e+00 -1.6955e+00
 -5.8163e-01 -3.2606e+00  5.8465e-01  2.4206e+00 -1.4075e+00 -2.6414e+00
  5.6743e-01  1.3794e+00 -9.5934e-01 -2.0380e+00  5.9991e-01  7.2090e-01
 -8.3317e-01 -2.8623e+00 -6.4841e-01 -3.3744e+00 -4.4572e-02 -3.9805e+00
  1.0150e+00  1.6878e+00  1.7412e+00  9.1148e-01  2.0837e+00  3.0709e+00
  1.7426e+00  2.1406e+00 -2.1008e+00 -7.8994e-02  3.1308e+00 -2.4934e+00
  1.6135e+00 -6.5501e-01 -9.9574e-02  6.4580e+00  1.0512e+00 -1.0394e+00
 -3.4681e+00 -1.2427e+00 -9.3425e-01  1.6353e-01  2.9157e+00 -1.3834e+00
 -1.4721e+00 -2.3015e+00  7.9756e-03 -4.2218e-01 -9.7589e-01  2.0461e+00
 -2.5379e+00 -1.1676e+00  1.5319e+00 -2.5456e+00  4.1196e+00  1.3749e+00
  5.3369e-01  9.1249e-01 -1.4115e-01 -1.8317e+00 -4.2549e+00 -3.0651e-01
 -1.1053e+00 -2.1728e+00 -1.9436e+00 -1.9

### Named Entity Recognition


Text Preprocessing. Prepare the text for NER using standard text preprocessing operations like tokenization and part-of-speech tagging.

Entity Identification. Identify entities in the text.

Entity Classification. Classify identified entities.

Contextual Analysis. Use context to verify and correct entity classification, for example, does "apple" refer to a company or a fruit? Context may clarify this ambiguity.

In [38]:
import nltk
import spacy

#  Specify text to NER on

ex = """Deepak Jasani, Head of retal research, HDFC Securities, said: "Investors will look to the European CentralBank later Thursday for resassurance 
that surging prices are just transitory, and not about to spiral out of control. In addition to the ECB policy meeting, investors are awaiting a report 
later Thursday on US economic growth, which is likely to show a cooling recovery, as well as weekly jobs data."."""

#  spaCy
#  NLP process sentence, extract named entities from result

nlp = spacy.load("en_core_web_sm")
doc = nlp(ex)

print("spaCy:")
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}; ", end="")
print( "\n" )

#  NLTK
#  Tokenize and POS-tag tokens

tok = nltk.tokenize.word_tokenize(ex)
term_POS = nltk.tag.pos_tag(tok)

#  Convert POS-tagged tokens into named entities

NE_tree = nltk.ne_chunk(term_POS)

print("NLTK:")
for ent in NE_tree:
    if type(ent) == tuple:
        continue
    print(f"{ent[0][0]}: {ent._label}; ", end="")
print( "\n" )

spaCy:
Deepak Jasani: PERSON; HDFC Securities: ORG; the European CentralBank: ORG; later Thursday: DATE; ECB: ORG; later Thursday: DATE; US: GPE; weekly: DATE; 

NLTK:
Deepak: PERSON; Jasani: ORGANIZATION; HDFC: ORGANIZATION; European: ORGANIZATION; ECB: ORGANIZATION; US: GSP; 



### Sentiment

In [39]:
from sentiment_module import sentiment

term = 'happy'
sentiment.exist( term )
sentiment.sentiment( term )

{'valence': 8.21, 'arousal': 6.49}

In [42]:
from sentiment_module import sentiment

term = 'popsicle'
sentiment.exist( term )
term = 'enraged'
sentiment.exist( term )
sentiment.sentiment( term )


{'valence': 2.46, 'arousal': 7.97}

In [43]:

term_list = "it was the best of times it was the worst of times".split()
print(term_list)
sentiment.exist( term_list )
sentiment.sentiment( term_list )


['it', 'was', 'the', 'best', 'of', 'times', 'it', 'was', 'the', 'worst', 'of', 'times']


{'valence': 5.0307617694606375, 'arousal': 4.939546556471719}

In [44]:

term_list = [ 'brocolli', 'carrot', 'pea' ]
sentiment.exist( term_list )
sentiment.sentiment( term_list )
sentiment.describe( 'interesting' )
sentiment.describe( 'pensive' )
sentiment.describe( [ 'quick', 'brown', 'fox', 'jumps', 'lazy', 'dog' ] )
sentiment.describe( 1, 9 )

'very nervous'

### Sentiment Analysis at the Sentence Level

In [45]:
import re
import nltk

nltk.download( 'vader_lexicon' )
from nltk.sentiment.vader import SentimentIntensityAnalyzer

txt = 'Two men, dressed in denim jackets and trousers and wearing "black, shapeless hats," walk single-file down a path near the pool. Both men carry blanket rolls  called bindles  on their shoulders. The smaller, wiry man is George Milton. Behind him is Lennie Small, a huge man with large eyes and sloping shoulders, walking at a gait that makes him resemble a huge bear. When Lennie drops near the pool\'s edge and begins to drink like a hungry animal, George cautions him that the water may not be good. This advice is necessary because Lennie is retarded and doesn\'t realize the possible dangers. The two are on their way to a ranch where they can get temporary work, and George warns Lennie not to say anything when they arrive. Because Lennie forgets things very quickly, George must make him repeat even the simplest instructions. Lennie also likes to pet soft things. In his pocket, he has a dead mouse which George confiscates and throws into the weeds beyond the pond. Lennie retrieves the dead mouse, and George once again catches him and gives Lennie a lecture about the trouble he causes when he wants to pet soft things (they were run out of the last town because Lennie touched a girl\'s soft dress, and she screamed). Lennie offers to leave and go live in a cave, causing George to soften his complaint and tell Lennie perhaps they can get him a puppy that can withstand Lennie\'s petting. As they get ready to eat and sleep for the night, Lennie asks George to repeat their dream of having their own ranch where Lennie will be able to tend rabbits. George does so and then warns Lennie that, if anything bad happens, Lennie is to come back to this spot and hide in the brush. Before George falls asleep, Lennie tells him they must have many rabbits of various colors.'

#  Convert to sentences, create VADER sentiment analyzer

sentence = txt.split( '.' )
sentiment = SentimentIntensityAnalyzer()

for i in range( 0, len( sentence ) ):

    # Print sentence's compound sentiment score
    
    score = sentiment.polarity_scores( sentence[ i ] )
    print( sentence[ i ] )
    print( 'Sentiment:', score[ 'compound' ] )



Two men, dressed in denim jackets and trousers and wearing "black, shapeless hats," walk single-file down a path near the pool
Sentiment: 0.0
 Both men carry blanket rolls  called bindles  on their shoulders
Sentiment: 0.0
 The smaller, wiry man is George Milton
Sentiment: 0.0
 Behind him is Lennie Small, a huge man with large eyes and sloping shoulders, walking at a gait that makes him resemble a huge bear
Sentiment: 0.5574
 When Lennie drops near the pool's edge and begins to drink like a hungry animal, George cautions him that the water may not be good
Sentiment: 0.0243
 This advice is necessary because Lennie is retarded and doesn't realize the possible dangers
Sentiment: -0.7845
 The two are on their way to a ranch where they can get temporary work, and George warns Lennie not to say anything when they arrive
Sentiment: -0.1027
 Because Lennie forgets things very quickly, George must make him repeat even the simplest instructions
Sentiment: 0.0
 Lennie also likes to pet soft thing

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/frederik.lindsey/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
