In [30]:
import csv
import re
import glob
from collections import defaultdict 
from collections import OrderedDict as od
import math
from collections import Counter

## Examining N-Grams

Now that I know I can build a somewhat reasonable set of words, I'll ignore any words which have too low of an arousal log frequency (~3 for starters). Then I'll take a 5 gram around the word (2 either side) and track the frequency of each.

This had to get a little more complicated than hoped. It's not particularly easy to go to the next line. Instead, I store a copy of the previous line and any words that overflow and check in the next iteration.

In [101]:
f = 'dracula.txt'
samples.append(defaultdict(float))
ngrams = defaultdict(list)
wdracula = samples[-1]
n = 4 # either side of the word

# Load it in and collect frequency for each word
with open(f, mode='r') as infile:
    for line in infile:
        line = re.sub('[.,?!#$\'\"\(\)\d_;]', '', line)
        for word in line.split():
            word = word.lower()
            if word in warriner:
                wdracula[word] += 1

# Load it in again. If the word is viable, collect an N-Gram and store it

with open(f, mode='r') as infile:
    prev_line = ""
    word_cap = []
    for line in infile:
        line = re.sub('[.,?!#$\'\"\(\)\d_;]', '', line)
        line_split = line.split()
        
        while len(word_cap) > 0:
            word = word_cap.pop(0)
            ind = word_cap.pop(0)
            if ind < len(line_split):
                if line_split[ind] in warriner:
                    ngrams[word].append(line_split[ind])
        for i, word in enumerate(line_split):
            word = word.lower()
            if word in wdracula:
                if warriner[word]['arousal'] > 3: 
                    for j in range(1, n+1):
                        if i-j >= 0:
                            if line_split[i-j] in warriner and line_split[i-j] != word:
                                ngrams[word].append(line_split[i-j])
                        else:
                            if abs(i-j) < len(prev_line):
                                if prev_line != "" and prev_line[i-j] in warriner and prev_line[i-j] != word:
                                    ngrams[word].append(prev_line[i-j])
                        
                        if i+j < len(line_split):
                            if line_split[i+j] in warriner:
                                ngrams[word].append(line_split[i+j])
                        else:
                            word_cap.append(word)
                            word_cap.append(j-1)
        prev_line = line_split
                
wdracula['_valence'] = 0
wdracula['_dominance'] = 0
wdracula['_arousal'] = 0
count = 0
for word in wdracula:
    if word in warriner:
        count += wdracula[word]
        wdracula['_valence'] += warriner[word]['valence'] * wdracula[word]
        wdracula['_dominance'] += warriner[word]['dominance'] * wdracula[word]
        wdracula['_arousal'] += warriner[word]['arousal'] * wdracula[word]
wdracula['_valence'] /= count
wdracula['_dominance'] /= count
wdracula['_arousal'] /= count

In [102]:
ngram_table = {}
for word in ngrams:
    ngram_table[word] = Counter(ngrams[word])

Now we have a list of occurrences around the words we'd like and can look that up with something like `ngram_table['blood'].most_common()`.

Now what if we do our log frequency again and find the most arousing words, using the commonality as the frequency?

In [104]:
blood_common = ngram_table['blood'].most_common()
blood_common = sorted([(warriner[word[0]]['arousal'], word[0], word[1])  for word in blood_common], reverse=True)
blood_common

[(6.74, 'scream', 1),
 (6.44, 'lust', 1),
 (6.27, 'pain', 1),
 (6.01, 'poison', 2),
 (5.76, 'blood', 2),
 (5.71, 'operation', 2),
 (5.6, 'suck', 2),
 (5.59, 'trouble', 1),
 (5.59, 'mad', 1),
 (5.59, 'life', 5),
 (5.55, 'sheer', 1),
 (5.53, 'death', 1),
 (5.48, 'dance', 1),
 (5.43, 'lose', 2),
 (5.43, 'dog', 1),
 (5.4, 'shame', 1),
 (5.39, 'breast', 1),
 (5.29, 'want', 1),
 (5.24, 'smell', 1),
 (5.24, 'run', 2),
 (5.23, 'girl', 2),
 (5.2, 'loss', 3),
 (5.19, 'drink', 2),
 (5.14, 'strong', 2),
 (5.14, 'new', 1),
 (5.11, 'lost', 2),
 (5.09, 'children', 1),
 (5.07, 'heart', 2),
 (5.02, 'red', 1),
 (5.0, 'bright', 1),
 (4.95, 'brave', 2),
 (4.87, 'bloom', 3),
 (4.86, 'go', 1),
 (4.86, 'bad', 1),
 (4.82, 'present', 1),
 (4.81, 'mean', 1),
 (4.71, 'live', 1),
 (4.67, 'poor', 1),
 (4.67, 'drop', 3),
 (4.62, 'body', 1),
 (4.6, 'opening', 1),
 (4.59, 'face', 1),
 (4.57, 'give', 3),
 (4.52, 'take', 1),
 (4.52, 'pungent', 1),
 (4.52, 'crimson', 1),
 (4.5, 'thin', 3),
 (4.45, 'transfusion', 5),
 (4

In [77]:
prop = 'arousal'

[(w, wdracula[w], -1 if w not in warriner else warriner[w][prop]) for w in sorted(wdracula, key=lambda d: -1 if d not in warriner else warriner[d][prop], reverse=True)]

[('insanity', 2.0, 7.79),
 ('gun', 3.0, 7.74),
 ('sex', 1.0, 7.6),
 ('lover', 10.0, 7.45),
 ('aroused', 4.0, 7.3),
 ('thrill', 3.0, 7.19),
 ('scare', 2.0, 7.1),
 ('attack', 15.0, 7.05),
 ('intensity', 4.0, 7.05),
 ('exciting', 2.0, 6.95),
 ('spider', 5.0, 6.91),
 ('die', 43.0, 6.9),
 ('mortality', 1.0, 6.9),
 ('money', 19.0, 6.86),
 ('arrest', 1.0, 6.86),
 ('alarm', 3.0, 6.85),
 ('rich', 3.0, 6.81),
 ('kill', 16.0, 6.81),
 ('dangerous', 10.0, 6.81),
 ('tragedy', 3.0, 6.8),
 ('speed', 12.0, 6.8),
 ('pleasure', 16.0, 6.8),
 ('fatal', 6.0, 6.79),
 ('earthquake', 1.0, 6.76),
 ('lightning', 4.0, 6.75),
 ('ablaze', 1.0, 6.75),
 ('scream', 9.0, 6.74),
 ('hostility', 1.0, 6.74),
 ('celebrate', 1.0, 6.73),
 ('succeed', 3.0, 6.71),
 ('argue', 6.0, 6.67),
 ('adventurous', 1.0, 6.67),
 ('plague', 1.0, 6.67),
 ('rage', 7.0, 6.62),
 ('gruesome', 3.0, 6.62),
 ('laugh', 27.0, 6.62),
 ('excite', 4.0, 6.61),
 ('flame', 18.0, 6.6),
 ('chase', 1.0, 6.6),
 ('intense', 6.0, 6.6),
 ('surprise', 21.0, 6.57),


In [79]:
['insanity']

NameError: name 'ngrams_table' is not defined

## General Approach

Just taking the previous code and generalizing it a bit more to put in arbitrary works.

In [139]:
def get_breakdown(n, prop, min_thresh, filename):
    f = filename
    ngrams = defaultdict(list)
    text = defaultdict(float)

    # Load it in and collect frequency for each word
    with open(f, mode='r') as infile:
        for line in infile:
            line = re.sub('[.,?!#$\'\"\(\)\d_;]', '', line)
            for word in line.split():
                word = word.lower()
                if word in warriner:
                    text[word] += 1

    # Load it in again. If the word is viable, collect an N-Gram and store it

    with open(f, mode='r') as infile:
        prev_line = ""
        word_cap = []
        for line in infile:
            line = re.sub('[.,?!#$\'\"\(\)\d_;]', '', line)
            line_split = line.split()

            while len(word_cap) > 0:
                word = word_cap.pop(0)
                ind = word_cap.pop(0)
                if ind < len(line_split):
                    if line_split[ind] in warriner and line_split[ind] != word:
                        ngrams[word].append(line_split[ind])
            for i, word in enumerate(line_split):
                word = word.lower()
                if word in text:
                    if warriner[word][prop] > min_thresh: 
                        for j in range(1, n+1):
                            if i-j >= 0:
                                if line_split[i-j] in warriner and line_split[i-j] != word:
                                    ngrams[word].append(line_split[i-j])
                            else:
                                if abs(i-j) < len(prev_line):
                                    if prev_line != "" and prev_line[i-j] in warriner and prev_line[i-j] != word:
                                        ngrams[word].append(prev_line[i-j])

                            if i+j < len(line_split):
                                if line_split[i+j] in warriner:
                                    ngrams[word].append(line_split[i+j])
                            else:
                                word_cap.append(word)
                                word_cap.append(j-1)
            prev_line = line_split

    text['_valence'] = 0
    text['_dominance'] = 0
    text['_arousal'] = 0
    count = 0
    for word in text:
        if word in warriner:
            count += text[word]
            text['_valence'] += warriner[word]['valence'] * text[word]
            text['_dominance'] += warriner[word]['dominance'] * text[word]
            text['_arousal'] += warriner[word]['arousal'] * text[word]
    text['_valence'] /= count
    text['_dominance'] /= count
    text['_arousal'] /= count

    ngram_table = {}
    for word in ngrams:
        ngram_table[word] = Counter(ngrams[word])
        
    return (text, ngram_table)

In [133]:
get_breakdown(2, 'arousal', 3, 'dracula.txt')[0]['blood']

102.0

In [141]:
get_breakdown(2, 'arousal', 3, 'dracula.txt')[1]['blood'].most_common()

[('transfusion', 4),
 ('fresh', 4),
 ('bloom', 3),
 ('cold', 3),
 ('drop', 3),
 ('give', 2),
 ('four', 2),
 ('life', 2),
 ('run', 2),
 ('suck', 2),
 ('pool', 2),
 ('must', 2),
 ('loss', 2),
 ('baptism', 2),
 ('lost', 2),
 ('lose', 2),
 ('draw', 1),
 ('brave', 1),
 ('have', 1),
 ('stream', 1),
 ('dance', 1),
 ('side', 1),
 ('back', 1),
 ('lust', 1),
 ('want', 1),
 ('remembrance', 1),
 ('pain', 1),
 ('eyes', 1),
 ('children', 1),
 ('lead', 1),
 ('trance', 1),
 ('kin', 1),
 ('flesh', 1),
 ('poison', 1),
 ('take', 1),
 ('pure', 1),
 ('waste', 1),
 ('smell', 1),
 ('red', 1),
 ('time', 1),
 ('lay', 1),
 ('clot', 1),
 ('come', 1),
 ('dog', 1),
 ('remove', 1),
 ('keep', 1),
 ('mean', 1),
 ('make', 1),
 ('blood', 1)]

In [146]:
sorted([(w, warriner[w]['arousal']) for w in get_breakdown(2, 'arousal', 3, 'dracula.txt')[1]['blood']], key=lambda d: d[1], reverse=True)

[('lust', 6.44),
 ('pain', 6.27),
 ('poison', 6.01),
 ('blood', 5.76),
 ('suck', 5.6),
 ('life', 5.59),
 ('dance', 5.48),
 ('dog', 5.43),
 ('lose', 5.43),
 ('want', 5.29),
 ('smell', 5.24),
 ('run', 5.24),
 ('loss', 5.2),
 ('lost', 5.11),
 ('children', 5.09),
 ('red', 5.02),
 ('brave', 4.95),
 ('bloom', 4.87),
 ('mean', 4.81),
 ('drop', 4.67),
 ('give', 4.57),
 ('take', 4.52),
 ('transfusion', 4.45),
 ('stream', 4.35),
 ('remembrance', 4.35),
 ('clot', 4.17),
 ('flesh', 4.11),
 ('must', 4.1),
 ('pure', 4.05),
 ('waste', 4.04),
 ('trance', 4.0),
 ('baptism', 3.96),
 ('lead', 3.95),
 ('lay', 3.7),
 ('make', 3.67),
 ('pool', 3.65),
 ('draw', 3.6),
 ('come', 3.57),
 ('cold', 3.55),
 ('have', 3.52),
 ('keep', 3.43),
 ('time', 3.41),
 ('four', 3.39),
 ('eyes', 3.18),
 ('side', 3.14),
 ('remove', 3.11),
 ('kin', 3.0),
 ('back', 2.59),
 ('fresh', 2.35)]

## Simple Interface, Examples

Below is a very simplified interface for playing with these various things and playing with the learned concepts

In [186]:
def top_list(breakdown, prop):
    return sorted([(warriner[w][prop], w, breakdown[0][w]) for w in breakdown[0] if w in warriner], reverse=True)
def related(breakdown, prop, keyword):
    if keyword in breakdown[1]:
        return sorted([(w, warriner[w][prop]) for w in breakdown[1][keyword]], key=lambda d: d[1], reverse=True)
    else:
        return []

In [202]:
titles = ['holmes', 'alice', 'dracula', 'pride', 'raven']
texts = {}
for title in titles:
    texts[title] = get_breakdown(2, 'arousal', 1, title + ".txt")

In [203]:
search = 'sad'
print 'Keyword: ', search

for text in texts:
    print text, '\n', related(texts[text], 'arousal', search), '\n'

Keyword:  sad
holmes 
[('tragedy', 6.8), ('anxious', 6.2), ('news', 4.61), ('look', 3.76)] 

dracula 
[('accident', 4.93), ('broken', 4.86), ('news', 4.61), ('world', 4.55), ('blow', 4.48), ('terrible', 4.39), ('little', 4.2), ('case', 4.14), ('feel', 4.05), ('distress', 4.0), ('toll', 3.95), ('truth', 3.88), ('experience', 3.71), ('be', 3.43), ('hour', 3.38), ('humble', 3.18), ('will', 2.9), ('slow', 2.89), ('accept', 2.8)] 

pride 
[('affair', 5.4), ('omen', 4.52), ('business', 3.71), ('cousin', 2.6)] 

alice 
[('lonely', 4.37), ('distance', 3.81)] 

raven 
[('fancy', 5.42), ('uncertain', 4.45), ('silken', 2.7)] 



In [204]:
search = 'anger'
print 'Keyword: ', search

for text in texts:
    print text, '\n', related(texts[text], 'arousal', search), '\n'

Keyword:  anger
holmes 
[('surprise', 6.57), ('fear', 6.14), ('burst', 5.09), ('fit', 4.35), ('would', 3.81)] 

dracula 
[('arouse', 6.21), ('sheer', 5.55), ('hellish', 4.48), ('stamp', 3.45)] 

pride 
[('compassion', 4.5), ('treatment', 4.47), ('think', 3.75), ('be', 3.43), ('pale', 3.18)] 

alice 
[] 

raven 
[] 



In [211]:
print '\n Valence\n----------\n'

for text in texts:
   print text, ': ', texts[text][0]['_valence']

print '\n Dominance\n----------\n'

for text in texts:
   print text, ': ', texts[text][0]['_dominance']

print '\n Arousal\n----------\n'

for text in texts:
   print text, ': ', texts[text][0]['_arousal']


 Valence
----------

holmes :  5.67676919194
dracula :  5.67917595918
pride :  5.88645457006
alice :  5.76507903056
raven :  5.36217712177

 Dominance
----------

holmes :  5.5983186096
dracula :  5.55997188622
pride :  5.72432852987
alice :  5.6260424507
raven :  5.42151291513

 Arousal
----------

holmes :  3.93258284354
dracula :  3.9690989416
pride :  3.99732765036
alice :  3.91886045461
raven :  4.02664206642
