This notebook is designed to reproduce several findings from Emily Thornbury's<br>
chapter "The Poet Alone" in her book "Becoming a Poet in Anglo-Saxon England."<br>
<br>
In particular, see Fig 4.5 on page 170

## Python for Natural Language Processing
<li>Strings</li>
<li>Lists</li>
<li>String Methods</li>
<li>List Comprehension</li>

## NLP for Literary Study
<li>Word Frequencies</li>
<li>Visualization</li>
<li>Ad Hoc Stylometry</li>

# Building Intuition

In [None]:
# (don't worry about understanding everything here)
# Print every word in the lecture notes that ends in "ing".
# The `with` block closes the file as soon as the loop is finished.
with open('lecture notes 09-22-15.txt') as notes:
    for line in notes:
        for word in line.split():
            if word.endswith('ing'):
                print(word)

# Strings

In [None]:
"Hello, World!"

In [None]:
type("Hello, World!")

In [None]:
a = "Hello"
b = 'World'

In [None]:
print(a+b)
print(a*5)

In [None]:
#EX. Predict what will happen when we multiply two strings
a*b

# Lists

In [None]:
['Call', 'me', 'Ishmael']

In [None]:
type(['Call', 'me', 'Ishmael'])

In [None]:
list1 = ['Call', 'me', 'Ishmael']
list2 = ['In', 'the', 'beginning']
list3 = [1,3,5,7]

In [None]:
#EX. Predict what will happen when we perform the following operations
print(list1+list2)
print(list1*5)

In [None]:
sum(list3)

In [None]:
len(list1)

In [None]:
print(list1[0])
print(type(list1[0]))

In [None]:
print(list1[:2])
print(type(list1[:2]))

In [None]:
# EX. Concatenate list1 and list2 into a single list.
# Retrieve the third element from the combined list.
# Retrieve the fourth through sixth elements from the combined list.

# String Methods

In [None]:
greeting = "Hello, World!"
print(greeting.split())

print(greeting.startswith('H'))
print(greeting.endswith('d'))
print()
print(greeting.isalpha())
print(greeting.isdigit())
print()
print(greeting.islower())
print(greeting.isupper())
print(greeting.istitle())

In [None]:
print(greeting.lower())
print(greeting.upper())
print(greeting.title())

In [None]:
greeting

In [None]:
greeting = greeting.lower()
print(greeting)

In [None]:
len(greeting)

In [None]:
greeting[:5]

In [None]:
# EX. Return the second through eighth characters in greeting
# Challenge: Return the characters from the first half of greeting

# List Comprehension

In [None]:
[word for word in list1 if word.istitle()]

In [None]:
# EX. Concatenate list1 and list2 into a single list.
# Create a new list that contains only the words whose last letter is "e"
# Create a new list that contains the first letter of each word.
# Challenge: Create a new list that contains only words longer than two letters.

# Word Frequencies

In [None]:
with open('Melville - Moby Dick.txt', 'r') as file_in:
    moby_string = file_in.read()

In [None]:
moby_tokens = moby_string.split()

In [None]:
moby_tokens

In [None]:
# What might we do to clean up?

In [None]:
import string
moby_string = moby_string.lower()
moby_tokens = "".join([char for char in moby_string if char not in string.punctuation]).split()

In [None]:
moby_tokens

In [None]:
from collections import Counter
moby_dict = Counter(moby_tokens)

In [None]:
moby_dict

In [None]:
moby_dict.keys()

In [None]:
moby_dict.values()

In [None]:
moby_dict.most_common()

In [None]:
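# EX. A common measure of lexical diversity for a given text is its Type-Token Ratio:
# the number of distinct words (types) divided by the total number of words (tokens).
# (Its inverse is the average number of times each distinct word in a text gets used.)
# Calculate the Type-Token Ratio for Moby Dick.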

# Visualization

In [None]:
%pylab inline

In [None]:
# most_common() yields (word, count) pairs sorted from most to least frequent;
# split those pairs into two parallel lists.
common_keys = [word for word, count in moby_dict.most_common()]
common_values = [count for word, count in moby_dict.most_common()]

In [None]:
print(common_keys[:10])
print(common_values[:10])

In [None]:
# Total number of tokens in the text: the sum of every word's count.
word_count = sum(moby_dict.values())
# Each word's share of all tokens, ordered from most to least frequent.
normed_values = [count/word_count for word, count in moby_dict.most_common()]

In [None]:
figure(figsize = (10,10))
xticks(range(50), common_keys[:50], rotation='vertical')
plot(normed_values[:50])

In [None]:
cumulative_values = np.cumsum(normed_values)

In [None]:
figure(figsize = (10,10))
xticks(range(50), common_keys[:50], rotation='vertical')
plot(cumulative_values[:50])

In [None]:
# EX. Transform the script below into a list of words, then plot their frequencies.
# Note: A backslash at the end of a line allows a string to continue unbroken onto the next

In [None]:
script = "Man: Well, what've you got? Waitress: Well, there's egg and bacon; egg sausage and bacon; \
egg and spam; egg bacon and spam; egg bacon sausage and spam; spam bacon sausage and spam; \
spam egg spam spam bacon and spam; spam sausage spam spam bacon spam tomato and spam; \
spam spam spam egg and spam; spam spam spam spam spam spam baked beans spam spam spam; \
...or Lobster Thermidor au Crevette with a Mornay sauce served in a Provencale manner with shallots \
and aubergines garnished with truffle pate, brandy and with a fried egg on top and spam."

# Ad Hoc Stylometry

In [None]:
with open('christ-and-satan.txt') as f:
    cs_text = f.read()

In [None]:
# This cell reproduces the series of operations performed above in the 'Visualization'
# section, but adds one new list comprehension that approximates an analysis of alliteration

cs_string = cs_text.lower()
cs_tokens = cs_string.split()
# Keep only each word's first character; the letters a, e, i, o, u and y are all
# recorded as 'a' so that vowel-initial words are tallied as a single group.
first_letters = [x[0] if x[0] not in ['a','e','i','o','u','y'] else 'a' for x in cs_tokens]
allit_dict = Counter(first_letters)
allit_freq = allit_dict.most_common()
common_keys = [x[0] for x in allit_freq]
common_values = [x[1] for x in allit_freq]
# Compute the grand total once rather than once per element of the comprehension.
total_count = sum(common_values)
normed_values = [x[1]/total_count for x in allit_freq]
cumulative_values = np.cumsum(normed_values)

figure(figsize = (10,10))
xticks(range(len(common_keys)), common_keys, rotation='vertical')
plot(cumulative_values)

In [None]:
cs_fitts = cs_text.split('\n\n')

In [None]:
cs_fitts

In [None]:
figure(figsize = (10,10))
# The x axis ranks the four most frequent initial sounds of each fitt. The tick
# labels are identical for every fitt, so they are set once instead of on every pass.
xticks(range(4), ['1st','2nd','3rd','4th'], rotation='vertical')
for i in range(12):
    cs_string = cs_fitts[i].lower()
    cs_tokens = cs_string.split()
    # First character of each word, with a, e, i, o, u and y all recorded as 'a'.
    first_letters = [x[0] if x[0] not in ['a','e','i','o','u','y'] else 'a' for x in cs_tokens]
    allit_dict = Counter(first_letters)
    allit_freq = allit_dict.most_common()
    common_keys = [x[0] for x in allit_freq]
    common_values = [x[1] for x in allit_freq]
    # Compute the fitt's total once rather than once per element of the comprehension.
    total_count = sum(common_values)
    normed_values = [x[1]/total_count for x in allit_freq]
    cumulative_values = np.cumsum(normed_values)
    # One line per fitt; the colour is taken from the bwr colormap at a position
    # that advances with the fitt index.
    plot(cumulative_values[:4], color = plt.cm.bwr(i*.085), lw=3)
legend(labels=['Fitt '+str(i+1) for i in range(12)], loc=0)

In [None]:
def allit_plotter(text, color=None):
    """Plot the cumulative frequency of the four most common initial sounds in *text*.

    The text is lower-cased and split on whitespace, and the first character of
    each word is tallied, with the letters a, e, i, o, u and y all counted as 'a'.
    The tallies are sorted from most to least common, converted to proportions of
    all words, and accumulated; the first four cumulative values are drawn as one
    line on the current figure.

    The plotting names ``xticks``, ``plot`` and ``plt`` are not imported here; they
    are expected to be in the notebook namespace already (``%pylab inline``).

    Args:
        text: The string to analyse.
        color: Optional matplotlib colour for the line. When omitted, the colour is
            ``plt.cm.bwr(i*.085)``, where ``i`` is the notebook-level loop variable,
            so the function can still be called as ``allit_plotter(text)`` from
            inside ``for i in range(12)``.

    Raises:
        NameError: If ``color`` is omitted and no global ``i`` is defined.
    """
    from collections import Counter
    import numpy as np

    tokens = text.lower().split()
    # NOTE(review): an Old English text may also contain words beginning with 'æ',
    # which this vowel list would not fold into 'a' -- confirm against the text file.
    first_letters = [x[0] if x[0] not in ['a','e','i','o','u','y'] else 'a' for x in tokens]
    allit_freq = Counter(first_letters).most_common()
    common_values = [count for letter, count in allit_freq]
    # Compute the grand total once rather than once per element of the comprehension.
    total_count = sum(common_values)
    normed_values = [count/total_count for count in common_values]
    cumulative_values = np.cumsum(normed_values)

    if color is None:
        # Backward-compatible default: derive the colour from the global loop index.
        color = plt.cm.bwr(i*.085)
    xticks(range(4), ['1st','2nd','3rd','4th'], rotation='vertical')
    plot(cumulative_values[:4], color = color, lw=3)

In [None]:
# Draw one line per fitt on a single figure by calling allit_plotter on each of
# the first twelve entries of cs_fitts.
figure(figsize = (10,10))
# The loop variable must stay named `i`: allit_plotter reads the global `i`
# to choose each line's colour.
for i in range(12):
    allit_plotter(cs_fitts[i])
# Label the lines 'Fitt 1' through 'Fitt 12', in the order they were plotted.
legend(labels=['Fitt '+str(i+1) for i in range(12)], loc=0)

In [None]:
# EX. Modify the script to more accurately measure alliteration. Remember: Alliteration is 
# the repetition of a sound at the beginning of two or more words in the same line.

# EX. Thornbury had based part of her argument on the standard deviation of frequencies for
# each of the most common alliterative sounds. Write a script to measure that.