# CSCI E-81 HW5 - Partners: Piyawan Chirayus and Cindy Liu

In [1]:
import re
import requests  # pip install requests
from bs4 import BeautifulSoup

%matplotlib inline
import numpy as np
import scipy as sp
import pandas as pd

from sklearn import cluster
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import MDS

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split



In [2]:
BASE_URL = 'http://shakespeare.mit.edu/'

r = requests.get(BASE_URL)
assert(r.status_code == 200)

soup = BeautifulSoup(r.content, 'html.parser')
table = soup.find_all('table')[1]
titles = {}
for link in table.find_all('a'):
    if 'Poetry' not in link.get('href'):
        if '\n' in link.string:
            link.string = link.string.replace('\n', ' ').strip()
        href = link.get('href').replace('index', 'full')
        send_url = BASE_URL + href
        print('Getting {} from {}'.format(link.string, send_url))
        r = requests.get(send_url)
        assert(r.status_code == 200)
        titles[link.string] = {
            'href': href,
            'soup': BeautifulSoup(r.content, 'html.parser')
        }

Getting All's Well That Ends Well from http://shakespeare.mit.edu/allswell/full.html
Getting As You Like It from http://shakespeare.mit.edu/asyoulikeit/full.html
Getting The Comedy of Errors from http://shakespeare.mit.edu/comedy_errors/full.html
Getting Cymbeline from http://shakespeare.mit.edu/cymbeline/full.html
Getting Love's Labours Lost from http://shakespeare.mit.edu/lll/full.html
Getting Measure for Measure from http://shakespeare.mit.edu/measure/full.html
Getting The Merry Wives of Windsor from http://shakespeare.mit.edu/merry_wives/full.html
Getting The Merchant of Venice from http://shakespeare.mit.edu/merchant/full.html
Getting A Midsummer Night's Dream from http://shakespeare.mit.edu/midsummer/full.html
Getting Much Ado About Nothing from http://shakespeare.mit.edu/much_ado/full.html
Getting Pericles, Prince of Tyre from http://shakespeare.mit.edu/pericles/full.html
Getting Taming of the Shrew from http://shakespeare.mit.edu/taming_shrew/full.html
Getting The Tempest from 

In [291]:
COLUMN_NAMES = ['Title','Act','Scene','Speaker','Words']


def add_data(df, title, act, scene, speaker, words):
    if speaker in df['Speaker'].values:
        index = (df['Speaker'] == speaker).argmax()
        if df['Title'][index] == title and df['Act'][index] == act and df['Scene'][index] == scene:
            df.set_value(index, 'Words', df['Words'][index] + ' ' + words)
            return df
    new_row = pd.DataFrame([[title, act, scene, speaker, words]], columns=COLUMN_NAMES)
    df = df.append(new_row, ignore_index=True)
    return df


data = pd.DataFrame([], columns=COLUMN_NAMES)
for title in titles:
    print(title)
    act = titles[title]['soup'].find('h3')
    scene = speaker = words = ''
    if act.string[:3] != 'ACT':
        scene = act.string.split('.')[0]
        act.string = 'None'
    for elem in act.next_elements:
        if elem.name == 'h3':
            if words:
                data = add_data(data, title, act.string, scene, speaker, words)
                words = ''
            if elem.string[:3] == 'ACT':
                act = elem
            else:
                scene = elem.string.split('.')[0]
        if elem.name == 'a':
            if elem.attrs['name'][:6] == 'speech':
                if words:
                    data = add_data(data, title, act.string, scene, speaker, words)
                    words = ''
                speaker = elem.string.lower()
            else:
                if words:
                    words += ' ' + elem.string.strip()
                else:
                    words += elem.string.strip()

Coriolanus
Measure for Measure
Richard II
Cymbeline
King Lear
Timon of Athens
All's Well That Ends Well
The Tempest
Two Gentlemen of Verona
Henry VI, part 3
Henry VIII
Richard III
Romeo and Juliet
The Merry Wives of Windsor
Hamlet
Henry IV, part 2
As You Like It
Henry IV, part 1
Taming of the Shrew
Troilus and Cressida
Titus Andronicus
Love's Labours Lost
Othello
The Merchant of Venice
Pericles, Prince of Tyre
Henry V
Macbeth
A Midsummer Night's Dream
Henry VI, part 2
Henry VI, part 1
Winter's Tale
King John
Julius Caesar
The Comedy of Errors
Much Ado About Nothing
Twelfth Night
Antony and Cleopatra


In [269]:
ref = pd.read_table('Shakespeare_characters.txt', header=0, encoding='latin-1')
del ref['Unnamed: 1']
ref = ref.set_value((ref['Speaker'] == 'Lafew').argmax(), 'Speaker', 'Lafeu')
ref.head()

Unnamed: 0,Speaker,Gender,NumLines,Play,TopVillain,Fools
0,Hamlet,Male,1506,Hamlet,,
1,Iago,Male,1088,Othello,yes,
2,King Henry,Male,1031,Henry V,,
3,Othello,Male,880,Othello,,
4,Timon,Male,850,Timon of Athens,,


In [292]:
data.head()

Unnamed: 0,Title,Act,Scene,Speaker,Words
0,Coriolanus,ACT I,SCENE I,first citizen,"Before we proceed any further, hear me speak. ..."
1,Coriolanus,ACT I,SCENE I,all,"Speak, speak. Resolved. resolved. We know't, w..."
2,Coriolanus,ACT I,SCENE I,second citizen,"One word, good citizens. Would you proceed esp..."
3,Coriolanus,ACT I,SCENE I,menenius,"What work's, my countrymen, in hand? where go ..."
4,Coriolanus,ACT I,SCENE I,marcius,"Thanks. What's the matter, you dissentious rog..."


In [350]:
print(data.shape)
print(ref.shape)

(25085, 5)
(1533, 6)


In [349]:
missing_from_mine = []
missing_from_ref = []
have = data['Speaker'].unique()
check = ref['Speaker'].unique()
check = [x.lower() for x in check]
for x in check:
    if x not in have:
        missing_from_mine.append(x)
for x in have:
    if x not in check:
        missing_from_ref.append(x)
print('Missing from mine:', len(missing_from_mine))
print('Missing from ref:', len(missing_from_ref))

Missing from mine: 296
Missing from ref: 178
