# CSCI E-81 HW5 - Partners: Piyawan Chirayus and Cindy Liu

In [484]:
import re
import requests  # pip install requests
from bs4 import BeautifulSoup

%matplotlib inline
import numpy as np
import scipy as sp
import pandas as pd

from sklearn import cluster
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import MDS

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split

VERBOSE = True

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cindy.liu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


<a id='top'></a>

## Table of Contents
- [Loading data](#loading-data)
- [Data processing](#data-processing)
- [Stemming](#stemming)
- [Feature extraction using TF-IDF](#tf-idf)
- [MDS](#mds)

<a id='loading-data'></a>
[back to top](#top)

## Loading data

In [478]:
ref = pd.read_table('Shakespeare_characters.txt', header=0, encoding='latin-1')
del ref['Unnamed: 1']
ref = ref.set_value((ref.Speaker == 'Lafew').argmax(), 'Speaker', 'Lafeu')
ref.head()

Unnamed: 0,Speaker,Gender,NumLines,Play,TopVillain,Fools
0,Hamlet,Male,1506,Hamlet,,
1,Iago,Male,1088,Othello,yes,
2,King Henry,Male,1031,Henry V,,
3,Othello,Male,880,Othello,,
4,Timon,Male,850,Timon of Athens,,


In [461]:
BASE_URL = 'http://shakespeare.mit.edu/'

r = requests.get(BASE_URL)
assert(r.status_code == 200)

soup = BeautifulSoup(r.content, 'html.parser')
table = soup.find_all('table')[1]
titles = {}
num_titles = 0
if not VERBOSE:
    print('Getting documents', end='')
for link in table.find_all('a'):
    if 'Poetry' not in link.get('href'):
        if '\n' in link.string:
            link.string = link.string.replace('\n', ' ').strip()
        href = link.get('href').replace('index', 'full')
        send_url = BASE_URL + href
        if VERBOSE:
            print('Getting {} from {}'.format(link.string, send_url))
        else:
            print('.', end='')
        num_titles += 1
        r = requests.get(send_url)
        assert(r.status_code == 200)
        titles[link.string] = {
            'href': href,
            'soup': BeautifulSoup(r.content, 'html.parser')
        }
print('Done! Retrieved {} documents.'.format(num_titles))

Getting documents.....................................Done! Retrieved 37 documents.


<a id='data-processing'></a>
[back to top](#top)

## Processing raw documents into data frame

In [462]:
COLUMN_NAMES = ['Title','Act','Scene','Speaker','Words']


def add_data(df, title, act, scene, speaker, words):
    if speaker in df.Speaker.values:
        indices = list(df[df.Title == title][df.Act == act][df.Scene == scene][df.Speaker == speaker].index)
        if len(indices) == 1:
            index = indices[0]
            df.set_value(index, 'Words', df.Words[index] + ' ' + words)
            return df
        elif len(indices) > 1:
            print(title, act, scene, speaker, indices)
            return df
    new_row = pd.DataFrame([[title, act, scene, speaker, words]], columns=COLUMN_NAMES)
    df = df.append(new_row, ignore_index=True)
    return df


data = pd.DataFrame([], columns=COLUMN_NAMES)
num_processed = 0

if not VERBOSE:
    print('Processing documents', end='')
for title in titles:
    if VERBOSE:
        print(title)
    else:
        print('.', end='')
    act = titles[title]['soup'].find('h3')
    scene = speaker = words = ''
    if act.string[:3] != 'ACT':
        scene = act.string.split('.')[0]
        act.string = 'None'
    for elem in act.next_elements:
        if elem.name == 'h3':
            if words:
                data = add_data(data, title, act.string, scene, speaker, words)
                words = ''
            if elem.string[:3] == 'ACT':
                act = elem
            else:
                scene = elem.string.split('.')[0]
        if elem.name == 'a':
            if elem.attrs['name'][:6] == 'speech':
                if words:
                    data = add_data(data, title, act.string, scene, speaker, words)
                    words = ''
                speaker = elem.string.lower()
            else:
                if words:
                    words += ' ' + elem.string.strip()
                else:
                    words += elem.string.strip()
    num_processed += 1
print('Done! Processed {} documents.'.format(num_processed))

Processing documents.



....................................Done! Processed 37 documents.


In [479]:
data.Words = data.Words.str.replace("[^a-zA-Z]", " ")
data.Words = data.Words.str.lower()
print('DataFrame dimensions:', data.shape)
print('Reference dimensions:', ref.shape)
data.head(10)

DataFrame dimensions: (3974, 5)
Reference dimensions: (1533, 6)


Unnamed: 0,Title,Act,Scene,Speaker,Words
0,Coriolanus,ACT I,SCENE I,first citizen,before we proceed any further hear me speak ...
1,Coriolanus,ACT I,SCENE I,all,speak speak resolved resolved we know't w...
2,Coriolanus,ACT I,SCENE I,second citizen,one word good citizens would you proceed esp...
3,Coriolanus,ACT I,SCENE I,menenius,what work's my countrymen in hand where go ...
4,Coriolanus,ACT I,SCENE I,marcius,thanks what's the matter you dissentious rog...
5,Coriolanus,ACT I,SCENE I,messenger,where's caius marcius the news is sir the v...
6,Coriolanus,ACT I,SCENE I,first senator,marcius 'tis true that you have lately told u...
7,Coriolanus,ACT I,SCENE I,cominius,you have fought together it is your former pr...
8,Coriolanus,ACT I,SCENE I,titus,no caius marcius i'll lean upon one crutch a...
9,Coriolanus,ACT I,SCENE I,sicinius,was ever man so proud as is this marcius when...


<a id='stemming'></a>
[back to top](#top)

## Stemming

In [473]:
porter = PorterStemmer()

def tokenizer_porter(text):
    #for word in text.split():
    #    print(word, porter.stem(word))
    return [porter.stem(word) for word in text.split()]

def tokenizer_vanilla(text):
    return [word for word in text.split()]

<a id='tf-idf'></a>
[back to top](#top)

## Feature extraction using TF-IDF

In [485]:
all_stopwords = stopwords.words('english')
vectorizer = TfidfVectorizer(tokenizer = tokenizer_vanilla,
                             stop_words=all_stopwords,
                             use_idf=False,
                             max_features = 100,ngram_range=(1,1))
train_data_features = vectorizer.fit_transform(data.Words.values)
train_data_features = train_data_features.A #toarray()  
vocab = vectorizer.get_feature_names()

In [490]:
df = pd.DataFrame(data=train_data_features,columns=vocab)
print(df.shape)
df.head()

(3974, 100)


Unnamed: 0,','tis,art,away,ay,bear,better,blood,brother,call,...,true,two,upon,us,way,well,whose,world,would,yet
0,0.0,0.0,0.0,0.0,0.061663,0.061663,0.0,0.0,0.0,0.0,...,0.061663,0.0,0.0,0.678289,0.0,0.308313,0.0,0.0,0.184988,0.123325
1,0.0,0.0,0.0,0.516398,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.267261,0.0,0.0,0.0,0.267261,0.0
3,0.211604,0.0,0.070535,0.0,0.0,0.0,0.0,0.141069,0.0,0.0,...,0.070535,0.0,0.070535,0.0,0.141069,0.282138,0.070535,0.0,0.0,0.141069
4,0.0,0.0,0.082761,0.0,0.0,0.0,0.0,0.0,0.0,0.082761,...,0.0,0.0,0.331042,0.0,0.0,0.082761,0.082761,0.082761,0.496564,0.0


<a id='mds'></a>
[back to top](#top)

## MDS

In [491]:
CS_dist = 1 - cosine_similarity(train_data_features)
Euclid_dist = euclidean_distances(train_data_features)
print(Euclid_dist.shape, CS_dist.shape)

(3974, 3974) (3974, 3974)


In [357]:
missing_from_mine = []
missing_from_ref = []
have = data['Speaker'].unique()
check = ref['Speaker'].unique()
check = [x.lower() for x in check]
for x in check:
    if x not in have:
        missing_from_mine.append(x)
for x in have:
    if x not in check:
        missing_from_ref.append(x)
print('Missing from mine:', len(missing_from_mine))
print('Missing from ref:', len(missing_from_ref))

Missing from mine: 296
Missing from ref: 178
