# To be, or not to be

Dalton Hahn (2762306)

## Shakespearean Play Data
https://www.kaggle.com/kingburrito666/shakespeare-plays/download

## Goal
- Perform feature engineering upon the data to enable classification
- Build a classification model that predicts which character (the `Player` column) speaks a line, based on the other columns in the dataset

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
import math
from statistics import mean, stdev
import string

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

## Read in the Data

In [2]:
# Load the Shakespeare lines dataset from the project's external-data folder.
df = pd.read_csv(filepath_or_buffer="../data/external/Shakespeare_data.csv")

In [3]:
# Preview the first five rows to see which columns the dataset provides.
df.head(n=5)

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


## First Glance
Looking through the data, we have some cleaning to do.
- Sometimes there is no player or dialogue (stage directions)
- The "PlayerLine" column contains punctuation; we want the words only
- Separate out the "ActSceneLine" column to its individual components
- Create a dictionary of plays with a mapping of player to their respective lines

In [4]:
# Stage directions and other non-dialogue rows carry NaN in at least one column;
# drop every such row and report the frame's shape on either side of the filter.
shape_before = df.shape
print("With NaNs = ", shape_before)

df = df.dropna(axis=0, how="any")
shape_after = df.shape
print("Without NaNs = ", shape_after)

df.head(n=5)

With NaNs =  (111396, 6)
Without NaNs =  (105152, 6)


Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"
5,6,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils
6,7,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.
7,8,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil


In [5]:
# Remove punctuation from the PlayerLine column.
#
# The translation table is loop-invariant, so it is built once and reused for
# every row. Note that deleting punctuation also fuses hyphenated words
# ("short-winded" becomes "shortwinded").
punctuation_table = str.maketrans('', '', string.punctuation)
clean_lines = [text.translate(punctuation_table) for text in df["PlayerLine"]]

# assign() replaces an existing CleanLines column instead of appending a second
# one, so re-running this cell leaves the frame unchanged (the previous
# insert(..., True) call allowed duplicate columns to pile up).
df = df.assign(CleanLines=clean_lines)
df.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine,CleanLines
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",So shaken as we are so wan with care
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",Find we a time for frighted peace to pant
5,6,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils,And breathe shortwinded accents of new broils
6,7,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.,To be commenced in strands afar remote
7,8,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil,No more the thirsty entrance of this soil


In [6]:
# Break each ActSceneLine reference ("act.scene.line") into its three parts.

# Split every value on '.' once, then pick the pieces out by position.
# The pieces are kept as strings, exactly as they appear in the source column.
reference_parts = [reference.split('.') for reference in df["ActSceneLine"]]

acts = [parts[0] for parts in reference_parts]
scenes = [parts[1] for parts in reference_parts]
lines = [parts[2] for parts in reference_parts]

In [7]:
# Replace the combined ActSceneLine column with the separate Act, Scene and
# Line columns built from the `acts`, `scenes` and `lines` lists.
#
# The whole swap is a single expression that only rebinds `df` on success:
# assign() overwrites same-named columns rather than duplicating them, and if
# ActSceneLine is already gone (cell re-run) the drop raises before `df` is
# touched, so the frame is never left with duplicated Act/Scene/Line columns.
df = df.assign(Act=acts, Scene=scenes, Line=lines).drop(columns='ActSceneLine')

df.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,Player,PlayerLine,CleanLines,Act,Scene,Line
3,4,Henry IV,1.0,KING HENRY IV,"So shaken as we are, so wan with care,",So shaken as we are so wan with care,1,1,1
4,5,Henry IV,1.0,KING HENRY IV,"Find we a time for frighted peace to pant,",Find we a time for frighted peace to pant,1,1,2
5,6,Henry IV,1.0,KING HENRY IV,And breathe short-winded accents of new broils,And breathe shortwinded accents of new broils,1,1,3
6,7,Henry IV,1.0,KING HENRY IV,To be commenced in strands afar remote.,To be commenced in strands afar remote,1,1,4
7,8,Henry IV,1.0,KING HENRY IV,No more the thirsty entrance of this soil,No more the thirsty entrance of this soil,1,1,5


In [8]:
# Translating Plays and Player into numeric values
#
# Each column is converted to a pandas categorical and its integer category
# codes are stored next to the original string column. assign() is used so that
# re-running the cell overwrites PlayerCode / PlayCode instead of appending
# duplicate columns.

# Print the number of unique players in the Player column
print("Unique players = ", len(df["Player"].unique()))

# Replace the string label of a player's name with a numeric category code
# (the codes keep the frame's index, so they align row-for-row on assignment)
unique_name_codes = df["Player"].astype("category").cat.codes

# Print the largest code present (codes start at 0)
print(unique_name_codes.max())

# Add the name codes to the dataframe
df = df.assign(PlayerCode=unique_name_codes)


# Print the number of unique plays in the Play column
print("Unique Plays = ", len(df["Play"].unique()))

# Replace the string label of a play's name with a numeric category code
unique_play_codes = df["Play"].astype("category").cat.codes

# Print the largest code present (codes start at 0)
print(unique_play_codes.max())

# Add the play codes to the dataframe
df = df.assign(PlayCode=unique_play_codes)


df.head()

Unique players =  934
933
Unique Plays =  36
35


Unnamed: 0,Dataline,Play,PlayerLinenumber,Player,PlayerLine,CleanLines,Act,Scene,Line,PlayerCode,PlayCode
3,4,Henry IV,1.0,KING HENRY IV,"So shaken as we are, so wan with care,",So shaken as we are so wan with care,1,1,1,457,9
4,5,Henry IV,1.0,KING HENRY IV,"Find we a time for frighted peace to pant,",Find we a time for frighted peace to pant,1,1,2,457,9
5,6,Henry IV,1.0,KING HENRY IV,And breathe short-winded accents of new broils,And breathe shortwinded accents of new broils,1,1,3,457,9
6,7,Henry IV,1.0,KING HENRY IV,To be commenced in strands afar remote.,To be commenced in strands afar remote,1,1,4,457,9
7,8,Henry IV,1.0,KING HENRY IV,No more the thirsty entrance of this soil,No more the thirsty entrance of this soil,1,1,5,457,9


## Now that the features can be expressed as numeric values (the text columns are label-encoded in the cell below), let's try a random forest classifier
- Following https://www.datacamp.com/community/tutorials/random-forests-classifier-python for inspiration

In [None]:
# Fixed seed so that the train/test split and the forest -- and therefore the
# reported accuracy -- are reproducible from one run to the next.
SEED = 42

# Label-encode the two text columns in place. fit_transform refits the encoder
# on each call, so after these two lines `le` holds the CleanLines mapping only.
# NOTE(review): an integer code per distinct line of text is presumably a weak
# feature for predicting the speaker -- confirm whether it should stay in X.
le = preprocessing.LabelEncoder()
df["PlayerLine"] = le.fit_transform(df["PlayerLine"])
df["CleanLines"] = le.fit_transform(df["CleanLines"])


X = df[['PlayerLinenumber', 'CleanLines', 'Act', 'Scene', 'Line', 'PlayCode']]  # Features
y = df['Player']  # Labels

# Split dataset into training set and test set (80% training, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Create the random forest classifier
clf = RandomForestClassifier(n_estimators=250, max_features=3, random_state=SEED)

# Train the model on the training set
clf.fit(X_train, y_train)

# Predict the held-out test set and report the fraction of correct labels
y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))