In [1]:
import pandas as pd

dataset = pd.read_csv('../data/Shakespeare_data.csv')

I hypothesized that certain characters might say things more fancily or in a more flowery way than other characters. I'm using the python library textstat to calculate the Flesch-Kinkaid Reading Ease rating for a particular PlayerLine. This rates a given text on a scale of 1-121.22, with higher scores meaning the text is less difficult to read, and lower scores meaning the text is more difficult to read. This might allow for a classifier to be more accurate, as scores might tend to cluster differently depending on the character speaking the line.

In [2]:
import textstat

dataset['dale_chall_readability'] = dataset.apply(lambda row: textstat.dale_chall_readability_score(row['PlayerLine']), axis=1)

dataset

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine,dale_chall_readability
0,1,Henry IV,,,,ACT I,0.10
1,2,Henry IV,,,,SCENE I. London. The palace.,7.04
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ...",9.36
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",5.84
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",5.84
5,6,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils,8.50
6,7,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.,8.50
7,8,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil,6.01
8,9,Henry IV,1.0,1.1.6,KING HENRY IV,Shall daub her lips with her own children's bl...,5.84
9,10,Henry IV,1.0,1.1.7,KING HENRY IV,"Nor more shall trenching war channel her fields,",7.98


Another interesting data point to look at might be the average length of words used in the line, as certain characters might have a propensity to use longer words.

In [6]:
def avg_word_length(text):
    text = [word.replace(".", "").replace(",", "").replace(";", "").replace(":","") for word in text.split()]
    text_lens = sum([len(word) for word in text])
    return text_lens / len(text)

dataset['avg_word_length'] = dataset.apply(lambda row: avg_word_length(row['PlayerLine']), axis=1)
dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine,dale_chall_readability,avg_word_length
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",5.84,3.111111
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",5.84,3.666667
5,6,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils,8.50,5.714286
6,7,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.,8.50,4.571429
7,8,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil,6.01,4.250000
8,9,Henry IV,1.0,1.1.6,KING HENRY IV,Shall daub her lips with her own children's bl...,5.84,4.555556
9,10,Henry IV,1.0,1.1.7,KING HENRY IV,"Nor more shall trenching war channel her fields,",7.98,5.000000
10,11,Henry IV,1.0,1.1.8,KING HENRY IV,Nor bruise her flowerets with the armed hoofs,6.01,4.750000
11,12,Henry IV,1.0,1.1.9,KING HENRY IV,"Of hostile paces: those opposed eyes,",9.20,5.000000
12,13,Henry IV,1.0,1.1.10,KING HENRY IV,"Which, like the meteors of a troubled heaven,",7.98,4.500000


One thing we might want to do first: remove the rows that aren't said by characters (but are stage directions or something else). Let's do that:

In [7]:
dataset = dataset[(dataset.PlayerLinenumber.notnull()) & (dataset.ActSceneLine.notnull()) & (dataset.Player.notnull())]
dataset

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine,dale_chall_readability,avg_word_length
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",5.84,3.111111
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",5.84,3.666667
5,6,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils,8.50,5.714286
6,7,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.,8.50,4.571429
7,8,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil,6.01,4.250000
8,9,Henry IV,1.0,1.1.6,KING HENRY IV,Shall daub her lips with her own children's bl...,5.84,4.555556
9,10,Henry IV,1.0,1.1.7,KING HENRY IV,"Nor more shall trenching war channel her fields,",7.98,5.000000
10,11,Henry IV,1.0,1.1.8,KING HENRY IV,Nor bruise her flowerets with the armed hoofs,6.01,4.750000
11,12,Henry IV,1.0,1.1.9,KING HENRY IV,"Of hostile paces: those opposed eyes,",9.20,5.000000
12,13,Henry IV,1.0,1.1.10,KING HENRY IV,"Which, like the meteors of a troubled heaven,",7.98,4.500000


Lets also convert as many columns as possible to float/integer values so we can use them as features in classification.

We can convert the plays to indices, and we can convert ActSceneLine into independent columns:

In [9]:
unique_plays = list(dataset.Play.unique())
def conv_to_index(play_name):
    return unique_plays.index(play_name)

dataset['Play'] = dataset['Play'].map(lambda x: conv_to_index(x))
dataset[['Act', 'Scene', 'Line']] = dataset['ActSceneLine'].str.split(pat='.', expand=True)
dataset
 

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine,dale_chall_readability,avg_word_length,Act,Scene,Line
3,4,0,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",5.84,3.111111,1,1,1
4,5,0,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",5.84,3.666667,1,1,2
5,6,0,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils,8.50,5.714286,1,1,3
6,7,0,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.,8.50,4.571429,1,1,4
7,8,0,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil,6.01,4.250000,1,1,5
8,9,0,1.0,1.1.6,KING HENRY IV,Shall daub her lips with her own children's bl...,5.84,4.555556,1,1,6
9,10,0,1.0,1.1.7,KING HENRY IV,"Nor more shall trenching war channel her fields,",7.98,5.000000,1,1,7
10,11,0,1.0,1.1.8,KING HENRY IV,Nor bruise her flowerets with the armed hoofs,6.01,4.750000,1,1,8
11,12,0,1.0,1.1.9,KING HENRY IV,"Of hostile paces: those opposed eyes,",9.20,5.000000,1,1,9
12,13,0,1.0,1.1.10,KING HENRY IV,"Which, like the meteors of a troubled heaven,",7.98,4.500000,1,1,10


For this dataset we will attempt to classify using a Random Forest Classifier. 

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

dataset.columns
x_dataset = dataset[['Play', 'PlayerLinenumber', 'dale_chall_readability', 'avg_word_length', 'Act', 'Scene', 'Line']]
y_dataset = dataset[['Player']]

X = x_dataset.to_numpy()
Y = y_dataset.to_numpy()

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)
predictions = rfc.predict(X_test)
accuracy_score = accuracy_score(Y_test, predictions)
accuracy_score



  app.launch_new_instance()


0.6920736056297846

My accuracy score was 0.692, which is not horrible given that we trained on some peculiar data in average word length and Dale-Chall Score.