In [1]:
import numpy as np
import pandas as pd

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing

In [3]:
# Read the raw Shakespeare lines dataset and show its first rows as plain text.
csv_path = './Data/Shakespeare_data.csv'
df = pd.read_csv(csv_path)
preview = df.head()
print(preview)


   Dataline      Play  PlayerLinenumber ActSceneLine         Player  \
0         1  Henry IV               NaN          NaN            NaN   
1         2  Henry IV               NaN          NaN            NaN   
2         3  Henry IV               NaN          NaN            NaN   
3         4  Henry IV               1.0        1.1.1  KING HENRY IV   
4         5  Henry IV               1.0        1.1.2  KING HENRY IV   

                                          PlayerLine  
0                                              ACT I  
1                       SCENE I. London. The palace.  
2  Enter KING HENRY, LORD JOHN OF LANCASTER, the ...  
3             So shaken as we are, so wan with care,  
4         Find we a time for frighted peace to pant,  


### Feature Engineering
<br>
<br>
We can see there are NaN values and we want to remove the rows containing them to make the data usable
<br>
We also see that PlayerLine will require natural language processing. We will remove PlayerLine to avoid this complexity
<br>
There could also be a bias in the speech patterns that could affect the model, since the same person wrote every line

In [4]:
# Discard every row that has a missing value, then remove the free-text
# PlayerLine column (it would need NLP to be usable as a feature).
df = (
    df
    .dropna()
    .drop(columns=['PlayerLine'])
)
df.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player
3,4,Henry IV,1.0,1.1.1,KING HENRY IV
4,5,Henry IV,1.0,1.1.2,KING HENRY IV
5,6,Henry IV,1.0,1.1.3,KING HENRY IV
6,7,Henry IV,1.0,1.1.4,KING HENRY IV
7,8,Henry IV,1.0,1.1.5,KING HENRY IV


We can drop dataline because it is the same as index
<br>
ActSceneLine has to be split into three separate columns (Act, Scene, Line) because its combined value (e.g. "1.1.3") is not a usable number for a model
<br>
Once split, the original ActSceneLine column can be dropped

In [5]:
# Dataline is only a running row counter, so it adds nothing as a feature.
df = df.drop(columns=['Dataline'])

# Break "act.scene.line" strings such as "1.1.3" into three pieces.
dfActSceneLine = df['ActSceneLine'].str.split(pat='.', n=2, expand=True)
df = df.drop(columns=['ActSceneLine'])

# str.split returns strings; convert each piece explicitly so the model is fed
# real numbers instead of relying on scikit-learn's implicit object->float
# coercion. pd.to_numeric raises if a piece is not numeric, surfacing bad rows
# here rather than deep inside model.fit().
for position, column in enumerate(['Act', 'Scene', 'Line']):
    df[column] = pd.to_numeric(dfActSceneLine[position])
df.head()

Unnamed: 0,Play,PlayerLinenumber,Player,Act,Scene,Line
3,Henry IV,1.0,KING HENRY IV,1,1,1
4,Henry IV,1.0,KING HENRY IV,1,1,2
5,Henry IV,1.0,KING HENRY IV,1,1,3
6,Henry IV,1.0,KING HENRY IV,1,1,4
7,Henry IV,1.0,KING HENRY IV,1,1,5


We want to predict Player by using Play, PlayerLinenumber, Act, Scene, and Line
<br>
To do this, we need numeric values for the model, but Player and Play are strings in our dataframe
<br><br>
Label encoding Player and Play solves this problem

In [11]:
# Use one encoder per column. Re-fitting a single LabelEncoder on a second
# column overwrites its learned classes, which would make the Play codes
# impossible to decode afterwards.
play_encoder = preprocessing.LabelEncoder()
df['Play'] = play_encoder.fit_transform(df['Play'])

# label_encoder holds the Player mapping; it is the encoder to use when
# turning predicted integer codes back into player names.
label_encoder = preprocessing.LabelEncoder()
df['Player'] = label_encoder.fit_transform(df['Player'])
df.head()

Unnamed: 0,Play,PlayerLinenumber,Player,Act,Scene,Line
3,9,1.0,457,1,1,1
4,9,1.0,457,1,1,2
5,9,1.0,457,1,1,3
6,9,1.0,457,1,1,4
7,9,1.0,457,1,1,5


In [16]:
# Predictors (independent variables) and the target column (dependent variable).
feature_columns = ['Play', 'PlayerLinenumber', 'Act', 'Scene', 'Line']
target_column = 'Player'

indepVars = df[feature_columns]
depVars = df[target_column]

In [17]:
# Hold out 30% of the rows for evaluation and train on the remaining 70%.
# The fixed random_state keeps the partition identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    indepVars,
    depVars,
    test_size=0.3,
    random_state=1,
)

First we will try a random forest

In [19]:
# Fit a small random forest. random_state is fixed (same seed as the
# train/test split) so the bootstrapped trees, and therefore the predictions,
# are repeatable across runs.
model = RandomForestClassifier(n_estimators=10, random_state=1)
model.fit(X_train, Y_train)
Y_predict = model.predict(X_test)

# label_encoder was last fitted on the Player column, so it maps the predicted
# integer codes back to player names.
labeled_prediction = label_encoder.inverse_transform(Y_predict)
labeled_prediction

NameError: name 'le_player' is not defined

In [9]:
# Create the decision tree classifier. A fixed random_state makes the fitted
# tree reproducible (scikit-learn permutes candidate features at each split,
# so ties between equally good splits can otherwise resolve differently).
classifier = DecisionTreeClassifier(random_state=1)

# Train the classifier on the training split (not the test split).
classifier = classifier.fit(X_train, Y_train)

# Predict on the held-out test split so accuracy can be measured on unseen rows.
y_pred = classifier.predict(X_test)

In [10]:
# Fraction of test rows whose predicted player matches the true player.
accuracy = metrics.accuracy_score(Y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7749001458188043
