# Shakespeare's plays

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#NLP libraries
import spacy
from empath import Empath

# load spaCy's 'en_core_web_sm' pipeline once; it is passed to empath_analysis in the topic detection cells
nlp = spacy.load('en_core_web_sm')

## Read the data

In [None]:
# folder holding the raw dataset (relative to this notebook)
DATA = "../data/"

# load the csv file, discard the 'Dataline' column and
# keep only the rows that have a value in every remaining column
df = (
    pd.read_csv(DATA + "Shakespeare_data.csv")
    .drop(columns=['Dataline'])
    .dropna()
)

In [None]:
# preview the first rows of the loaded data
df.head()

In [None]:
# print a summary of the dataframe
df.info()

## Clean the dataframe

In [None]:
# clean dataframe
# take an explicit copy after filtering: the column assignments below then modify
# an independent dataframe instead of a slice (no SettingWithCopyWarning)
df = df[df.Player.notnull()].copy()

# get the individual act, scene and line numbers from the ActSceneLine column ("act.scene.line")
# .str[i] gives NaN when a part is missing instead of raising an IndexError
act_scene_line = df['ActSceneLine'].str.split(".")
for position, column in enumerate(['Act', 'Scene', 'Line']):
    # parts that are missing or not numeric become NaN
    df[column] = pd.to_numeric(act_scene_line.str[position], errors='coerce')

# normalize the player names: first letter upper case, all other letters lower case
df['Player'] = df['Player'].str.capitalize()

# plays of interest
plays = ['Hamlet', 'Othello', 'Macbeth', 'King Lear', 'Romeo and Juliet', 
         'A Midsummer nights dream', 'Julius Caesar', 'Merchant of Venice']
# same normalization as the one applied to df['Play'] below, so that isin() can match
plays = [play.capitalize() for play in plays]

# filter the selected plays in the dataframe
# NOTE: after this step the titles are stored in capitalized form (e.g. "King lear")
df['Play'] = df['Play'].str.capitalize()
# copy again so that columns added later are not written to a slice
df = df[df['Play'].isin(plays)].copy()

df

In [None]:
# creates one dataframe for each play
# df['Play'] stores the titles normalized with str.capitalize() (e.g. "King lear", "Macbeth"),
# so every title is normalized the same way before the comparison; comparing against
# another spelling silently returns an empty dataframe
def _lines_of(title):
    ''' Returns the rows of df that belong to the play with the given title '''
    return df[df["Play"] == title.capitalize()]

Hamlet = _lines_of("Hamlet")
King_Lear = _lines_of("King Lear")
Julius_Caesar = _lines_of("Julius Caesar")
macbeth = _lines_of("Macbeth")
Merchant_of_Venice = _lines_of("Merchant of Venice")
A_Midsummer_nights_dream = _lines_of("A Midsummer nights dream")
Othello = _lines_of("Othello")
Romeo_and_Juliet = _lines_of("Romeo and Juliet")

In [None]:
# check for null values: number of missing entries in each column
df.isnull().sum()

## Analysis of the plays

### List all the players per play

In [None]:
# build a dictionary that maps every play to the list of its distinct players
players_per_play = {}
for play_title in df['Play'].unique():
    cast = df.loc[df['Play'] == play_title, 'Player']
    players_per_play[play_title] = cast.unique().tolist()
players_per_play

### Number of players in each play

In [None]:
# count the distinct players of every play, largest cast first
numberPlayers = (
    df.groupby('Play')['Player']
    .nunique()
    .sort_values(ascending=False)
    .rename('Num Players')
    .reset_index()
)

# horizontal bar chart: one bar per play
plt.figure(figsize=(10, 10))
ax = sns.barplot(data=numberPlayers, x='Num Players', y='Play')
ax.set_xlabel('Number of Players')
ax.set_ylabel('Play Name')
plt.show()

### Number of lines for each play

In [None]:
# number of rows of the dataframe belonging to each play, drawn as horizontal bars
Play_Count = df['Play'].value_counts()
Play_Count.plot.barh(label='Play Count', figsize=(8, 9), colormap='plasma')

#### Number of lines for each player

In [None]:
# number of rows per player name, restricted to the 30 most frequent names
Player_Count = df['Player'].value_counts().iloc[:30]
Player_Count.plot.barh(label='Player Count Top 30', figsize=(8, 10), colormap='plasma')

In [None]:
# set matplotlib's default figure size
plt.rcParams['figure.figsize']=(12.5,5)
# bar plot of the PlayerLinenumber column, one bar per play
ax = sns.barplot(x='Play',y='PlayerLinenumber',data = df)
# rotate the play names on the x axis by 90 degrees
plt.setp(ax.get_xticklabels(), rotation=90)

### Get the names of the players mentioned in each line of the play

In [None]:
def line_analysis(line_series, players_per_play):
    ''' Analyze one line of a play: count its words and detect which players of the same play are mentioned.

    line_series: row of the dataframe; 'PlayerLine' (text of the line) and 'Play' (title) are read
    players_per_play: dictionary mapping every play title to the list of its players' names

    Returns a copy of line_series (the input is left untouched) extended with
        'NbWords': number of words in the line
        'NbMentionedPlayers': number of player names found in the line
        'MentionedPlayer': list of the player names found, in order of appearance

    Raises a KeyError when the play of the line is not a key of players_per_play.
    The line is compared word by word, so only single-word names can be detected.
    '''

    line = line_series['PlayerLine']

    # tokenize the line into words and remove the punctuation;
    # tokens made only of punctuation (e.g. "--") end up empty and are not words
    stripped_tokens = (token.strip(string.punctuation) for token in line.split())
    words = [word for word in stripped_tokens if word]

    # get the players' names for this play (as a set: constant-time membership test)
    players = set(players_per_play[line_series['Play']])

    # get the names of the players mentioned in the line
    # words are capitalized before the lookup, the names are expected in that form
    capitalized_words = (word.capitalize() for word in words)
    mentioned_players = [name for name in capitalized_words if name in players]

    # work on a copy so the caller's series is not modified
    result = line_series.copy()
    result['NbWords'] = len(words)
    result['NbMentionedPlayers'] = len(mentioned_players)
    result['MentionedPlayer'] = mentioned_players

    return result

In [None]:
# run the line analysis on every row of the dataframe (word count and mentioned players)
mentions = df.apply(line_analysis, axis=1, args=(players_per_play,))

# add new columns to main dataframe
for new_column in ('NbWords', 'NbMentionedPlayers', 'MentionedPlayer'):
    df[new_column] = mentions[new_column]

# rows without an ActSceneLine value (scene description lines) get no mention information
is_description = df['ActSceneLine'].isnull()
df.loc[is_description, 'NbMentionedPlayers'] = None
df.loc[is_description, 'MentionedPlayer'] = None

# one row per (line, mentioned player) pair; keep only rows that have both
# an ActSceneLine value and a mentioned player
mentions = mentions.explode('MentionedPlayer')
has_mention = mentions['ActSceneLine'].notnull() & mentions['MentionedPlayer'].notnull()
mentions = mentions.loc[has_mention, ['Play', 'ActSceneLine', 'Player', 'MentionedPlayer']]
mentions

In [None]:
# preview the first 5 rows, now with the NbWords, NbMentionedPlayers and MentionedPlayer columns
df.head(5)

### Find the first and last players speaking in a scene

In [None]:
def find_players(x):
    ''' Finds the first and last players speaking in a scene

    x: dataframe holding the rows of one scene; the 'Line' (line number) and 'Player' columns are read

    Returns a series with the entries 'first_player' and 'last_player': the players of the rows
    with the smallest and the largest line number. When a line number occurs several times, the
    first matching row is used. Both entries are None when no row has a line number.
    '''

    # rows without a line number cannot be ordered
    numbered = x[x['Line'].notna()]
    if numbered.empty:
        return pd.Series(data = {'first_player': None, 'last_player': None})

    line_numbers = numbered['Line'].to_numpy()
    speakers = numbered['Player'].to_numpy()

    # use the smallest available line number instead of assuming that line 1 is present;
    # argmin / argmax return the position of the first occurrence
    first_player = speakers[line_numbers.argmin()]
    last_player = speakers[line_numbers.argmax()]

    return pd.Series(data = {'first_player': first_player, 'last_player': last_player})

In [None]:
# first and last player speaking in a scene
# (find_players is applied to the rows of every Play / Act / Scene group)
df.groupby(['Play', 'Act', 'Scene']).apply(find_players)

### Find the most talkative player in each scene

In [None]:
# get the number of words of each player in each scene
talkative_df = df.groupby(['Play', 'Act', 'Scene', 'Player'])['NbWords'].sum().reset_index() 
# get player that said the most words in each scene:
# select, for every scene, the complete row holding the largest word count.
# (Taking .max() of 'Player' and 'NbWords' computes each column maximum on its own and
# would pair the alphabetically last player name with a count that may belong to someone else.)
top_rows = talkative_df.groupby(['Play', 'Act', 'Scene'])['NbWords'].idxmax()
talkative_df = talkative_df.loc[top_rows].reset_index(drop=True)
talkative_df

## Topic detection

In [None]:
def empath_analysis(s, categories, nlp):
    ''' Runs the topic detection on one line of a play.

    s: series type object, its 'PlayerLine' entry holds the text of the line
    categories: list of the Empath categories to look for
    nlp: callable applied to the text; the 'text' attribute of its result is analyzed

    Returns a series built from the result of the Empath analysis.
    Uses the global 'lexicon' object, which has to exist when the function is called.
    '''

    # pass the line through nlp and take the text of the resulting document
    line_text = nlp(s['PlayerLine']).text

    # empath analysis restricted to the requested categories
    return pd.Series(lexicon.analyze(line_text, categories=categories))

In [None]:
# Empath lexicon, used as a global by empath_analysis
lexicon = Empath()

# larger list of candidate topics, currently not used:
#categories = ["love", "betrayal", "loyalty", "revenge", "family", "power", "guilt", "fate", "fortune", 
#              "poverty", "forgiveness", "reconciliation", "jealousy", "war", "corruption", "good", "evil"]
categories = ["love", "family", "power", "war"]

# make sure it is not a description line; the copy is taken after the filter so that the
# topic columns below are added to an independent dataframe and not to a slice of df
semantic_df = df[df['ActSceneLine'].notnull()].copy()
semantic_df[categories] = semantic_df.apply(lambda x: empath_analysis(x, categories, nlp), axis=1) # topic detection

semantic_df

### Detect how the topics evolve between acts for each play

In [None]:
# topic detection for each play - for each act
# add up the word counts and the topic values of all lines of every act
play_topics = (
    semantic_df.copy()
    .groupby(['Play', 'Act'])[['NbWords'] + categories]
    .sum()
    .reset_index()
)
# normalize: divide the topic values of every act by the number of words of that act
words_per_act = play_topics['NbWords']
play_topics.loc[:, categories] = play_topics[categories].div(words_per_act, axis=0)
play_topics

In [None]:
# long format: one row per play, act and topic, holding the normalized topic value
plot_df = play_topics.melt(
    id_vars=['Play', 'Act'],
    value_vars=categories,
    var_name='topic',
    value_name='value',
)
# store the act numbers as strings ("1", "2", ...)
plot_df['Act'] = plot_df['Act'].astype(int).astype(str)

# one panel per play, one line per topic
sns.relplot(
    data=plot_df,
    kind="line",
    x="Act",
    y="value",
    hue="topic",
    style="topic",
    col="Play",
    col_wrap=3,
)

### Detect how the topics evolve between different plays

In [None]:
# line plots of plot_df: one panel per topic (col), one line per play (hue / style), acts on the x axis
sns.relplot(
    data=plot_df, x="Act", y="value", col="topic",
    hue="Play", style="Play", kind="line", col_wrap=2
)

### Detect how the topics evolve for each player of each play

In [None]:
# topic detection for each play - for each player

# add up the word counts and the topic values of all lines of every player of a play
play_topics = (
    semantic_df.copy()
    .groupby(['Play', 'Player'])[['NbWords'] + categories]
    .sum()
    .reset_index()
)
# normalize: divide the topic values of every player by the number of words of that player
words_per_player = play_topics['NbWords']
play_topics.loc[:, categories] = play_topics[categories].div(words_per_player, axis=0)
play_topics

In [None]:
# topic detection for each play - for each act - for each player

# add up the word counts and the topic values of all lines of every player within an act
play_topics = (
    semantic_df.copy()
    .groupby(['Play', 'Act', 'Player'])[['NbWords'] + categories]
    .sum()
    .reset_index()
)
# normalize: divide the topic values by the number of words the player has in that act
words_per_group = play_topics['NbWords']
play_topics.loc[:, categories] = play_topics[categories].div(words_per_group, axis=0)
play_topics

In [None]:
# long format: one row per play, act, player and topic, holding the normalized topic value
plot_df = play_topics.reset_index().melt(
    id_vars=['Play', 'Act', 'Player'],
    value_vars=categories,
    var_name='topic',
    value_name='value',
)

# one panel per player name, one line per topic
sns.relplot(
    data=plot_df,
    kind="line",
    x="Act",
    y="value",
    hue="topic",
    style="topic",
    col="Player",
    col_wrap=4,
)

### Wordclouds

In [None]:
def _lines_as_text(frame):
    ''' Returns the PlayerLine values of a dataframe as a newly indexed series of strings '''
    return pd.Series(frame['PlayerLine'].tolist()).astype(str)

# lines of all the selected plays
all_word = _lines_as_text(df)
word = _lines_as_text(df)

# lines of every single play
Hamlet_word = _lines_as_text(Hamlet)
King_Lear_word = _lines_as_text(King_Lear)
Julius_Caesar_word = _lines_as_text(Julius_Caesar)
macbeth_word = _lines_as_text(macbeth)
Merchant_of_Venice_word = _lines_as_text(Merchant_of_Venice)
A_Midsummer_nights_dream_word = _lines_as_text(A_Midsummer_nights_dream)
Othello_word = _lines_as_text(Othello)
Romeo_and_Juliet_word = _lines_as_text(Romeo_and_Juliet)

Create a word cloud for each play to see which words appear most often.

In [None]:
from wordcloud import WordCloud
from wordcloud import STOPWORDS

# create stopword list: the default stopwords of the wordcloud package
# plus additional words that are excluded from the clouds
stopwords = set(STOPWORDS)
stopwords.update([
    "come", "will", "O", "let", "thou",
    "thy", "now", "know", "well", "thus",
    "thee", "go", "say", "yet", "upon",
    "hath", "tis", "make", "see", "may",
    "must", "give", "much", "one", "take",
])

# draw one word cloud per play
play_name = df['Play'].unique().tolist()
for play in play_name:
    # all the lines of the play, joined into a single text
    word = pd.Series(df.loc[df["Play"] == play, "PlayerLine"].tolist()).astype(str)
    text = ' '.join(word.astype(str))
    cloud = WordCloud(margin=0, stopwords=stopwords, max_font_size=125).generate(text)

    # print the title of the play, then show its cloud without axes
    print(play)
    plt.figure(figsize=(20, 15))
    plt.imshow(cloud)
    plt.axis('off')
    plt.show()

## Sentiment Analysis