In [154]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

Word cloud approach adapted from: https://www.kaggle.com/kingburrito666/data-wordcloud-shakespeares-most-sentimental

### Reading the dataset

In [155]:
# Load the Shakespeare plays CSV into a DataFrame.
all_data = pd.read_csv(
    "../input/shakespeare-plays/Shakespeare_data.csv",
    sep=",",
)

In [156]:
# Copy the PlayerLine column into a fresh Series and force every entry to str.
player_line_values = all_data['PlayerLine'].tolist()
wrdcld = pd.Series(player_line_values).astype(str)

In [157]:
# Read the silhouette image into an array to use as the word-cloud mask.
# The context manager closes the underlying file handle once the pixel data
# has been copied into the array.
with Image.open("../input/shakespeare-plays/william-shakespeare-black-silhouette.jpg") as silhouette:
    mask = np.array(silhouette)

### Creating a wordcloud

In [158]:
from wordcloud import WordCloud

# Join every line into one string and render it inside the silhouette mask.
cloud_text = ' '.join(wrdcld.astype(str))
cloud = WordCloud(mask=mask, margin=0, max_font_size=125).generate(cloud_text)

fig, ax = plt.subplots(figsize=(20, 15))
ax.imshow(cloud)
ax.axis('off')
plt.show()

### Replacing missing values (NaN) in Player with 'Other'

In [159]:
# Label rows without a speaker as 'Other'.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on all_data['Player'] operates on an intermediate object, which does not
# update all_data under pandas Copy-on-Write and warns on pandas 2.x.
all_data['Player'] = all_data['Player'].fillna('Other')

all_data.head(10)

In [160]:
# Number of rows in the dataset.
all_data.shape[0]

In [161]:
# Non-null value counts for every column, one row per play.
plays_grouped = all_data.groupby('Play')
plays_grouped.count()

In [162]:
# PlayerLine counts per play, largest first.
play_data = (
    all_data
    .groupby('Play')
    .count()
    .sort_values(by='PlayerLine', ascending=False)
    ['PlayerLine']
)
play_data


In [163]:
# Promote the per-play line counts to a one-column frame.
# NOTE(review): this rebinds play_data to a different type, so the cell is not
# safe to re-run on its own — confirm whether it is still needed.
play_data = play_data.to_frame()
play_data

In [164]:
# Replace the existing index with positional integers 0..n-1.
play_data.index = np.arange(len(play_data))

In [165]:
# Rebuild the per-play line counts as a two-column frame: 'Lines' and 'Play'.
play_data = (
    all_data
    .groupby('Play')
    .count()
    .sort_values(by='PlayerLine', ascending=False)['PlayerLine']
    .to_frame(name='Lines')
)
# Keep the play names as a regular column before the index is replaced.
play_data['Play'] = list(play_data.index)
# Swap the play-name index for positional integers.
play_data.index = np.arange(len(play_data))

In [166]:
# Distinct Player values per play, largest first, as a one-column frame.
number_players = (
    all_data
    .groupby(['Play'])['Player']
    .nunique()
    .sort_values(ascending=False)
    .to_frame()
)

In [167]:
# Copy the play names out of the index into a column, then name both columns.
number_players['Play'] = list(number_players.index)
number_players.columns = ['NumPlayers', 'Play']

# Swap the play-name index for positional integers.
number_players.index = np.arange(len(number_players))

number_players

In [168]:
# Horizontal bar chart: one bar per play, bar length = number of distinct players.
fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(x='NumPlayers', y='Play', data=number_players, ax=ax)
ax.set(xlabel="NUmber of players", ylabel='Play Name')
plt.show()

### TEXT preprocessing on entire data

In [169]:
# Concatenate every line into one newline-separated string and report its length.
full_text = "\n".join(all_data['PlayerLine'])
print(len(full_text))

First, let's convert everything to lowercase and strip out punctuation.

In [170]:
# `re` is imported here because the notebook's own `import re` sits in a later
# cell; without this the cell raises NameError on a fresh kernel (Run All).
import re

# Preview the cleaning step: show the first six lines, each followed by its
# letters-only, lowercased form.
for raw_line in all_data.PlayerLine.head(6):
    print(raw_line)
    print(re.sub('[^A-Za-z]+', ' ', raw_line).strip().lower())

In [171]:
import collections
import re

In [172]:
# ignoring punctuation and capitalisation

def ignore_punctuation(text_array):
    """Return a cleaned copy of each line in ``text_array``.

    Every run of characters other than A-Z / a-z is collapsed to a single
    space; the result is then stripped of surrounding whitespace and
    lowercased.

    Args:
        text_array: iterable of strings.

    Returns:
        list of cleaned strings, one per input line, in the same order.
    """
    non_letters = re.compile('[^A-Za-z]+')
    cleaned = []
    for line in text_array:
        spaced = non_letters.sub(' ', line)
        cleaned.append(spaced.strip().lower())
    return cleaned

# Clean every line of dialogue in the dataset.
lines = ignore_punctuation(all_data['PlayerLine'])

In [173]:
# Show the first two cleaned lines.
for sample_idx in (0, 1):
    print(lines[sample_idx])

### Tokenization 

Each text sequence is split into tokens

In [174]:
def tokenize(lines, token='word'):
    """Split each text line into a list of tokens.

    Args:
        lines: iterable of strings.
        token: 'word' to split on whitespace, or 'char' to split into
            individual characters.

    Returns:
        list of token lists, one per input line.

    Raises:
        ValueError: if ``token`` is neither 'word' nor 'char'.
    """
    if token == 'word':
        # split() without an argument collapses whitespace runs and returns []
        # for an empty line, so no empty-string tokens are produced.
        return [line.split() for line in lines]

    if token == 'char':
        return [list(line) for line in lines]

    # Fail immediately instead of printing and returning None, which would
    # only surface later as a confusing error in the caller.
    raise ValueError(f'unknown token type: {token!r}')

# Word-level tokenization of every cleaned line.
tokens = tokenize(lines, token='word')

In [175]:
# Show the token lists of the first five lines.
for row in range(5):
    print(tokens[row])

### Creating Vocabulary

String tokens are inconvenient for models, which take numerical inputs.
So let us build a dictionary, often called a vocabulary, that maps string tokens to numerical
indices starting from 0. To do so, we first count the unique tokens across all documents in the
training set (the corpus), and then assign a numerical index to each unique token according to
its frequency. Rarely occurring tokens are often removed to reduce complexity. Any token that
does not exist in the corpus or has been removed is mapped to a special unknown token `<unk>`.
We can optionally add a list of reserved tokens, such as `<pad>` for padding, `<bos>` to mark the
beginning of a sequence, and `<eos>` to mark the end of a sequence.


In [176]:
def count_corpus(tokens):
    """Count how often each token occurs.

    Args:
        tokens: either a flat list of tokens or a list of token lists
            (one inner list per line). A list of lists is flattened first.

    Returns:
        collections.Counter mapping each token to its number of occurrences.
    """
    # Only the first element is inspected to decide whether flattening is
    # needed; an empty input flattens to an empty list.
    if len(tokens) == 0 or isinstance(tokens[0], list):
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)

In [177]:
# Try the counter on the first five tokenized lines.
count_corpus(tokens[:5])

In [178]:
class Vocab:
    """Two-way mapping between tokens and integer indices.

    Index 0 is always the unknown token '<unk>', followed by any reserved
    tokens, followed by corpus tokens in descending order of frequency.
    """

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Build the vocabulary.

        Args:
            tokens: flat list of tokens or list of token lists; passed to
                ``count_corpus``. Defaults to an empty corpus.
            min_freq: tokens occurring fewer than this many times are left out.
            reserved_tokens: extra tokens placed right after '<unk>'.
        """
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []

        counter = count_corpus(tokens)

        # (token, count) pairs, most frequent first.
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)

        self.idx_to_token = ['<unk>'] + list(reserved_tokens)
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}

        for token, freq in self._token_freqs:
            # The list is sorted by descending count, so everything after the
            # first too-rare token is too rare as well.
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        """Return the number of entries, including '<unk>' and reserved tokens."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a list/tuple of tokens, to index/indices.

        Tokens that are not in the vocabulary map to the unknown index.
        """
        if not isinstance(tokens, (list, tuple)):
            # unk is a method, so it must be called to get the integer index.
            return self.token_to_idx.get(tokens, self.unk())
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a list/tuple of indices, back to token(s)."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    def unk(self):
        """Return the index of the unknown token."""
        return 0

    def token_freqs(self):
        """Return the (token, count) pairs sorted by descending count."""
        return self._token_freqs

In [179]:
# Build the vocabulary from every tokenized line, using the default
# min_freq and no reserved tokens.
vocab = Vocab(tokens)

In [180]:
# Show the first ten (token, index) pairs of the vocabulary.
first_entries = list(vocab.token_to_idx.items())[:10]
print(first_entries)

In [181]:
# Flatten all tokenized lines into one list of vocabulary indices.
corpus = [vocab[word] for sentence in tokens for word in sentence]

In [182]:
# Report the corpus length and the vocabulary size.
corpus_size = len(corpus)
vocab_size = len(vocab)
print("Total length of corpus: ", corpus_size)
print("Vocab size of shakespeare: ", vocab_size)