In [78]:
## Extract type token ratio, average word length, average sentence length,Coleman–Liau index  and other features 
## Refer to the Readability_Features folder, Column_Denote.txt
import pandas as pd
import numpy as np
import re

from nltk.corpus import gutenberg
from nltk.tokenize import wordpunct_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer

## tokenize, stem, remove stopwords
def tokenStem(words):
    words = words.strip('[').strip(']').lower() #remove brackets and lowercase
    words = re.sub('[(){}<>\'"]', '', words)
    stemmer = PorterStemmer()
#     stops = stopwords.words('english')
    output = [stemmer.stem(token) for token in wordpunct_tokenize(words) ] #stem words
    return "".join(words) #merge into strings

#import file
df = pd.read_csv('Data/all_plays.txt',sep='\t')
# tokenize, stem, and remove stopwords
df['speech'] = df['speech'].map(lambda x: tokenStem(x))

# Get all the play names
playnames = pd.unique(df.playname.ravel())

# df['speech'] = df['speech'].map(lambda x: tokenStem(x))
# # Array to hold bag of words for each play
play_bagwords = []
act_bagwords = []
scene_bagwords = []

## loop through all plays
for i in range(len(playnames)):
    
    ### Create bag of words for all plays
    p = df[df['playname'] == playnames[i]] # Get the sub data frame of each play
    s = "" # Initiate empty string to hold bag of words for play

    # Iterate all the rows to append the speech and speaker words to a string
    for index,row in p.iterrows():
        s += str(row['speaker'])
        s += str(row['speech'])
    
    # Append the bag of words to each play
    play_bagwords.append(s)
   
    ### Create bag of words for all the acts in each play
    s = "" # Initiate empty string to hold bag of words for acts
    acts = pd.unique(p.act.ravel()) # Get the number of acts and scenes
    for j in range(len(acts)):
        
        a = p[p['act'] == acts[j]] #Get the current act
        # Array to hold bag of words for each bag
#         act_bagwords = []
         # Iterate all the rows to append the speech and speaker words to a string
        for index,row in a.iterrows():
            s += str(row['speaker'])
            s += str(row['speech'])
        
         # Append the bag of words to each act within the play
        act_bagwords.append(s) 
        s = ""
        scenes = pd.unique(a.scene.ravel()) # Get the number of acts and scenes
        # Array to hold bag of words for each scene
#         scene_bagwords = []
        for z in range(len(scenes)):
        
            sc = a[a['scene'] == scenes[z]] #Get the current act
        
             # Iterate all the rows to append the speech and speaker words to a string
            for index,row in sc.iterrows():
                s += str(row['speaker'])
                s += str(row['speech'])
    
            # Append the bag of words to each act within the play
            scene_bagwords.append(s)  
            

In [103]:
## Initiate arrays to hold stats about the works
num_sentences = [] 
num_words = []
num_letters = []
avglen_words = []
avglen_sentences = []
CLI_score = []
# count_less5letter_words = []


# Loop through all scenes
for x in scene_bagwords: 
    x = re.sub('[(){}<>\'?,!:;"]', '.', x) # Replace all signs with .
    x = x.split('.') # Split into sentences
    num_sentence = len(x) # Get the number of sentences for this play
    num_word = 0
    num_letter = 0
    cli = 0
    for y in x:
        y = wordpunct_tokenize(y)
        num_word += len((y)) # Accumulate the number of words for this sentence
        for z in y:
            num_letter += len(z) 
    cli = 5.88*num_letter/num_word - 29.6*num_sentence/num_word - 15.8
    num_letters.insert(-1,num_letter) # Insert  num_letters into the array
    num_words.insert(-1,num_word) # Insert  num_words into the array
    num_sentences.insert(-1,num_sentence) # Insert  num_sentences into the array
    avglen_words.insert(-1,num_letter/num_word)
    avglen_sentences.insert(-1,num_word/num_sentence)
    CLI_score.insert(-1,cli)
stats_feature = [num_letters,num_words,num_sentences,CLI_score,avglen_words,avglen_sentences]
# print(num_sentences,num_words,num_letters,CLI_score)
stats_feature = np.array(stats_feature)
stats_feature = stats_feature.T
np.savetxt("Scene_style_feature.txt", stats_feature, delimiter=",")

#     # print(play_ttratio)
# np.savetxt("Act_TTratio_.txt", act_ttratio, delimiter=",")
# # Split to each sentence
# e = d.split('.')
# # print(e)
# print(e[1])
# # Get the number of sentences in this corpos
# print(len(e)-1)
# ## Get the length(number of words) of the sentence 1
# print(len(wordpunct_tokenize(e[1])))
# ## Split the sentence into words
# f = wordpunct_tokenize(e[1])
# print (len(f[0]))
# # f = wordpunct_tokenize(e[1])
# # print(f)
# # print(len(f[0]))

In [75]:
act_ttratio = []

for x in act_bagwords:
    total_words = len(wordpunct_tokenize(x))  # Get the number of words used
    total_vocab = len(set(wordpunct_tokenize(x))) # Get the number of unique vocabs used
    act_ttratio.insert(-1,total_vocab/total_words) # Insert the ttratio 
# print(play_ttratio)
np.savetxt("Act_TTratio_.txt", act_ttratio, delimiter=",")

In [70]:
## Array to hold the token type ratio, the higher the ratio, the richer the vocab is
play_ttratio = []

for x in play_bagwords:
    total_words = len(wordpunct_tokenize(x))  # Get the number of words used
    total_vocab = len(set(wordpunct_tokenize(x))) # Get the number of unique vocabs used
    play_ttratio.insert(-1,total_vocab/total_words) # Insert the ttratio 
print(play_ttratio)
np.savetxt("Play_TTratio_.txt", play_ttratio, delimiter=",")

[0.32183206106870227, 0.3335540254237288, 0.28943293267458153, 0.34536033914272257, 0.3243269894487723, 0.32161446443247194, 0.29911154985192495, 0.33028418099768264, 0.3273753527751646, 0.33730337078651684, 0.32380379175443874, 0.35039941902687, 0.3116771831424788, 0.34145325838017904, 0.3451912082327531, 0.2961689999161707, 0.30061543317026984, 0.2946845354615269, 0.3125292557341239, 0.32078204199855176, 0.29053335359367877, 0.28449299453811444, 0.2904374364191251, 0.3102593440122044, 0.28514318348698703, 0.28450218745857087, 0.3299655200064149, 0.31422986708365913, 0.3146608014296515, 0.3053809270111881, 0.30109042355908316, 0.3716646989374262, 0.31064487273318697, 0.2794516303926884, 0.3412844036697248, 0.29697603651578547, 0.3207441860465116]
