In [1]:
names_master_list = [
    "Bilbo", "Bungo", "Belladonna", "Gandalf", "Radagast", "Thorin", "Balin", "Dwalin", "Kili", "Fili", "Dori", "Nori", "Ori", "Oin", "Gloin", "Bifur", "Bofur", "Bombur", "Elrond", "Thranduil", "Bard", "Beorn", "Smaug", "Gollum", "Bolg", "RoÃ¤c", "Tom", "Bert", "William",
    "Frodo", "Samwise", "Sam", "Meriadoc", "Merry", "Peregrin", "Pippin", "Rosie", "Fatty", "Saruman", "Aragorn", "Boromir", "Faramir", "Denethor", "Theoden", "Eomer", "Eowyn", "Grima", "Wormtongue" "Hama", "Hirgon", "Imrahil", "Beregond", "Bergil", "Halbarad", "Elladan", "Elrohir", "Legolas", "Galadriel", "Celeborn", "Arwen", "Glorfindel", "Haldir", "Dain", "Smjagol", "Treebeard", "Quickbeam", "Huorns", "Shelob", "Sauron", "Nazgul", "Grishnakh", "Ugluk", "Shagrat", "Gorbag", "Shadowfax", "Bill", "Snowmane", "Hasufel", "Arod", "Tom", "Goldberry", "Barliman", "Butterbur",

    "Lucy", "Edmund", "Susan", "Peter", "Aslan", "Tumnus", "Beaver", "Maugrim", "Kirke", "Macready", "Oreius", "Otmin",
    "Caspian", "Cornelius", "Trufflehunter", "Trumpkin", "Reepicheep", "Nikabrik", "Glenstorm", "Pattertwig", "Miraz", "Prunaprismia", "Glozelle", "Sopespian",

    "Mary", "Poppins", "George", "Winifred", "Jane", "Michael", "John", "Barbara", "Banks", "Brill", "Ellen", "Robertson", "Bert", "Boom", "Lark", "Andrew", "Corry", "Neleus", "Albert", "Arthur",

    "Robert", "Virginia", "Kathy", "Neville", "Ruth", "Ben", "Josephine", "Soames"
]


In [2]:
import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
from functools import reduce
import statistics

In [3]:
import glob
import os

# Create Tolkien Data Frame

file_list = glob.glob(os.path.join("Tolkien", "*", "*.txt"))
basename = "Tolkien"
data = []
for f in file_list:
    book = os.path.basename(os.path.dirname(f))   # folder name = book title
    chapter = os.path.basename(f).replace(".txt", "")
    text = open(f, "r", encoding="utf-8").read()
    data.append({"book": book, "chapter": chapter, "raw_text": text})

Tolkien_df = pd.DataFrame(data)

# Create C.S. Lewis Data Frame

file_list = glob.glob(os.path.join("CS_Lewis", "*", "*.txt"))
basename = "CS_Lewis"
data = []
for f in file_list:
    book = os.path.basename(os.path.dirname(f))   # folder name = book title
    chapter = os.path.basename(f).replace(".txt", "")
    text = open(f, "r", encoding="utf-8").read()
    data.append({"book": book, "chapter": chapter, "raw_text": text})

Lewis_df = pd.DataFrame(data)

# Create P.L. Travers Data Frame

file_list = glob.glob(os.path.join("PL_Travers", "*", "*.txt"))
basename = "PL_Travers"
data = []
for f in file_list:
    book = os.path.basename(os.path.dirname(f))   # folder name = book title
    chapter = os.path.basename(f).replace(".txt", "")
    text = open(f, "r", encoding="utf-8").read()
    data.append({"book": book, "chapter": chapter, "raw_text": text})

Travers_df = pd.DataFrame(data)

# Create Matheson Data Frame

file_list = glob.glob(os.path.join("Matheson", "*", "*.txt"))
basename = "Matheson"
data = []
for f in file_list:
    book = os.path.basename(os.path.dirname(f))   # folder name = book title
    chapter = os.path.basename(f).replace(".txt", "")
    text = open(f, "r", encoding="utf-8").read()
    data.append({"book": book, "chapter": chapter, "raw_text": text})

Matheson_df = pd.DataFrame(data)


In [4]:
def clean_text(text):
    text = text.lower()
    text = text.replace("\n", " ")
    text = re.sub(r'chapter\s+\w+', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r"'s$", "", text)
    pattern = r"\b(" + "|".join(re.escape(name) for name in names_master_list) + r")\b"
    text = re.sub(pattern, "X", text, flags=re.IGNORECASE)
    return text.strip()

In [5]:
Tolkien_df["clean_text"] = Tolkien_df["raw_text"].apply(clean_text)
Lewis_df["clean_text"] = Lewis_df["raw_text"].apply(clean_text)
Travers_df["clean_text"] = Travers_df["raw_text"].apply(clean_text)
Matheson_df["clean_text"] = Matheson_df["raw_text"].apply(clean_text)


In [6]:
def remove_stops_and_punct(text):
    # Tokenize and lowercase
    tokens = [t.lower() for t in word_tokenize(text)]
    
    # Remove punctuation
    tokens = [t for t in tokens if t not in string.punctuation]
    
    # Remove stopwords
    english_stops = stopwords.words("english")
    tokens = [t for t in tokens if t not in english_stops]

    # Join into a string
    return " ".join(tokens)

In [7]:
# Bag of words
Tolkien_df["words"] = Tolkien_df["clean_text"].apply(remove_stops_and_punct)
Lewis_df["words"] = Lewis_df["clean_text"].apply(remove_stops_and_punct)
Travers_df["words"] = Travers_df["clean_text"].apply(remove_stops_and_punct)
Matheson_df["words"] = Matheson_df["clean_text"].apply(remove_stops_and_punct)

In [8]:
def only_stops(text):
    # Tokenize and lowercase
    tokens = [t.lower() for t in word_tokenize(text)]
    
    # Keep only stopwords
    english_stops = stopwords.words("english")
    tokens = [t for t in tokens if t in english_stops]

    # Join into a string
    return " ".join(tokens)

In [9]:
# Only stopwords
Tolkien_df["stops"] = Tolkien_df["clean_text"].apply(only_stops)
Lewis_df["stops"] = Lewis_df["clean_text"].apply(only_stops)
Travers_df["stops"] = Travers_df["clean_text"].apply(only_stops)
Matheson_df["stops"] = Matheson_df["clean_text"].apply(only_stops)

In [10]:
def punct(text):
    # Tokenize and lowercase
    tokens = [t.lower() for t in word_tokenize(text)]
    
    # Keep only punctuation
    tokens = [t for t in tokens if t in string.punctuation]
    
    # Join into a string
    return " ".join(tokens)

In [11]:
# Only punctuation
Tolkien_df["punct"] = Tolkien_df["clean_text"].apply(punct)
Lewis_df["punct"] = Lewis_df["clean_text"].apply(punct)
Travers_df["punct"] = Travers_df["clean_text"].apply(punct)
Matheson_df["punct"] = Matheson_df["clean_text"].apply(punct)

In [12]:
# Unique word count
def unique_word_ratio(text_df, text):
    this_df = text_df[text_df["book"] == text]
    this_full_text = " ".join(this_df["words"])
    return len(set(this_full_text.split())) / len(this_full_text.split())
hobbit_unique_word_ratio = unique_word_ratio(Tolkien_df, "Hobbit")
print(hobbit_unique_word_ratio)

0.1382625415137214


In [13]:
# Average sentence length
def avg_sent_len(text_df, text):
    this_df = text_df[text_df["book"] == text]
    tokenized_text = word_tokenize(" ".join(this_df["clean_text"]))
    sent_lengths = []
    length = 0
    for token in tokenized_text:
        if token != ".":
            length += 1
        else:
            sent_lengths.append(length)
            length = 0
    avg_len = 0
    for i in sent_lengths:
        avg_len += i
    avg_len /= len(sent_lengths)
    avg_len = round(avg_len, 2)
    return [avg_len, max(sent_lengths)]

hobbit_avg_len = avg_sent_len(Tolkien_df, "Hobbit")
print(hobbit_avg_len)

[23.72, 214]


In [14]:
# Most frequent words
def most_frequent_words(text_df, text):
    this_df = text_df[text_df["book"] == text]
    this_full_text = word_tokenize(" ".join(this_df["words"]))
    this_words_freq = FreqDist(this_full_text).most_common(50)
    return this_words_freq
hobbit_mfw = most_frequent_words(Tolkien_df, "Hobbit")
print(hobbit_mfw)

[('``', 2781), ('x', 1710), ('said', 568), ('could', 368), ("'s", 328), ('one', 284), ('dwarves', 275), ('came', 244), ('would', 239), ("n't", 236), ('like', 221), ('long', 215), ('time', 214), ('back', 208), ('come', 197), ('great', 179), ('still', 173), ('good', 168), ('see', 161), ('go', 161), ('went', 161), ('little', 160), ('last', 160), ('far', 159), ('goblins', 153), ('way', 151), ('even', 143), ('dark', 141), ('got', 140), ('get', 136), ('soon', 135), ('well', 134), ('hobbit', 133), ('mountain', 131), ('many', 127), ('away', 126), ('made', 122), ('thought', 121), ('light', 116), ('old', 114), ('though', 112), ('never', 111), ('us', 111), ('ever', 111), ('round', 108), ('know', 108), ('must', 106), ('much', 105), ('door', 100), ('going', 100)]


In [15]:
# Most frequent stopwords
def most_frequent_stopwords(text_df, text):
    this_df = text_df[text_df["book"] == text]
    this_stops = word_tokenize(" ".join(this_df["stops"]))
    this_stops_freq = FreqDist(this_stops).most_common(50)
    return this_stops_freq
hobbit_mfs = most_frequent_stopwords(Tolkien_df, "Hobbit")
print(hobbit_mfs)

[('the', 5755), ('and', 4260), ('of', 2341), ('to', 2015), ('a', 1878), ('he', 1850), ('in', 1399), ('was', 1328), ('they', 1324), ('it', 1186), ('that', 1020), ('had', 898), ('his', 877), ('i', 829), ('you', 820), ('on', 745), ('not', 726), ('for', 703), ('as', 665), ('were', 654), ('all', 641), ('at', 635), ('with', 623), ('but', 622), ('them', 524), ('there', 513), ('their', 508), ('is', 493), ('have', 437), ('him', 434), ('from', 393), ('be', 361), ('up', 360), ('out', 350), ('or', 344), ('we', 325), ('are', 307), ('down', 306), ('if', 306), ('no', 296), ('what', 295), ('now', 293), ('so', 284), ('by', 278), ('do', 266), ('this', 262), ('when', 262), ('then', 253), ('very', 246), ('into', 234)]


In [16]:
# Most frequent punctuation
def most_frequent_punct(text_df, text):
    this_df = text_df[text_df["book"] == text]
    this_punct = word_tokenize(" ".join(this_df["punct"]))
    this_punct_freq = FreqDist(this_punct).most_common(25)
    return this_punct_freq
hobbit_mfp = most_frequent_punct(Tolkien_df, "Hobbit")
print(hobbit_mfp)

[(',', 5455), ('.', 4332), ('!', 992), (';', 593), ('?', 312), (':', 164), ('(', 116), (')', 116), ("'", 89), ('-', 23), ('*', 6), ('&', 1), ('}', 1)]


In [17]:
# Create an object type to represent the data from each book
class Book:
  def __init__(self, author_df, book):
    self.title = book
    self.vocab_richness = unique_word_ratio(author_df, book)
    self.sentence_length = avg_sent_len(author_df, book)
    self.most_common_words = most_frequent_words(author_df, book)
    self.most_common_stopwords = most_frequent_stopwords(author_df, book)
    self.most_common_punct = most_frequent_punct(author_df, book)

  def get_data(self):
    return([self.vocab_richness, self.sentence_length, self.most_common_words, self.most_common_stopwords, self.most_common_punct])


In [18]:
class Author:
    def __init__(self, name, books):
        self.name = name
        self.books = books
        self.vocab_richness = sum(book.vocab_richness for book in books) / len(books)
        self.sentence_length = sum(book.sentence_length[0] for book in books) / len(books)
        self.most_common_words = FreqDist(word for book in books for word, count in book.most_common_words for _ in range(count)).most_common(50)
        self.most_common_stopwords = FreqDist(word for book in books for word, count in book.most_common_stopwords for _ in range(count)).most_common(50)
        self.most_common_punct = FreqDist(word for book in books for word, count in book.most_common_punct for _ in range(count)).most_common(25)

    def get_data(self):
        return([self.vocab_richness, self.sentence_length, self.most_common_words, self.most_common_stopwords, self.most_common_punct])


In [19]:
The_Hobbit = Book(Tolkien_df, "Hobbit")
Fellowship = Book(Tolkien_df, "Fellowship")
Two_Towers = Book(Tolkien_df, "Two_Towers")
Return = Book(Tolkien_df, "Return")

Tolkien = Author("Tolkien", [The_Hobbit, Fellowship, Two_Towers, Return])

Lion = Book(Lewis_df, "Lion")
Caspian = Book(Lewis_df, "Caspian")

Lewis = Author("Lewis", [Lion, Caspian])

Poppins = Book(Travers_df, "Poppins")

Travers = Author("Travers", [Poppins])

Legend = Book(Matheson_df, "Legend")

Matheson = Author("Matheson", [Legend])

authors = [Tolkien, Lewis, Travers, Matheson]

In [20]:
Test_1 = Book(Tolkien_df, "Test_1")
Test_2 = Book(Lewis_df, "Test_2")
Test_3 = Book(Travers_df, "Test_3")
Test_4 = Book(Matheson_df, "Test_4")

**Metrices for comparison:**

Vocabulary richness

Average sentence length

50 most common words

50 most common stopwords

25 most common punctuative items

In [21]:
def compare_books(book1, book2):
    book1_data = book1.get_data()
    book2_data = book2.get_data()
    vocab_diff = 1 - abs(book1_data[0] - book2_data[0])
    sent_len = 1 - (abs(book1_data[1][0] - book2_data[1][0]) / max([book1_data[1][1], book2_data[1][1]]))
    words_diff = len(set(word for word, count in book1_data[2]) & set(word for word, count in book2_data[2])) / 50
    words_diff_set = set(word for word, count in book1_data[2]) & set(word for word, count in book2_data[2])
    stopwords_diff = len(set(word for word, count in book1_data[3]) & set(word for word, count in book2_data[3])) / 50
    stopwords_diff_set = set(word for word, count in book1_data[3]) & set(word for word, count in book2_data[3])
    punct_diff = len(set(word for word, count in book1_data[4]) & set(word for word, count in book2_data[4])) / 25
    punct_diff_set = set(word for word, count in book1_data[4]) & set(word for word, count in book2_data[4])
    return [vocab_diff, sent_len, words_diff, words_diff_set, stopwords_diff, stopwords_diff_set, punct_diff, punct_diff_set]

In [22]:
def get_one_value_for_comparison(values):
    # Standard deviation of vocabulary richness
    vocabs = []
    for entry in values:
        vocabs.append(entry[0])
    if len(values) > 1:
        vocab = 1 - statistics.stdev(vocabs)
    else:
        vocab = entry[0]
    # So that a high value means that the vocabulary richnesses are similar

    # Standard deviation of average sentence lengths
    lens = []
    for entry in values:
        lens.append(entry[1])
    if len(values) > 1:
        lent = 1 - statistics.stdev(lens)
    else:
        lent = entry[1]

    # Proportion of 50 most common words shared by all compared texts
    words_diff = len(reduce(lambda x, y: x & y, (row[3] for row in values))) / 50

    # Proportion of 50 most common stopwords shared by all compared texts
    stopwords_diff = len(reduce(lambda x, y: x & y, (row[5] for row in values))) / 50

    # Proportion of 25 most common puncts shared by all compared texts
    puncts_diff = len(reduce(lambda x, y: x & y, (row[7] for row in values))) / 25

    return (vocab + lent + words_diff + stopwords_diff + puncts_diff) / 5

In [27]:
def test_new_text(text):
    summary_comparisons = {}
    # Get a dictionary with author and a number from 0 to 1 predicting how similar the text is to the author's other works
    for author in authors:
        comparisons = []
        for book in author.books:
            comparisons.append(compare_books(text, book))
        summary_comparisons[author] = get_one_value_for_comparison(comparisons)

    # Go through each author and their likelihood of having written this text, get the most likely one
    highest = 0
    guess_author = ""
    for key, value in summary_comparisons.items():
        if value > highest:
            highest = value
            guess_author = key.name
    
    print("The author of this text is most likely", guess_author, "with similarity", round(highest * 100, 2),"%")

In [28]:
test_new_text(Test_1) # Sample of chapter 10 from each of the four Tolkien books
test_new_text(Test_2) # Sample of chapters 14 and 15 from C.S. Lewis' The Lion, The Witch, and the Wardrobe
test_new_text(Test_3) # Sample of chapter 12 from P.L. Travers' Mary Poppins
test_new_text(Test_4) # Sample of chapters 20 and 21 from Matheson's I Am Legend

The author of this text is most likely Tolkien with similarity 75.16 %
The author of this text is most likely Lewis with similarity 70.25 %
The author of this text is most likely Travers with similarity 66.46 %
The author of this text is most likely Matheson with similarity 69.81 %
