In [10]:
def read_book(title_path):
    """
    Read a book and return it as a string.
    """
    with open(title_path, "r", encoding="utf8") as current_file:
        text = current_file.read()
        text = text.replace("\n", "").replace("\r", "")
        return text

In [11]:
text = read_book("./Books/English/shakespeare/Romeo and Juliet.txt")

In [13]:
len(text)

169275

In [14]:
ind = text.find("What's in a name?")

In [15]:
ind

42757

In [16]:
sample_text = text[ind: ind + 1000]

In [17]:
sample_text

"What's in a name? That which we call a rose    By any other name would smell as sweet.    So Romeo would, were he not Romeo call'd,    Retain that dear perfection which he owes    Without that title. Romeo, doff thy name;    And for that name, which is no part of thee,    Take all myself.  Rom. I take thee at thy word.    Call me but love, and I'll be new baptiz'd;    Henceforth I never will be Romeo.  Jul. What man art thou that, thus bescreen'd in night,    So stumblest on my counsel?  Rom. By a name    I know not how to tell thee who I am.    My name, dear saint, is hateful to myself,    Because it is an enemy to thee.    Had I it written, I would tear the word.  Jul. My ears have yet not drunk a hundred words    Of that tongue's utterance, yet I know the sound.    Art thou not Romeo, and a Montague?  Rom. Neither, fair saint, if either thee dislike.  Jul. How cam'st thou hither, tell me, and wherefore?    The orchard walls are high and hard to climb,    And the place death, consid

In [34]:
def count_words(text):
    """
    Count the number of times each word occurs in text (str).
    Return dictionary where keys are words an values are word counts.
    Skip punctuation
    """
    text = text.lower()
    skips = [".", ",", ";", ":", "'", '"']
    for ch in skips:
        text = text.replace(ch, "")
    word_counts = {}
    for word in text.split(" "):
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1
    return word_counts

In [35]:
count_words(sample_text)

{'whats': 1,
 'in': 2,
 'a': 5,
 'name?': 1,
 'that': 6,
 'which': 3,
 'we': 1,
 'call': 2,
 'rose': 1,
 '': 57,
 'by': 2,
 'any': 1,
 'other': 1,
 'name': 5,
 'would': 3,
 'smell': 1,
 'as': 1,
 'sweet': 1,
 'so': 2,
 'romeo': 5,
 'were': 1,
 'he': 2,
 'not': 4,
 'calld': 1,
 'retain': 1,
 'dear': 2,
 'perfection': 1,
 'owes': 1,
 'without': 1,
 'title': 1,
 'doff': 1,
 'thy': 2,
 'and': 6,
 'for': 1,
 'is': 3,
 'no': 1,
 'part': 1,
 'of': 2,
 'thee': 5,
 'take': 2,
 'all': 1,
 'myself': 2,
 'rom': 3,
 'i': 7,
 'at': 1,
 'word': 2,
 'me': 2,
 'but': 1,
 'love': 1,
 'ill': 1,
 'be': 2,
 'new': 1,
 'baptizd': 1,
 'henceforth': 1,
 'never': 1,
 'will': 1,
 'jul': 3,
 'what': 1,
 'man': 1,
 'art': 2,
 'thou': 3,
 'thus': 1,
 'bescreend': 1,
 'night': 1,
 'stumblest': 1,
 'on': 1,
 'my': 3,
 'counsel?': 1,
 'know': 2,
 'how': 2,
 'to': 4,
 'tell': 2,
 'who': 1,
 'am': 1,
 'saint': 2,
 'hateful': 1,
 'because': 1,
 'it': 2,
 'an': 1,
 'enemy': 1,
 'had': 1,
 'written': 1,
 'tear': 1,
 'the'

In [36]:
from collections import Counter

In [37]:
def count_words_fast(text):
    """
    Count the number of times each word occurs in text (str).
    Return dictionary where keys are words an values are word counts.
    Skip punctuation
    """
    text = text.lower()
    skips = [".", ",", ";", ":", "'", '"']
    for ch in skips:
        text = text.replace(ch, "")
    word_counts = Counter(text. split(" "))
    return word_counts

In [38]:
count_words_fast(sample_text)

Counter({'whats': 1,
         'in': 2,
         'a': 5,
         'name?': 1,
         'that': 6,
         'which': 3,
         'we': 1,
         'call': 2,
         'rose': 1,
         '': 57,
         'by': 2,
         'any': 1,
         'other': 1,
         'name': 5,
         'would': 3,
         'smell': 1,
         'as': 1,
         'sweet': 1,
         'so': 2,
         'romeo': 5,
         'were': 1,
         'he': 2,
         'not': 4,
         'calld': 1,
         'retain': 1,
         'dear': 2,
         'perfection': 1,
         'owes': 1,
         'without': 1,
         'title': 1,
         'doff': 1,
         'thy': 2,
         'and': 6,
         'for': 1,
         'is': 3,
         'no': 1,
         'part': 1,
         'of': 2,
         'thee': 5,
         'take': 2,
         'all': 1,
         'myself': 2,
         'rom': 3,
         'i': 7,
         'at': 1,
         'word': 2,
         'me': 2,
         'but': 1,
         'love': 1,
         'ill': 1,
         'be': 2,

In [39]:
count_words(text) == count_words_fast(text)

True

In [40]:
def word_stats(word_counts):
    """Return number of unique words and words frequencies."""
    num_unique = len(word_counts)
    counts = word_counts.values()
    return (num_unique, counts)

In [41]:
word_counts = count_words_fast(text)

In [42]:
(num_unique, counts) = word_stats(word_counts)

In [43]:
num_unique

5118

In [44]:
sum(counts)

40776

In [47]:
text = read_book("./Books/German/shakespeare/Romeo und Julia.txt")
word_counts = count_words_fast(text)
(num_unique, counts) = word_stats(word_counts)

In [48]:
num_unique

7527

In [49]:
sum(counts)

20311

In [51]:
import os

In [52]:
book_dir = "./Books"

In [77]:
import pandas as pd
stats = pd.DataFrame(columns = ("language", "author", "title", "length", "unique"))

In [78]:
title_num = 1
for language in os.listdir(book_dir):
    for author in os.listdir(book_dir + "/" + language):
        for title in os.listdir(book_dir + "/" + language + "/" + author):
            inputfile = book_dir + "/" + language + "/" + author + "/" + title
            print(inputfile)
            text = read_book(inputfile)
            (num_unique, counts) = word_stats(count_words_fast(text))
            stats.loc[title_num] = language, author.capitalize(), title.replace(".txt", ""), sum(counts), num_unique
            title_num += 1

./Books/English/shakespeare/A Midsummer Night's Dream.txt
./Books/English/shakespeare/Hamlet.txt
./Books/English/shakespeare/Macbeth.txt
./Books/English/shakespeare/Othello.txt
./Books/English/shakespeare/Richard III.txt
./Books/English/shakespeare/Romeo and Juliet.txt
./Books/English/shakespeare/The Merchant of Venice.txt
./Books/French/chevalier/L'a╠èle de sable.txt
./Books/French/chevalier/L'enfer et le paradis de l'autre monde.txt
./Books/French/chevalier/La capitaine.txt
./Books/French/chevalier/La fille des indiens rouges.txt
./Books/French/chevalier/La fille du pirate.txt
./Books/French/chevalier/Le chasseur noir.txt
./Books/French/chevalier/Les derniers Iroquois.txt
./Books/French/de Maupassant/Boule de Suif.txt
./Books/French/de Maupassant/Claire de Lune.txt
./Books/French/de Maupassant/Contes de la Becasse.txt
./Books/French/de Maupassant/L'inutile beautC╠º.txt
./Books/French/de Maupassant/La Main Gauche.txt
./Books/French/de Maupassant/La Maison Tellier.txt
./Books/French/de

In [79]:
stats

Unnamed: 0,language,author,title,length,unique
1,English,Shakespeare,A Midsummer Night's Dream,16103,4345
2,English,Shakespeare,Hamlet,28551,6776
3,English,Shakespeare,Macbeth,16874,4780
4,English,Shakespeare,Othello,26590,5898
5,English,Shakespeare,Richard III,48315,5449
...,...,...,...,...,...
98,Portuguese,Queir┬ós,O crime do padre Amaro,128630,29300
99,Portuguese,Queir┬ós,O Mandarim,21440,7836
100,Portuguese,Queir┬ós,O Primo Bazilio,107303,27644
101,Portuguese,Queir┬ós,Os Maias,195771,40665


In [70]:
table = pd.DataFrame(columns = ("name", "age"))

In [59]:
table.loc[1] = "James", 22

In [60]:
table.loc[2] = "Jess", 32

In [61]:
table

Unnamed: 0,name,age
1,James,22
2,Jess,32


In [62]:
table.columns

Index(['name', 'age'], dtype='object')