### Project Overview:


In [16]:
import os
import re
from matplotlib import pyplot as plt
from lexicalrichness import LexicalRichness
import pandas as pd

In [17]:
def load_book(filename: str) -> str:
    '''
    Takes a filename of a txt file from project gutenberg and strips the text from the file.
    '''
    lines = [l.rstrip() for l in open(filename, 'r', encoding='utf8')]
    
    exp = re.compile(r'^\*\*\* (START|END).*PROJECT GUTENBERG.*\*\*\*$')
    start, end = [lines.index(l) for l in lines if exp.match(l)]
    lines = lines[start+1:end]
    text =  ' '.join(lines)
    return text

### Collect The Book Paths and Create a Dictionary

In [18]:
books_dir = 'books'
engl_books = os.listdir(books_dir + '/English')
germ_books = os.listdir(books_dir + '/German')
fren_books = os.listdir(books_dir + '/French')

book_paths = {'English':[], 'French':[], 'German':[]}
print('The following books have been collected: ')
print('\t{:>3s} {:<30s} {:<10s} {:<10s} {:<10s}'.format('#', 'title', 'English', 'French', 'German'))
for i, book in enumerate(set(engl_books) | set(germ_books) | set(fren_books)):
    title = book.rstrip('.txt')
    if len(title) > 22: title = title[:23] + '...'
    in_engl, in_fren, in_germ = book in engl_books, book in fren_books, book in germ_books
    print('\t{:>3d} {:<30s} {:<10b} {:<10b} {:<10b}'.format(i+1, title, in_engl, in_fren, in_germ))
    if in_engl:
        book_paths['English'].append(books_dir + '/English/' + book)
    if in_fren:
        book_paths['French'].append(books_dir + '/French/' + book)
    if in_germ:
        book_paths['German'].append(books_dir + '/German/' + book)

The following books have been collected: 
	  # title                          English    French     German    
	  1 Oliver Twist Charles Di...     1          1          1         
	  2 Moby-Dick Herman Melvil...     1          0          0         
	  3 The Call of the Wild Ja...     1          0          0         
	  4 The Great Gatsby F Scot...     1          0          0         
	  5 Winnie-the-Pooh A A Mil...     1          0          0         
	  6 Alices Adventures in Wo...     1          1          1         
	  7 Treasure Island Robert ...     1          1          1         
	  8 Pride and Prejudice Jan...     1          0          0         
	  9 A Christmas Carol Charl...     1          1          1         
	 10 Dracula Bram Stoker            1          0          0         
	 11 Frankenstein Mary W She...     1          1          0         


In [23]:
col_names = ['path', 'language', 'words', 'terms']
df = pd.DataFrame(columns=col_names)
for k in book_paths.keys():
    for path in book_paths[k]:
        text = load_book(path)
        lex = LexicalRichness(text)
        df.loc[len(df.index)] = [path, k, lex.words, lex.terms]

    
print(df)

                                                 path language   words  terms
0      books/English/Oliver Twist Charles Dickens.txt  English  162295  11895
1         books/English/Moby-Dick Herman Melville.txt  English  214483  20118
2   books/English/The Call of the Wild Jack London...  English   32003   4962
3   books/English/The Great Gatsby F Scott Fitzger...  English   49513   6644
4         books/English/Winnie-the-Pooh A A Milne.txt  English   23451   1939
5   books/English/Alices Adventures in Wonderland ...  English   27587   3048
6   books/English/Treasure Island Robert Louis Ste...  English   69409   6878
7   books/English/Pride and Prejudice Jane Austen.txt  English  123394   6973
8   books/English/A Christmas Carol Charles Dicken...  English   28956   4384
9               books/English/Dracula Bram Stoker.txt  English  161794  10169
10      books/English/Frankenstein Mary W Shelley.txt  English   78111   7406
11      books/French/Oliver Twist Charles Dickens.txt   French  