# Data Analysis of the Book Pride and Prejudice

## Data Collection

In [1]:
try:
    with open('pride_prejudice.txt') as ppbook:
        lines = ppbook.readlines()
        print(len(lines))
except FileNotFoundError:
    print('File not found.')
            

14061


In [2]:
lines[0:15]

['      Chapter 1\n',
 '\n',
 '      It is a truth universally acknowledged, that a single man in\n',
 '      possession of a good fortune, must be in want of a wife.\n',
 '\n',
 '      However little known the feelings or views of such a man may be\n',
 '      on his first entering a neighbourhood, this truth is so well\n',
 '      fixed in the minds of the surrounding families, that he is\n',
 '      considered the rightful property of some one or other of their\n',
 '      daughters.\n',
 '\n',
 '      “My dear Mr. Bennet,” said his lady to him one day, “have you\n',
 '      heard that Netherfield Park is let at last?”\n',
 '\n',
 '      Mr. Bennet replied that he had not.\n']

## Data Munging

In [3]:
# remove punctuation and special characters
import re

### Data Cleaning

In [4]:
def clean_line(line: str) -> str:
    # remove trailing and leading spaces
    # remove punctuation
    line = re.sub(r'[\n,;`:."“”‘’„”«»><{}\[\]|+=_()*&%$#@!~\'?]', '', line.strip())
    return line.replace('—', ' ')

In [5]:
clean_lines = list(map(clean_line, lines))

In [6]:
clean_lines[0:15]

['Chapter 1',
 '',
 'It is a truth universally acknowledged that a single man in',
 'possession of a good fortune must be in want of a wife',
 '',
 'However little known the feelings or views of such a man may be',
 'on his first entering a neighbourhood this truth is so well',
 'fixed in the minds of the surrounding families that he is',
 'considered the rightful property of some one or other of their',
 'daughters',
 '',
 'My dear Mr Bennet said his lady to him one day have you',
 'heard that Netherfield Park is let at last',
 '',
 'Mr Bennet replied that he had not']

In [7]:
# remove empty lines
clean_lines0 = [line for line in clean_lines if not line == '']

In [8]:
# time using list comprehension
%timeit [line for line in clean_lines if not line == '']

540 µs ± 1.05 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [9]:
clean_lines1 = list(filter(lambda x: x != '', clean_lines))

In [10]:
# time using filter
%timeit list(filter(lambda x: x != '', clean_lines))

1.05 ms ± 1.16 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [11]:
clean_lines2 = [line for line in clean_lines if not re.fullmatch(r'^\s*$', line)]

In [12]:
# time using regex and list comprehension
%timeit [line for line in clean_lines if not re.fullmatch(r'^\s*$', line)]

9.18 ms ± 41.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
clean_lines0[0:10]

['Chapter 1',
 'It is a truth universally acknowledged that a single man in',
 'possession of a good fortune must be in want of a wife',
 'However little known the feelings or views of such a man may be',
 'on his first entering a neighbourhood this truth is so well',
 'fixed in the minds of the surrounding families that he is',
 'considered the rightful property of some one or other of their',
 'daughters',
 'My dear Mr Bennet said his lady to him one day have you',
 'heard that Netherfield Park is let at last']

In [14]:
clean_lines1[0:10]

['Chapter 1',
 'It is a truth universally acknowledged that a single man in',
 'possession of a good fortune must be in want of a wife',
 'However little known the feelings or views of such a man may be',
 'on his first entering a neighbourhood this truth is so well',
 'fixed in the minds of the surrounding families that he is',
 'considered the rightful property of some one or other of their',
 'daughters',
 'My dear Mr Bennet said his lady to him one day have you',
 'heard that Netherfield Park is let at last']

In [15]:
clean_lines2[0:10]

['Chapter 1',
 'It is a truth universally acknowledged that a single man in',
 'possession of a good fortune must be in want of a wife',
 'However little known the feelings or views of such a man may be',
 'on his first entering a neighbourhood this truth is so well',
 'fixed in the minds of the surrounding families that he is',
 'considered the rightful property of some one or other of their',
 'daughters',
 'My dear Mr Bennet said his lady to him one day have you',
 'heard that Netherfield Park is let at last']

In [16]:
clean_lines0 == clean_lines2 and clean_lines1 == clean_lines0

True

### Data Manipulation

In [17]:
# Tokenize each sentence into words
words_list = list(map(lambda x: x.split(), clean_lines0))
# make every word the same case (upper or lower)
def list_to_lower(seq: list): return list(map(lambda x: x.lower(), seq))
words_list = list(map(list_to_lower , words_list))

In [18]:
# flattening the words_list
# concatenate all lists of words
wlist_flat = [word for words in words_list for word in words]

In [19]:
wlist_flat[0:15]

['chapter',
 '1',
 'it',
 'is',
 'a',
 'truth',
 'universally',
 'acknowledged',
 'that',
 'a',
 'single',
 'man',
 'in',
 'possession',
 'of']

In [20]:
# time flattening method 1
%timeit [word for words in words_list for word in words]

3.78 ms ± 15.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [21]:
# time flattening method 2
flattened_list = []

In [22]:
%%timeit 
for x in words_list:
    for y in x:
        flattened_list.append(y)

7.26 ms ± 13.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [23]:
flattened_list[0:15]

['chapter',
 '1',
 'it',
 'is',
 'a',
 'truth',
 'universally',
 'acknowledged',
 'that',
 'a',
 'single',
 'man',
 'in',
 'possession',
 'of']

## Data Statistics

#### Word Frequency

In [24]:
# create a set to get unique words
uniq_w = set(wlist_flat)

In [25]:
# create a dictionary with unique word: [occurence]
dic_occ = dict.fromkeys(uniq_w, 0)

In [26]:
%%timeit
# count occurrence of words
for word in wlist_flat:
    if word in dic_occ:
        dic_occ[word] += 1

21.1 ms ± 86.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [27]:
from collections import Counter

In [28]:
dic_occ1 = Counter(wlist_flat)

In [30]:
%timeit Counter(wlist_flat)

9.58 ms ± 32 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### Average Sencence Length

In [31]:
# get average sentence length
# map list to a list of its sublists lengths
# sum the mapped list
sentence_len = sum(list(map(lambda x: len(x), clean_lines0)))
avg_sent_len = sentence_len // len(clean_lines0)

In [32]:
print(f'Average Sentence Length: {avg_sent_len} characters')

Average Sentence Length: 54 characters


#### Total Character Count

In [33]:
total_chars = sum(map(lambda x: len(x), wlist_flat))
print(f'The total number of characters is: {total_chars:,}')

The total number of characters is: 536,835


#### Total Word Count

In [34]:
print(f'The total number of words is: {len(wlist_flat):,}')

The total number of words is: 121,870


#### Total Unique Words

In [35]:
print(f'The total number of unique words is: {len(uniq_w):,}')

The total number of unique words is: 6,488


#### Top 10 Longest Words

In [36]:
sorted_l = sorted(list(uniq_w), key=len, reverse=True)
# remove hyphenated words
sorted_hyphens = list(filter(lambda x: re.match(r'^(.*)-(.*)$', x), sorted_l))

sorted_no_hyphens = list(filter(lambda x: not re.match(r'^(.*)-(.*)', x), sorted_l))

In [37]:
print(f"{'Top 10 Longest Words':^24}\n")
print(f"{'Word':<20}{'Size':>4}\n")

for word in sorted_no_hyphens[0:10]:
    print(f'{word:<20}{len(word):>4}')

  Top 10 Longest Words  

Word                Size

communicativeness     17
disinterestedness     17
misrepresentation     17
superciliousness      16
discontentedness      16
incomprehensible      16
superintendence       15
acknowledgments       15
thoughtlessness       15
inconsistencies       15
