# Data Analysis of the Book Pride and Prejudice

## Data Collection

In [22]:
try:
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding: the text contains curly quotes (“ ” ‘ ’) that would be read as
    # mojibake under a different locale default, and the punctuation filter
    # would then fail to remove them.
    # NOTE(review): assumes pride_prejudice.txt is UTF-8 encoded - confirm.
    with open('pride_prejudice.txt', encoding='utf-8') as ppbook:
        # one list element per physical line, trailing '\n' included
        lines = ppbook.readlines()
except FileNotFoundError:
    # `lines` is not defined in this case, so the cells below cannot run
    print('File not found.')
else:
    # number of raw lines read from the file
    print(len(lines))
            

14416


In [119]:
# preview the first 15 raw lines exactly as read from the file
lines[:15]

['      Chapter 1\n',
 '\n',
 '      It is a truth universally acknowledged, that a single man in\n',
 '      possession of a good fortune, must be in want of a wife.\n',
 '\n',
 '      However little known the feelings or views of such a man may be\n',
 '      on his first entering a neighbourhood, this truth is so well\n',
 '      fixed in the minds of the surrounding families, that he is\n',
 '      considered the rightful property of some one or other of their\n',
 '      daughters.\n',
 '\n',
 '      “My dear Mr. Bennet,” said his lady to him one day, “have you\n',
 '      heard that Netherfield Park is let at last?”\n',
 '\n',
 '      Mr. Bennet replied that he had not.\n']

## Data Munging

In [24]:
# regular expressions are used in the cleaning step below to remove punctuation and special characters
import re

### Data Cleaning

In [161]:
def clean_line(line: str) -> str:
    """Return ``line`` without punctuation and without outer whitespace.

    Every character listed in the character class below (newline, ASCII
    punctuation and symbols, straight and curly quotes, guillemets) is
    deleted. Hyphens, dashes and digits are not in the class and are kept.

    Args:
        line: one raw line of text, possibly ending in a newline.

    Returns:
        The cleaned line; an empty string when nothing but whitespace and
        punctuation was present.
    """
    # Strip AFTER deleting punctuation: stripping first leaves behind the
    # spaces that sat between a removed quote/symbol and the text (e.g.
    # '“ word' -> ' word'), and turns separator lines such as '* * *' into
    # whitespace-only strings that an `== ''` filter would not drop.
    return re.sub(r'[\n,;`:."“”‘’„”«»><{}\[\]|+=_()*&%$#@!~\'?]', '', line).strip()

In [162]:
# clean every raw line; blank lines become empty strings and are kept for now
clean_lines = [clean_line(raw_line) for raw_line in lines]

In [163]:
# preview the first 15 cleaned lines (empty strings mark the blank lines)
clean_lines[:15]

['Chapter 1',
 '',
 'It is a truth universally acknowledged that a single man in',
 'possession of a good fortune must be in want of a wife',
 '',
 'However little known the feelings or views of such a man may be',
 'on his first entering a neighbourhood this truth is so well',
 'fixed in the minds of the surrounding families that he is',
 'considered the rightful property of some one or other of their',
 'daughters',
 '',
 'My dear Mr Bennet said his lady to him one day have you',
 'heard that Netherfield Park is let at last',
 '',
 'Mr Bennet replied that he had not']

In [164]:
# drop the lines that are empty after cleaning
clean_lines0 = [text for text in clean_lines if text != '']

In [165]:
# time the list-comprehension approach to removing empty lines
# (the result is discarded; only the timing is reported)
%timeit [line for line in clean_lines if not line == '']

559 µs ± 9.49 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [166]:
# same empty-line removal, this time expressed with filter() and a predicate
clean_lines1 = list(filter(lambda text: text != '', clean_lines))

In [167]:
# time the filter() + lambda approach to removing empty lines
# (the result is discarded; only the timing is reported)
%timeit list(filter(lambda x: x != '', clean_lines))

1.1 ms ± 11.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [173]:
# regex variant: keep a line only when it is NOT made up entirely of
# whitespace (this also rejects the empty string)
clean_lines2 = [text for text in clean_lines if re.fullmatch(r'^\s*$', text) is None]

In [174]:
# time the regex + list-comprehension approach to removing blank lines
# (the result is discarded; only the timing is reported)
%timeit [line for line in clean_lines if not re.fullmatch(r'^\s*$', line)]

9.78 ms ± 140 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [169]:
# first 10 lines kept by the list-comprehension filter
clean_lines0[:10]

['Chapter 1',
 'It is a truth universally acknowledged that a single man in',
 'possession of a good fortune must be in want of a wife',
 'However little known the feelings or views of such a man may be',
 'on his first entering a neighbourhood this truth is so well',
 'fixed in the minds of the surrounding families that he is',
 'considered the rightful property of some one or other of their',
 'daughters',
 'My dear Mr Bennet said his lady to him one day have you',
 'heard that Netherfield Park is let at last']

In [170]:
# first 10 lines kept by the filter() variant
clean_lines1[:10]

['Chapter 1',
 'It is a truth universally acknowledged that a single man in',
 'possession of a good fortune must be in want of a wife',
 'However little known the feelings or views of such a man may be',
 'on his first entering a neighbourhood this truth is so well',
 'fixed in the minds of the surrounding families that he is',
 'considered the rightful property of some one or other of their',
 'daughters',
 'My dear Mr Bennet said his lady to him one day have you',
 'heard that Netherfield Park is let at last']

In [178]:
# first 10 lines kept by the regex variant
clean_lines2[:10]

['Chapter 1',
 'It is a truth universally acknowledged that a single man in',
 'possession of a good fortune must be in want of a wife',
 'However little known the feelings or views of such a man may be',
 'on his first entering a neighbourhood this truth is so well',
 'fixed in the minds of the surrounding families that he is',
 'considered the rightful property of some one or other of their',
 'daughters',
 'My dear Mr Bennet said his lady to him one day have you',
 'heard that Netherfield Park is let at last']

In [176]:
# all three filtering approaches must yield the same list
clean_lines0 == clean_lines1 == clean_lines2

True

### Data Manipulation

In [221]:
# Tokenize each cleaned line into a list of words (split on whitespace)
words_list = [text.split() for text in clean_lines0]
# make every word the same case (lower) so counts are case-insensitive
def list_to_lower(seq: list) -> list:
    """Return a new list holding the lower-cased form of every item in ``seq``."""
    return [item.lower() for item in seq]
# lower-case the tokens of every line
words_list = [list_to_lower(tokens) for tokens in words_list]

In [223]:
# flatten words_list: join the per-line word lists into one list of words,
# preserving the order in which the words appear in the text
wlist_flat = [token for line_tokens in words_list for token in line_tokens]

In [232]:
# first 15 words of the flattened list
wlist_flat[:15]

['chapter',
 '1',
 'it',
 'is',
 'a',
 'truth',
 'universally',
 'acknowledged',
 'that',
 'a',
 'single',
 'man',
 'in',
 'possession',
 'of']

In [225]:
# time flattening method 1: a nested list comprehension
# (the result is discarded; only the timing is reported)
%timeit [word for words in words_list for word in words]

3.83 ms ± 161 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [231]:
# flattening method 2 (explicit loops): start from an empty result list
# that the timed loop appends to
flattened_list = list()

In [229]:
%%timeit 
# Flattening method 2: nested for-loops appending one word at a time.
# NOTE: flattened_list is created in a separate cell, so every timing
# iteration appends to the same list. After the benchmark it holds one copy
# of all words per executed loop, not a single flattened copy, and each
# iteration works on an ever larger list. Re-create the list before using it.
for x in words_list:
    for y in x:
        flattened_list.append(y)

7.61 ms ± 26.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [230]:
# first 15 words produced by the loop-based flattening
flattened_list[:15]

['chapter',
 '1',
 'it',
 'is',
 'a',
 'truth',
 'universally',
 'acknowledged',
 'that',
 'a',
 'single',
 'man',
 'in',
 'possession',
 'of']

### Data Statistics

In [236]:
# a set keeps a single copy of every distinct (lower-cased) word,
# so its size is the number of unique words
uniq_w = {*wlist_flat}
len(uniq_w)
# TODO: create a dictionary mapping each unique word to its occurrence count
# by going through the word list and updating the count for every word seen


7111

In [182]:
# get the average length, in characters
# NOTE: each element of clean_lines0 is one cleaned line of the text file,
# so the value computed here is the number of characters per line
# total characters over all non-empty cleaned lines
sentence_len = sum(len(text) for text in clean_lines0)
# floor division: the average is kept as a whole number of characters
avg_sent_len = sentence_len // len(clean_lines0)

In [185]:
# report the average computed in the previous cell, in characters
print(f'Average Sentence Length: {avg_sent_len} characters')

Average Sentence Length: 55 characters
