In [None]:
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

from urllib.request import urlopen 
import re
def read_url(url):
    """Download ``url`` and return its contents as a single-spaced string.

    The response body is decoded with ``bytes.decode()``'s default codec
    (UTF-8), and every run of whitespace (spaces, tabs, newlines) is
    collapsed into one space.

    Parameters
    ----------
    url : str
        Any URL that ``urllib.request.urlopen`` can open.

    Returns
    -------
    str
        The decoded text with whitespace runs replaced by a single space.

    Raises
    ------
    urllib.error.URLError
        If the resource cannot be fetched.
    UnicodeDecodeError
        If the body is not valid UTF-8.
    """
    # The context manager closes the connection even if read/decode fails;
    # the original left the response open until garbage collection.
    with urlopen(url) as response:
        text = response.read().decode()
    return re.sub(r'\s+', ' ', text)

In [None]:
# You can use Python and the Jupyter environment to do simple arithmetic.
# The value of the last expression in a cell is displayed as the cell's output.
2 + 3

In [None]:
# One of the powers of computing in data science, though, lies not in the ability to do simple arithmetic,
# but the ability of computers to handle large amounts of data. So, let's do that.
# Here, we're reading two full books into our computing environment:

huck_finn_url = 'https://www.inferentialthinking.com/data/huck_finn.txt'
huck_finn_text = read_url(huck_finn_url)
# Split the book at every 'CHAPTER ' marker and discard the first 44 pieces, so that
# huck_finn_chapters is a list with one string per remaining piece (not the entire text).
# NOTE(review): the skipped pieces are presumably the front matter and the
# table-of-contents entries - confirm against the source text.
huck_finn_chapters = huck_finn_text.split('CHAPTER ')[44:]

little_women_url = 'https://www.inferentialthinking.com/data/little_women.txt'
little_women_text = read_url(little_women_url)
# Same idea: [1:] drops only the piece that comes before the first 'CHAPTER ' marker.
little_women_chapters = little_women_text.split('CHAPTER ')[1:]

In [None]:
# In a Jupyter Notebook, we can see the contents of a variable simply by typing its name
# as the last line of a cell and running the cell.
huck_finn_chapters

In [None]:
# It's all there, but it's a little overwhelming to look at.
# This code organizes the text into a table with one column, "Chapters",
# holding one entry of huck_finn_chapters per row.
Table().with_column('Chapters', huck_finn_chapters)

In [None]:
# Now, what interesting thing can we do with all of this text?
# Well, Python will allow us to count all occurrences of certain names in the text.
# Apparently "Tom Sawyer" is an important character in the book, so let's count how many times
# the name "Tom" appears in each chapter. Note that this counts every occurrence of the
# text 'Tom', not only the full name "Tom Sawyer". The result has one count per chapter.
np.char.count(huck_finn_chapters, 'Tom')

In [None]:
# The same per-chapter count, this time for the text 'Jim'.
np.char.count(huck_finn_chapters, 'Jim')

In [None]:
# A table is easier to read than separate arrays: one column per name,
# one row per chapter, each cell holding how often that name occurs.
counts = (
    Table()
    .with_column('Tom', np.char.count(huck_finn_chapters, 'Tom'))
    .with_column('Jim', np.char.count(huck_finn_chapters, 'Jim'))
    .with_column('Huck', np.char.count(huck_finn_chapters, 'Huck'))
)
counts

In [None]:
# But it's still just a list of numbers. We can do one better and plot the cumulative counts on a graph:
# how many times in Chapter 1, how many times in Chapters 1 and 2, and so on.

# Running total of each name's per-chapter counts, in the same order as counts.labels.
cumulative_columns = [np.cumsum(counts.column(col_name)) for col_name in counts.labels]

# Chapter numbers 1, 2, ..., one per row of `counts`. Deriving the length from the
# table (instead of hard-coding it) keeps this column in step with the data.
chapter_numbers = np.arange(1, counts.num_rows + 1)

# Build the new table: one cumulative column per name, then the 'Chapter' column.
cum_counts = Table()
for label, cumulative in zip(counts.labels, cumulative_columns):
    cum_counts = cum_counts.with_column(label, cumulative)
cum_counts = cum_counts.with_column('Chapter', chapter_numbers)

print("\nCumulative Counts Table (first 5 rows):")
cum_counts.show(5)

# Use the 'Chapter' column for the x-axis; every other column becomes a line.
cum_counts.plot(column_for_xticks='Chapter')
plots.title('Cumulative Number of Times Name Appears')
plots.show()

In [None]:
# The chapters of Little Women, shown as a one-column table
# with one entry of little_women_chapters per row.

Table().with_column('Chapters', little_women_chapters)

In [None]:
# Per-chapter counts of each main character's name in Little Women.
# np.char.count counts occurrences of the text itself, so a short name
# such as 'Jo' is also counted when it appears inside a longer word.

names = ['Amy', 'Beth', 'Jo', 'Laurie', 'Meg']
mentions = {
    character: np.char.count(little_women_chapters, character)
    for character in names
}

# One column per name, in the order given by `names`. The list handed to
# with_columns alternates label, values, label, values, ...
# Note: this rebinds `counts`, which previously held the Huck Finn table.
counts = Table().with_columns([
    entry
    for character in names
    for entry in (character, mentions[character])
])

In [None]:
# Plot the cumulative counts

# Running total of each name's per-chapter counts, in the same order as counts.labels.
cumulative_columns = [np.cumsum(counts.column(col_name)) for col_name in counts.labels]

# Chapter numbers 1, 2, ..., one per row of `counts`. Deriving the length from the
# table (instead of hard-coding it) keeps this column in step with the data.
chapter_numbers = np.arange(1, counts.num_rows + 1)

# Build the new table: one cumulative column per name, then the 'Chapter' column.
cum_counts = Table()
for label, cumulative in zip(counts.labels, cumulative_columns):
    cum_counts = cum_counts.with_column(label, cumulative)
cum_counts = cum_counts.with_column('Chapter', chapter_numbers)

print("\nCumulative Counts Table (first 5 rows):")
cum_counts.show(5)

# Use the 'Chapter' column for the x-axis; every other column becomes a line.
cum_counts.plot(column_for_xticks='Chapter')
plots.title('Cumulative Number of Times Name Appears')
plots.show()

In [None]:
# Here's another cool thing you can do in Python:
# Count the characters - NOT characters like Amy, Beth, and Jo, but characters like 'D', 'A', 'S', and 'C' -
# of a string. Wrapping a string in len(...) gives you the number of characters in that string;
# letters, digits, spaces, and punctuation all count:
len('DASC 130')

In [None]:
# The same function works on a whole book: the number of characters in huck_finn_text.
len(huck_finn_text)

In [None]:
# For every chapter, record two numbers:
#   'Length'  - how many characters the chapter contains, and
#   'Periods' - how many '.' characters it contains.

length_hf = (
    Table()
    .with_column('Length', [len(chapter) for chapter in huck_finn_chapters])
    .with_column('Periods', np.char.count(huck_finn_chapters, '.'))
)
length_lw = (
    Table()
    .with_column('Length', [len(chapter) for chapter in little_women_chapters])
    .with_column('Periods', np.char.count(little_women_chapters, '.'))
)

In [None]:
# The per-chapter character and period counts for Huckleberry Finn

length_hf

In [None]:
# The per-chapter character and period counts for Little Women

length_lw

In [None]:
# Take a look at the ratio of the number of characters in the chapter to the number of periods in
# the chapter. Since a period usually marks the end of a sentence, this gives us a rough estimate of
# the average length of a sentence in that chapter!

# Look at the x = 100 mark. The largest number of characters for chapters with 100 periods
# is around 14,000. The smallest is around 11,000. That's around 110-140 characters per sentence.
# Does "110-140 characters" remind you of anything?

# One dot per chapter: periods on the x-axis, characters on the y-axis.
# Columns are picked by label, and each series is labelled so the legend
# tells the reader which colour belongs to which book.
plots.figure(figsize=(10, 10))
plots.scatter(length_hf.column('Periods'), length_hf.column('Length'),
              color='darkblue', label='Huckleberry Finn')
plots.scatter(length_lw.column('Periods'), length_lw.column('Length'),
              color='gold', label='Little Women')
plots.xlabel('Number of periods in chapter')
plots.ylabel('Number of characters in chapter')
plots.legend()
plots.show()