In [None]:
# Stuff that will appear at the top of notebooks;
# You don't have to understand how this works or change it for now.

from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

from urllib.request import urlopen 
import re
def read_url(url): 
    """Fetch ``url`` and return its text with every whitespace run collapsed.

    The response body is decoded with ``bytes.decode()`` (UTF-8 by default),
    then each run of whitespace (spaces, tabs, newlines) is replaced by a
    single space, so the whole document comes back as one long line.

    Args:
        url: Address passed straight to ``urllib.request.urlopen``.

    Returns:
        The decoded document as a single ``str``.
    """
    # Use the response as a context manager so the underlying connection is
    # closed as soon as the body has been read, instead of being left open.
    with urlopen(url) as response:
        raw_text = response.read().decode()
    return re.sub('\\s+', ' ', raw_text)

# Demo 1: SBCC Basic Needs - Nutritional Health and Food Security
[SBCC Basic Needs Website](https://www.sbcc.edu/equity/basic-needs-programs/)
Two of the programs are the *Food Pantry* and *CalFresh*.
*CalFresh* is a nutrition assistance program that helps low-income individuals and families buy the food that they need.

[<img src="calfresh-logo.png" width="150" height="70"/>](https://www.sbcc.edu/equity/basic-needs-programs/calfresh-information.php)


The data set used below came from: https://www.cdss.ca.gov/inforesources/data-portal/research-and-data/calfresh-data-dashboard

In [None]:
# Load the annual CalFresh data from the CSV file and display it.
# read_table is called on the Table class itself, so no throwaway empty
# table has to be created first.
CalFresh_recipients = Table.read_table('CalFreshAnnual.csv')
CalFresh_recipients

In [None]:
# Try plotting/visualizing just the state data:
# 'Year of Calendar Year' on the x-axis, 'Statewide' on the y-axis.
# The following cell re-parses the columns and repeats this same plot,
# so this first attempt is expected to show a formatting problem.
CalFresh_recipients.plot('Year of Calendar Year', 'Statewide')

In [None]:
# This code will fix the formatting error above :)
#
# Every column except the first is passed, value by value, through
# NumberFormatter.convert_value and stored back under its original label
# (presumably turning text such as "1,234" into numbers -- confirm against the CSV).

formatter = NumberFormatter()
for label in CalFresh_recipients.labels[1:]:
    CalFresh_recipients = CalFresh_recipients.with_column(
        label, CalFresh_recipients.apply(formatter.convert_value, label))

# Let's try again
CalFresh_recipients.plot('Year of Calendar Year', 'Statewide')

#TODO: Let's compare the trend for Santa Barbara (SB) with the statewide trend:
# copy the line of code above and replace 'Statewide' with 'Santa Barbara'


# Demo 2: let's look at a classic novel
This is from the example in Section 1.3 of the book.

In [None]:
# Read two books, fast!

# Where the two plain-text novels live
huck_finn_url = 'https://www.inferentialthinking.com/data/huck_finn.txt'
little_women_url = 'https://www.inferentialthinking.com/data/little_women.txt'

# Download each book and keep it as one long string
huck_finn_text = read_url(huck_finn_url)
little_women_text = read_url(little_women_url)

# Cut each book wherever the text 'CHAPTER ' occurs; every piece becomes one
# string in a list. The leading pieces are dropped (the first 44 for Huck Finn,
# the first one for Little Women) -- presumably front matter and
# table-of-contents entries; confirm against the raw text.
huck_finn_chapters = huck_finn_text.split('CHAPTER ')[44:]
little_women_chapters = little_women_text.split('CHAPTER ')[1:]

In [None]:
# Display the list of chapter strings built in the previous cell
huck_finn_chapters

### What can we learn about the book: 
### The Adventures of Huckleberry Finn by Mark Twain


In [None]:
# Let's create a table with a single column, 'Chapters', whose values are
# the chapter strings from Huck Finn (one chapter per row)
huck_finn = Table().with_column('Chapters', huck_finn_chapters)
huck_finn


In [None]:
# Let's count how many times Tom is mentioned:
# np.char.count returns one count per chapter string
np.char.count(huck_finn_chapters, 'Tom')

In [None]:
# TODO: Let's count how many times Jim and Huck are mentioned
...

In [None]:
# TODO: replace each ... with a column label and its per-chapter counts, so that
# the table shows how many times the names Jim, Tom, and Huck appear in each chapter.
# The 'Tom' column is already filled in as a pattern to follow.
counts = Table().with_columns('Tom',np.char.count(huck_finn_chapters, 'Tom'),
                             ...,
                             ...)
counts


In [None]:
# Now just run this cell:
# Plot the cumulative counts:
# how many times in Chapter 1, how many times in Chapters 1 and 2, and so on.

# Number the chapters 1..N from the table itself rather than hard-coding the
# chapter count, and choose the x-axis column by label rather than by position,
# so the cell keeps working if the number of rows or name columns changes.
chapter_numbers = np.arange(1, counts.num_rows + 1)
cum_counts = counts.cumsum().with_column('Chapter', chapter_numbers)
cum_counts.plot(column_for_xticks='Chapter')
plots.title('Cumulative Number of Times Name Appears');

### What can we guess about the characters from this visualization?

# Any other questions that come up?
#### How could you guess who the author is? What kind of patterns would you look for?

In [None]:
# For every chapter, record two measurements:
#   * its "length": the total number of characters in the chapter
#   * the number of periods it contains

chapter_lengths_hf = list(map(len, huck_finn_chapters))
period_counts_hf = np.char.count(huck_finn_chapters, '.')
chars_periods_hf = Table().with_columns(
    'HF Chapter Length', chapter_lengths_hf,
    'Number of Periods', period_counts_hf,
)


In [None]:
# The counts for Huckleberry Finn (only the first 3 rows are shown)

chars_periods_hf.show(3)

In [None]:
# What can we observe?

In [None]:
# Scatter plot with one dot per chapter: periods on the x-axis,
# characters on the y-axis. Columns are picked by label.
periods_per_chapter = chars_periods_hf.column('Number of Periods')
chars_per_chapter = chars_periods_hf.column('HF Chapter Length')

plots.figure(figsize=(10, 10))
plots.scatter(periods_per_chapter, chars_per_chapter, color='darkblue')
plots.xlabel('Number of periods in chapter')
plots.ylabel('Number of characters in chapter');

# On average, what appears to be the sentence length?

Can we estimate the length of a sentence based on the number of characters and the number of periods?

In [None]:
# Estimated sentence length for each chapter: characters in the chapter
# divided by the number of periods in it.
chapter_lengths = chars_periods_hf.column('HF Chapter Length')
period_counts = chars_periods_hf.column('Number of Periods')
sentenceLength_hf = chapter_lengths / period_counts
sentenceLength_hf


In [None]:
# Histogram of the per-chapter sentence-length estimates for Huck Finn
Table().with_columns('Huck', sentenceLength_hf).hist()

Interestingly, on Twitter/X the maximum length of a tweet was 140 characters until 2017, and only recently became longer. Today it is 280. Check out this interesting study on how character limits affected language usage: https://www.nature.com/articles/s41599-019-0280-3

# Demo 3: Air Quality Data


The table `California_airquality.csv` contains data on the air quality index (AQI) in California from 1/1/2020 through 9/10/2020, as measured by PM2.5, the main pollutant from fire (the data is from 2 sources: AQS and AirNow). I found the dataset on Kaggle; the direct link is here: https://www.kaggle.com/thaddeussegura/california-air-quality-2020-through-sept10th. Kaggle is an open source/free platform where anyone interested in data science can share code and data sets (https://www.kaggle.com/datasets).

In [None]:
# Load the California air-quality readings and display the table
CalAQI = Table.read_table('California_airquality.csv')
CalAQI
#TODO: narrow the table down to just the columns we want to look at.
# Use select to keep: Date, Site Name, DAILY_AQI_VALUE
# onlyAQI = ...
# onlyAQI

### When was the air quality bad in Santa Barbara during that time? 
### What AQI value means bad air quality?
See https://www.airnow.gov/aqi/aqi-basics/


Finding the rows where a column is equal to a value of interest:
for example, let's see if we can find the rows where Site Name is Santa Barbara (SB)

In [None]:
# Look for when Santa Barbara had high AQI during this time:
# keep the rows whose 'Site Name' is 'Santa Barbara' and whose 'DAILY_AQI_VALUE' is above 50.
# NOTE: onlyAQI only exists once the TODO in the cell that loads CalAQI has been completed.
onlyAQI.where('Site Name', 'Santa Barbara').where('DAILY_AQI_VALUE', are.above(50))
# TODO: change the line above to look at values where the air quality is unhealthy


# Demo 4: Classification Example in Medicine
### Can we predict kidney disease based on a set of attributes?

In [None]:
# Load the kidney-disease data, shorten the 'Blood Glucose Random' column
# label to 'Glucose', and preview the first 3 rows
ckd = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
ckd.show(3)

In [None]:
# Count how many rows there are for each value of 'Class'
ckd.group('Class')

In [None]:
# Lookup table pairing each 'Class' value with a colour name.
color_table = Table().with_columns(
    'Class', make_array(1, 0),
    'Color', make_array('darkblue', 'gold')
)
# Attach the 'Color' column to ckd by joining on 'Class'.
# The guard keeps this cell safe to re-run: once ckd already has a 'Color'
# column, the join is skipped instead of being applied a second time.
if 'Color' not in ckd.labels:
    ckd = ckd.join('Class', color_table)

In [None]:
# Two scatter plots with 'Glucose' on the vertical axis, one against
# 'White Blood Cell Count' and one against 'Hemoglobin'; points are grouped
# by the 'Color' column added in the previous cell
ckd.scatter('White Blood Cell Count', 'Glucose', group='Color')
ckd.scatter('Hemoglobin', 'Glucose', group='Color')
