In [1]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Histograms

In [None]:
# Load the Netflix dataset from the course site
netflix_url = 'https://raw.githubusercontent.com/eysalee/webpage/refs/heads/main/courses/f25/netflix.csv'
netflix = Table.read_table(netflix_url)

In [None]:
# Half-point-wide bins with edges 0, 0.5, 1, ..., 10
my_bins = np.arange(0, 10.5, 0.5)

# Keep only the rows whose type is 'SHOW', then plot their IMDb scores
shows = netflix.where('type', 'SHOW')
shows.hist('imdb_score', bins=my_bins)

In [2]:
# Top 200 highest grossing movies up to 2017
top_movies_url = 'https://raw.githubusercontent.com/eysalee/webpage/refs/heads/main/courses/f25/top_movies_2017.csv'
top_movies = Table.read_table(top_movies_url)

In [3]:
# Age of each movie in years, measured from 2025
release_years = top_movies.column('Year')
ages = 2025 - release_years

# Attach the ages to the table as a new 'Age' column
top_movies = top_movies.with_column('Age', ages)

In [4]:
# Display the table to confirm that the new 'Age' column was added
top_movies

Title,Studio,Gross,Gross (Adjusted),Year,Age
Gone with the Wind,MGM,198676459,1796176700,1939,86
Star Wars,Fox,460998007,1583483200,1977,48
The Sound of Music,Fox,158671368,1266072700,1965,60
E.T.: The Extra-Terrestrial,Universal,435110554,1261085000,1982,43
Titanic,Paramount,658672302,1204368000,1997,28
The Ten Commandments,Paramount,65500000,1164590000,1956,69
Jaws,Universal,260000000,1138620700,1975,50
Doctor Zhivago,MGM,111721910,1103564200,1965,60
The Exorcist,Warner Brothers,232906145,983226600,1973,52
Snow White and the Seven Dwarves,Disney,184925486,969010000,1937,88


In [None]:
# If we don't pass `bins`, hist chooses default bins for us.
# unit='Year' tells hist that the 'Age' values are measured in years.
top_movies.hist('Age', unit='Year')

In [None]:
# We can specify equally sized bins:
# every bin is 10 years wide, with edges 0, 10, 20, ..., 110
bin_width = 10
equal_bins = np.arange(0, 111, bin_width)
top_movies.hist('Age', bins=equal_bins, unit='Year')

With the new bins, you can see an increase from 0-10 to 10-20.

You can also see that there are no top movies between 90 and 100 years old.

In [None]:
# Let's do unequal bins.
# The bin widths are 5, 5, 5, 10, 15, 25, 35, and 5 years, so compare bars
# by their area (percent of movies), not just by their height.
my_bins = make_array(0, 5, 10, 15, 25, 40, 65, 100, 105)
top_movies.hist('Age',bins=my_bins,unit='Year')

### Question: What percent of the top movies are from 40-65 years ago?

### Question: What is the height of the 40-65 bin?

In [None]:
# Step 1: Calculate % of movies in the [40, 65) bin


In [None]:
# Step 2: Calculate the width of the 40-65 bin


In [None]:
# Step 3: Area of rectangle = height * width
#         --> height = percent / width


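This height represents the percent per year: the percent of top movies that fall in this age range, divided by the width of the range in years. The area of the bar (height * width), not its height, is the percent of top movies in the range.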

## Weather Data Demo

Let's load some weather data.

There's a lot of extra data in here, so let's go ahead and remove it. We're interested in whether or not there was rain or snow, not in the specific amount, so let's also add an extra column that says whether each day was rainy or snowy.

In [6]:
# Daily NYC weather data for 2019 (per the file name)
weather_url = "https://raw.githubusercontent.com/eysalee/webpage/refs/heads/main/courses/f25/nyc_temperature_2019.csv"
weather = Table.read_table(weather_url)

# Cleaning the data, don't worry about this for now
# We'll go into this later
def add_condition(precipitation, new_snow):
    """Label one day's weather from its precipitation and new-snow amounts.

    Snow takes priority: a day with any new snow is 'snowy' even if it
    also had other precipitation.

    Args:
        precipitation: the day's precipitation amount (compared against 0).
        new_snow: the day's new snowfall amount (compared against 0).

    Returns:
        'snowy' if new_snow > 0; otherwise 'rainy' if precipitation > 0;
        otherwise the empty string ''.
    """
    # Check snow first so that snowy days are never labeled as rainy
    if new_snow > 0:
        return 'snowy'
    return 'rainy' if precipitation > 0 else ''
    
# add_condition needs two inputs for every row, so we name the two columns
# that supply them, in the same order as the function's arguments
conditions_array = weather.apply(add_condition, 'precipitation', 'new_snow')

# Attach the labels as a 'condition' column, then remove the columns we don't care about
unused_columns = ['departure', 'HDD', 'CDD', 'precipitation', 'new_snow', 'snow_depth']
weather = weather.with_columns('condition', conditions_array).drop(*unused_columns)
weather

date,tmax,tmin,tavg,condition
1/1/19,60,40,50.0,rainy
2/1/19,41,35,38.0,
3/1/19,45,39,42.0,
4/1/19,47,37,42.0,
5/1/19,47,42,44.5,rainy
6/1/19,49,32,40.5,
7/1/19,35,26,30.5,
8/1/19,47,35,41.0,rainy
9/1/19,46,35,40.5,rainy
10/1/19,35,30,32.5,


### Question: Do days with hotter highs also tend to have hotter lows?

### Question: How does the number of rainy days compare with the number of snowy days?

### Question: How does the average tavg compare between rainy days, snowy days, and days without any precipitation?

### Question: What percent of days have a high of at least 75 degrees?

## Census Demo

We'll be looking at US age and sex census data from 2020 to 2024 (https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/national/asrh/nc-est2024-agesex-res.csv).

Some notes about the formatting of this data:
- "Sex" is represented by the numbers 0, 1, and 2
    * 0 refers to "Male and Female"
    * 1 is "Male"
    * 2 is "Female"
- The ESTIMATESBASE column refers to the estimated population on April 1, 2020
- POPESTIMATE columns refer to the estimated populations as of July 1 of each year
- Ages go up to 100
- Age 999 row represents the totals of the column (by "Sex")

In [5]:
# Read the census age/sex population estimates from the course site
census_url = "https://raw.githubusercontent.com/eysalee/webpage/refs/heads/main/courses/f25/nc-est2024-agesex-res.csv"
# To read a downloaded local copy instead, use a filename such as:
# census_filename = 'nc-est2024-agesex-res.csv'
census = Table.read_table(census_url)
census

SEX,AGE,ESTIMATESBASE2020,POPESTIMATE2020,POPESTIMATE2021,POPESTIMATE2022,POPESTIMATE2023,POPESTIMATE2024
0,0,3735085,3703341,3572166,3680587,3653650,3615598
0,1,3777552,3764698,3708865,3602662,3720079,3701824
0,2,3873736,3854287,3771032,3738864,3642354,3768012
0,3,3966696,3933667,3860489,3800263,3778530,3690129
0,4,4049711,4038296,3939654,3888578,3837784,3823751
0,5,4078129,4074896,4044039,3966181,3924020,3880536
0,6,4048712,4063071,4080393,4069107,4000239,3965148
0,7,4054232,4050053,4068326,4104251,4101661,4039527
0,8,4073479,4057336,4055184,4091521,4135609,4139497
0,9,4114609,4102425,4062356,4077757,4122524,4172964


### Table Formatting

In [None]:
# Let's start by getting rid of the ESTIMATESBASE2020 column


In [None]:
# Let's use .relabel to rename the columns


We can use the `.set_format` function to format how numbers are displayed. This is similar to how in a spreadsheet you may set data to be displayed as a number, percentage, date, or currency.

We'll keep it simple with `NumberFormatter` and `PercentFormatter` and not touch other types of formats for now.

In [None]:
# Format how the '2020' column is displayed.
# NOTE: this assumes the POPESTIMATE columns were already relabeled to
# '2020', ..., '2024' with .relabel; the raw table has no column named '2020'.
census.set_format('2020', NumberFormatter)

In [None]:
# Apply the same display format to the '2021' column (also requires the relabeled columns)
census.set_format('2021', NumberFormatter)

In [None]:
# Notice that set_format does not change the underlying data:
# we've only modified how it's displayed in the Table.
# Pulling the first value out of the '2020' column shows the stored value itself.
census.column('2020').item(0)

In [None]:
# We can also give set_format a list of column labels
# to format several columns in a single call
census.set_format(['2022', '2023', '2024'], NumberFormatter)

### Column Arithmetic

Let's compute the change in total population between 2020 and 2024. 

To do this, let's subtract the values in the 2020 column (initial pop) from the values in the 2024 column (new pop): change = new pop - initial pop.

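Let's compute the percent change (the change divided by the initial 2020 population) and add this as a new column titled `"Percent Change"`. Keep the values as proportions (e.g. 0.05 rather than 5), since `.set_format("Percent Change", PercentFormatter)` displays a proportion as a percentage.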

### Line Graphs

To start, let's graph the population by age in 2024

What if we want to see the population for 2020 and 2024?