### Import packages!

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

## Task 1
Import the WIC dataset and store the data in a variable called `wic_data`. The full filename for our dataset is `"datasets/WIC_data.csv"` because the file is stored in a datasets folder.

Then display the first few entries of the dataset.

## Task 2: Region categories

In [None]:
# pull the distinct values out of the "REGION" column; later cells reuse "regions"
regions = wic_data["REGION"].unique()

# report how many regions there are (followed by a blank line), then list their names
print("There are", len(regions), "regions included in this dataset.", end="\n\n")
print("These regions are:", regions)

#### Total up number of enrollees in each region category

In [None]:
# group the data by "region"
grouped_wic_data = wic_data.groupby(by="REGION") #1#

# total up the number of enrollees in each region category
enrollee_totals = {} #this dictionary will store the total number of enrollees for each region

for region, region_data in grouped_wic_data: #2#
    # get all totals for this region across all race groups from the "Total of all Racial Categories Total" column
    totals = region_data["Total of all Racial Categories Total"]

    # add up all of these totals for this region
    # (a fresh value is computed on every pass, so no running counter needs resetting)
    region_total = sum(totals) #3#

    # add to dictionary, keyed by the region name
    enrollee_totals[region] = region_total

# create "total_per_region" list: the totals arranged in the same order as "regions"
# so the two can be used side by side when plotting
total_per_region = [enrollee_totals[r] for r in regions]

print("Enrollee totals by region:", enrollee_totals)

#### Further code explanation:
> - `#1#`: We use the `groupby` function to group the data by "region" because each region has multiple entries in the dataset.
    - `groupby` is a function that can be applied to `pandas` DataFrames
    - We can give it one argument, the name of a column, and the function will group the data according to the distinct values in this column
    - `by="REGION"` is how we pass this argument to the function in the code above. We name the argument explicitly (`by=`) because `groupby` can take many arguments, and naming it tells the function exactly which one we are providing
    - The return value of this function is stored in `grouped_wic_data`, which holds, for each region category, the name of that region category (`region`) and the subset of the larger dataset that corresponds to that region (`region_data`)
> - `#2#`: We loop through each `region, region_data` pair in `grouped_wic_data` and tally up the total number of enrollees across all race groups.
    - We use the `Total of all Racial Categories Total` column in the dataset which captures the total number of enrollees across all race groups for each record in the dataset 
> - `#3#`: `sum()` is a built-in Python function that adds up the values in a list-like collection. Since `totals` (that is, `region_data["Total of all Racial Categories Total"]`) holds the values of the "Total of all Racial Categories Total" column for the current region, we can sum all of these values.




#### Create bar chart

In [None]:
# inputs prepared in earlier cells:
#   "regions"          -> the category names (one bar per region)
#   "total_per_region" -> the enrollee total for each region, in the same order

# create the plot #
fig, axs = plt.subplots(figsize=(30,15)) #set up axes
bargraph = axs.bar(x=regions, height=total_per_region)
#x: labels along the x-axis, height: how tall each bar is (the enrollee total)

# plot formatting #
axs.set_xlabel("Region", fontsize=35)
axs.set_ylabel("Number of enrollees", fontsize=35)
axs.set_title("Number of Enrollees Per Region", fontsize=40)
axs.tick_params(labelsize=23) #size of the tick labels on both axes

# show the plot #
plt.show()

#### Calculate category percentages

In [None]:
# work from "total_per_region" (one total per entry of "regions", in the same order)
# instead of the "enrollee_totals" dictionary: that name is reused for the race totals
# in Task 3, so re-running this cell after Task 3 would fail with a KeyError
total_num_enrollees = np.sum(total_per_region)

# each region's share of all enrollees, expressed as a percentage
enroll_percents = [(region_total/total_num_enrollees)*100 for region_total in total_per_region]
print("% of total enrollees for each region:", enroll_percents)

#### Create pie chart

In [None]:
# create the plot #
fig, axs = plt.subplots(figsize=(25,15))  #set up axes
piechart = axs.pie(enroll_percents, labels=regions, autopct="%1.1f%%")
#enroll_percents: size of each slice, regions: labels for each slice
#autopct is a format string for the percentage written on each slice ("%1.1f%%" shows one decimal place)

# plot formatting #
axs.set_title("Percentage of Total Enrollees Per Region", fontsize=40)
axs.axis("equal") # to ensure we get a circle shape instead of an oval

# change label font sizes: piechart[1] and piechart[2] hold the text objects drawn on the chart
# (one of each per slice), so we can walk through both lists together
for slice_label, percent_label in zip(piechart[1], piechart[2]):
    slice_label.set_fontsize(18)
    percent_label.set_fontsize(18)

# show the plot #
plt.show()

## Task 3: Race categories

#### Total up number of enrollees in each race category

In [None]:
# manually list the race groups; each one has a matching "<race> Total" column in the dataset
race_groups = [
    "American Indian",
    "Asian",
    "Black",
    "Hawaiian/Pacific Islander",
    "White",
    "Multiple Race",
    "Race Not Reported",
]

# this dictionary will store the total number of enrollees for each race group
enrollee_totals = {}

for race in race_groups:
    # add up every entry in this race group's column
    race_column = race + " Total"
    enrollee_totals[race] = sum(wic_data[race_column])

# the totals as a list; the dictionary was filled in "race_groups" order, so the orders match
total_per_race = list(enrollee_totals.values())

print("Enrollee totals by race:", enrollee_totals)

#### Create bar chart

In [None]:
## create the plot ##
fig, axs = plt.subplots(figsize=(30,15)) #set up axes
# TODO: replace each ### on the next line with a variable name.
# The first argument supplies the x-axis category names and the second supplies the bar heights
# (compare with the region bar chart earlier in this notebook).
# Until the placeholders are filled in, everything after "axs.bar(" is treated as a comment,
# the parenthesis is never closed, and this cell will not run.
bargraph = axs.bar(###, ###)
    
## plot formatting ##
axs.set_title("Number of Enrollees Per Race Group", fontsize=40)
axs.tick_params(labelsize=23)
# TODO: replace the "###" text in the next two lines with descriptive axis labels
axs.set_xlabel("###", fontsize=35)
axs.set_ylabel("###", fontsize=35)

## show the plot ##
plt.show()