In [2]:
import pandas as pd
import numpy as np

## Attendance Data

Load the `attendance.csv` file and calculate an attendnace percentage for each student.  
One half day is worth 50% of a full day, and 10 tardies is equal to one absence.

In [42]:
attendance = pd.read_csv("attendance.csv")

In [44]:
attendance.rename(columns={attendance.columns[0]:"student"},inplace=True)

In [45]:
attendance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 9 columns):
student       4 non-null object
2018-01-01    4 non-null object
2018-01-02    4 non-null object
2018-01-03    4 non-null object
2018-01-04    4 non-null object
2018-01-05    4 non-null object
2018-01-06    4 non-null object
2018-01-07    4 non-null object
2018-01-08    4 non-null object
dtypes: object(9)
memory usage: 368.0+ bytes


In [83]:
df = pd.melt(attendance,id_vars ="student", value_vars=attendance.columns[1:], var_name="date",value_name="status")
df

Unnamed: 0,student,date,status
0,Sally,2018-01-01,P
1,Jane,2018-01-01,A
2,Billy,2018-01-01,A
3,John,2018-01-01,P
4,Sally,2018-01-02,T
5,Jane,2018-01-02,P
6,Billy,2018-01-02,T
7,John,2018-01-02,T
8,Sally,2018-01-03,T
9,Jane,2018-01-03,T


In [100]:
df["calc"] = np.nan

conditions = [df.status == "P", df.status == "H", df.status == "A", df.status == "T"]
choices = [1, 0.5, 0, .10]

df.calc = np.select(conditions, choices)

In [101]:
df.head(12)

Unnamed: 0,student,date,status,calc
0,Sally,2018-01-01,P,1.0
1,Jane,2018-01-01,A,0.0
2,Billy,2018-01-01,A,0.0
3,John,2018-01-01,P,1.0
4,Sally,2018-01-02,T,0.1
5,Jane,2018-01-02,P,1.0
6,Billy,2018-01-02,T,0.1
7,John,2018-01-02,T,0.1
8,Sally,2018-01-03,T,0.1
9,Jane,2018-01-03,T,0.1


In [102]:
df.groupby("student").mean()

Unnamed: 0_level_0,calc
student,Unnamed: 1_level_1
Billy,0.225
Jane,0.1875
John,0.7125
Sally,0.3625


## Coffee Levels

Read the `coffee_levels.csv` file.  
Transform the data so that each carafe is in it's own column.  
Is this the best shape for the data?

In [103]:
coffee = pd.read_csv("coffee_levels.csv")

In [107]:
coffee.head()

Unnamed: 0,hour,coffee_carafe,coffee_amount
0,8,x,0.816164
1,9,x,0.451018
2,10,x,0.843279
3,11,x,0.335533
4,12,x,0.898291


In [117]:
coffee.pivot(index="hour",columns="coffee_carafe", values="coffee_amount")

coffee_carafe,x,y,z
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,0.816164,0.189297,0.999264
9,0.451018,0.521502,0.91599
10,0.843279,0.023163,0.144928
11,0.335533,0.235529,0.311495
12,0.898291,0.017009,0.771947
13,0.310711,0.997464,0.39852
14,0.507288,0.058361,0.864464
15,0.215043,0.144644,0.436364
16,0.183891,0.544676,0.280621
17,0.39156,0.594126,0.436677


## Cake Recipes

- Read the `cake_recipes.csv` data. This data set contains cake tastiness scores for combinations of different recipes, oven rack positions, and oven temperatures.
- Tidy the data as necessary.  
- Which recipe, on average, is the best? recipe b
- Which oven temperature, on average, produces the best results? 275
- Which combination of recipe, rack position, and temperature gives the best result? recipe b, bottom rack, 300 degrees

In [132]:
cake = pd.read_csv("cake_recipes.csv")

In [133]:
cake.head()

Unnamed: 0,recipe:position,225,250,275,300
0,a:bottom,61.738655,53.912627,74.41473,98.786784
1,a:top,51.709751,52.009735,68.576858,50.22847
2,b:bottom,57.09532,61.904369,61.19698,99.248541
3,b:top,82.455004,95.224151,98.594881,58.169349
4,c:bottom,96.470207,52.001358,92.893227,65.473084


In [141]:
cake["recipe"]= cake["recipe:position"].str.extract(r"(^\w{1})",expand=True)
cake["position"]= cake["recipe:position"].str.extract(r":(.*)$",expand=True)
cake = cake.drop(columns="recipe:position")

In [142]:
cake.head()

Unnamed: 0,225,250,275,300,recipe,position
0,61.738655,53.912627,74.41473,98.786784,a,bottom
1,51.709751,52.009735,68.576858,50.22847,a,top
2,57.09532,61.904369,61.19698,99.248541,b,bottom
3,82.455004,95.224151,98.594881,58.169349,b,top
4,96.470207,52.001358,92.893227,65.473084,c,bottom


In [144]:
agg_score = cake.groupby("recipe").mean()

In [153]:
agg_score

Unnamed: 0_level_0,225,250,275,300
recipe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,56.724203,52.961181,71.495794,74.507627
b,69.775162,78.56426,79.89593,78.708945
c,83.888258,67.398417,92.495638,59.716678
d,74.836465,67.385891,55.659653,53.577368


Which recipe, on average, is the best? recipe b

In [187]:
agg_score["average"] = agg_score.agg("mean",axis=1)
agg_score

Unnamed: 0_level_0,225,250,275,300,average
recipe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,56.724203,52.961181,71.495794,74.507627,63.922201
b,69.775162,78.56426,79.89593,78.708945,76.736074
c,83.888258,67.398417,92.495638,59.716678,75.874748
d,74.836465,67.385891,55.659653,53.577368,62.864844


Which oven temperature, on average, produces the best results? 275

In [188]:
cake_agg = cake.melt(id_vars=["recipe","position"],var_name="temp")
cake_agg.groupby("temp").mean()

Unnamed: 0_level_0,value
temp,Unnamed: 1_level_1
225,71.306022
250,66.577437
275,74.886754
300,66.627655


Which combination of recipe, rack position, and temperature gives the best result? recipe b, bottom rack, 300 degrees

In [190]:
cake_agg.head()

Unnamed: 0,recipe,position,temp,value
0,a,bottom,225,61.738655
1,a,top,225,51.709751
2,b,bottom,225,57.09532
3,b,top,225,82.455004
4,c,bottom,225,96.470207


In [198]:
cake_agg.max()

recipe            d
position        top
temp            300
value       99.2485
dtype: object