In [1]:
import pandas as pd
import numpy as np

## 1.) Attendance Data

Load the attendance.csv file and calculate an attendance percentage for each student. A half day is worth 50% of a full day, and ten tardies are equal to one absence.

In [2]:
# Read the raw attendance sheet; the preview shows one row per student and
# one column per calendar date.
attendance_csv = 'untidy_data/attendance.csv'
attendance = pd.read_csv(attendance_csv)
attendance.head(n=5)

Unnamed: 0.1,Unnamed: 0,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08
0,Sally,P,T,T,H,P,A,T,T
1,Jane,A,P,T,T,T,T,A,T
2,Billy,A,T,A,A,H,T,P,T
3,John,P,T,H,P,P,T,P,P


In [3]:
# Credit earned for each attendance code, as a fraction of a full day:
# absent = 0, present = 1, half day = 0.5. Ten tardies count as one absence,
# so a single tardy costs 0.1 of a day and is credited 0.9.
ATTENDANCE_CREDIT = {'A': 0.0, 'P': 1.0, 'T': 0.9, 'H': 0.5}

# Park the student-name column in the index while recoding so that only the
# date columns are touched (a name that happened to equal a code would
# otherwise be replaced too). astype(float) makes every date column numeric
# explicitly, instead of relying on replace() to infer the dtype, and raises
# if an unexpected code is left behind.
attendance = (
    attendance
    .set_index('Unnamed: 0')
    .replace(ATTENDANCE_CREDIT)
    .astype(float)
    .reset_index()
)
attendance.head()

Unnamed: 0.1,Unnamed: 0,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08
0,Sally,1,0.9,0.9,0.5,1.0,0.0,0.9,0.9
1,Jane,0,1.0,0.9,0.9,0.9,0.9,0.0,0.9
2,Billy,0,0.9,0.0,0.0,0.5,0.9,1.0,0.9
3,John,1,0.9,0.5,1.0,1.0,0.9,1.0,1.0


In [4]:
# Reshape to long form: one row per (student, date) holding that day's score.
attendance_long = attendance.melt(
    id_vars='Unnamed: 0',
    var_name='date',
    value_name='score',
)

# Moving 'date' into the index leaves 'score' as the only value column, so
# the default pivot_table aggregation (mean) yields each student's average.
attendance_by_date = attendance_long.set_index('date')
attendance_by_date.pivot_table(columns='Unnamed: 0')

Unnamed: 0,Billy,Jane,John,Sally
score,0.525,0.6875,0.9125,0.7625


## 2.) Coffee Levels

- Read the coffee_levels.csv file.

In [5]:
# Read the coffee readings; the preview shows one row per (hour, carafe)
# measurement.
coffee_csv = 'untidy_data/coffee_levels.csv'
coffee = pd.read_csv(coffee_csv)
coffee.head(n=5)

Unnamed: 0,hour,coffee_carafe,coffee_amount
0,8,x,0.816164
1,9,x,0.451018
2,10,x,0.843279
3,11,x,0.335533
4,12,x,0.898291


- Transform the data so that each carafe is in its own column.

In [6]:
# Wide layout: one row per hour, one column per carafe, cells hold the amount.
carafe_by_hour = coffee.pivot(
    index='hour',
    columns='coffee_carafe',
    values='coffee_amount',
)
carafe_by_hour

coffee_carafe,x,y,z
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,0.816164,0.189297,0.999264
9,0.451018,0.521502,0.91599
10,0.843279,0.023163,0.144928
11,0.335533,0.235529,0.311495
12,0.898291,0.017009,0.771947
13,0.310711,0.997464,0.39852
14,0.507288,0.058361,0.864464
15,0.215043,0.144644,0.436364
16,0.183891,0.544676,0.280621
17,0.39156,0.594126,0.436677


- Is this the best shape for the data?

    It depends. This shape is not tidy because the values of one variable (carafe) are used as column headers, so the coffee amounts are spread across three columns. This shape, however, is helpful for plotting and exploratory uses.

## 3.) Cake Recipes

- Read the cake_recipes.csv data. This data set contains cake tastiness scores for combinations of different recipes, oven rack positions, and oven temperatures.

In [7]:
# Read the tastiness scores; the preview shows one row per 'recipe:position'
# pair and one column per oven temperature.
cake_csv = 'untidy_data/cake_recipes.csv'
cake = pd.read_csv(cake_csv)
cake.head(n=5)

Unnamed: 0,recipe:position,225,250,275,300
0,a:bottom,61.738655,53.912627,74.41473,98.786784
1,a:top,51.709751,52.009735,68.576858,50.22847
2,b:bottom,57.09532,61.904369,61.19698,99.248541
3,b:top,82.455004,95.224151,98.594881,58.169349
4,c:bottom,96.470207,52.001358,92.893227,65.473084


- Tidy the data as necessary.

In [8]:
# Break the combined 'recipe:position' label at the colon into two columns
# and give them meaningful names.
combined_label = cake['recipe:position']
recipe_and_position = combined_label.str.split(':', expand=True)
recipe_and_position.columns = ['recipe', 'position']
recipe_and_position.head(n=5)

Unnamed: 0,recipe,position
0,a,bottom
1,a,top
2,b,bottom
3,b,top
4,c,bottom


In [9]:
# Attach the separate recipe / position columns to the score columns, then
# discard the original combined label.
cake_with_parts = pd.concat([cake, recipe_and_position], axis=1, sort=True)
cake = cake_with_parts.drop(columns='recipe:position')
cake.head(n=5)

Unnamed: 0,225,250,275,300,recipe,position
0,61.738655,53.912627,74.41473,98.786784,a,bottom
1,51.709751,52.009735,68.576858,50.22847,a,top
2,57.09532,61.904369,61.19698,99.248541,b,bottom
3,82.455004,95.224151,98.594881,58.169349,b,top
4,96.470207,52.001358,92.893227,65.473084,c,bottom


In [10]:
# Index the temperature columns by (recipe, position). pivot_table averages
# rows that share a key by default; in the data shown each pair appears once.
group_keys = ['recipe', 'position']
scores_by_recipe_position = cake.pivot_table(index=group_keys)
scores_by_recipe_position

Unnamed: 0_level_0,Unnamed: 1_level_0,225,250,275,300
recipe,position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,bottom,61.738655,53.912627,74.41473,98.786784
a,top,51.709751,52.009735,68.576858,50.22847
b,bottom,57.09532,61.904369,61.19698,99.248541
b,top,82.455004,95.224151,98.594881,58.169349
c,bottom,96.470207,52.001358,92.893227,65.473084
c,top,71.306308,82.795477,92.098049,53.960273
d,bottom,52.799753,58.670419,51.747686,56.18311
d,top,96.873178,76.101363,59.57162,50.971626


- Which recipe, on average, is the best? 

    recipe b

In [11]:
# Average each recipe's scores per temperature, then across temperatures, and
# report the recipe with the highest overall mean.
# 'position' holds text, so it is dropped before aggregating: recent pandas
# versions raise a TypeError when asked to average a string column instead of
# silently skipping it.
recipe_means = cake.drop(columns='position').pivot_table(index='recipe')
recipe_means.mean(axis=1).idxmax()

'b'

- Which oven temperature, on average, produces the best results? 

    275

In [12]:
# Average each recipe's scores per temperature, then average those recipe
# means per temperature column and report the column with the highest value.
# 'position' holds text, so it is dropped before aggregating: recent pandas
# versions raise a TypeError when asked to average a string column instead of
# silently skipping it.
recipe_means = cake.drop(columns='position').pivot_table(index=['recipe'])
recipe_means.mean().idxmax()

'275'

- Which combination of recipe, rack position, and temperature gives the best result? 

    recipe b, bottom rack, 300 degrees

In [13]:
# Long form: one row per (recipe, position, oven temperature) with its score
# in the default 'value' column.
temperature_columns = ['225', '250', '275', '300']
temp = cake.melt(
    id_vars=['recipe', 'position'],
    value_vars=temperature_columns,
    var_name='temp',
)

In [14]:
# Preview the first five rows of the melted frame.
temp.head(n=5)

Unnamed: 0,recipe,position,temp,value
0,a,bottom,225,61.738655
1,a,top,225,51.709751
2,b,bottom,225,57.09532
3,b,top,225,82.455004
4,c,bottom,225,96.470207


In [15]:
# Row with the single highest tastiness score.
# idxmax() returns an index *label*, so the row is fetched with label-based
# .loc; positional .iloc only gives the same row while the index happens to
# be the default 0..n-1 range.
best_row_label = temp['value'].idxmax()
temp.loc[best_row_label]

recipe            b
position     bottom
temp            300
value       99.2485
Name: 26, dtype: object