### Import Pandas

In [1]:
import pandas as pd

### Import File

In [6]:
# Load the challenge input. The path is relative, so it is resolved against
# the kernel's current working directory.
df = pd.read_csv(
    filepath_or_buffer='PD Fill the Blanks challenge.csv',
)
df

Unnamed: 0,Weekday,Week,Teacher,Lesson Time,Lesson Name,Subject,Attendance
0,Monday,1,J. Martin,09:00,Statistics,Maths,24
1,Monday,2,J. Martin,09:00,Statistics,Maths,12
2,Monday,3,J. Martin,09:00,,,20
3,Monday,4,J. Martin,09:00,,,24
4,Monday,1,T. Prowse,14:00,Literature,English,21
5,Monday,2,T. Prowse,14:00,Literature,English,15
6,Monday,3,T. Prowse,14:00,Literature,English,13
7,Monday,4,T. Prowse,14:00,,,17
8,Tuesday,1,C. Allchin,09:00,Chemistry,Science,13
9,Tuesday,2,C. Allchin,09:00,,,30


### Create lookup sheet to use to fill in the blanks

The blanks are entirely in the Lesson Name and Subject fields, so you can look up the correct value for these fields using a triple key on Weekday, Teacher, and Lesson Time

In [10]:
# Keep only the rows where both Lesson Name and Subject are present.
# how='any' drops a row as soon as either of the two columns is NaN, which is
# the same result as dropping on each column in turn.
lookup = df.dropna(subset=['Lesson Name', 'Subject'], how='any')

In [16]:
# Collapse the complete rows to one row per distinct key combination.
# The mean is only a vehicle for de-duplicating; the averaged Week and
# Attendance columns it produces are a by-product of that.
lookup = lookup.groupby(
    by=['Weekday', 'Teacher', 'Lesson Time', 'Lesson Name', 'Subject'],
    as_index=False,  # keep the group keys as ordinary columns
).mean()
lookup

Unnamed: 0,Weekday,Teacher,Lesson Time,Lesson Name,Subject,Week,Attendance
0,Friday,J. Martin,09:00,Discrete,Maths,1.5,20.0
1,Friday,T. Prowse,14:00,Prose,English,2.0,25.0
2,Monday,J. Martin,09:00,Statistics,Maths,1.5,18.0
3,Monday,T. Prowse,14:00,Literature,English,2.0,16.333333
4,Thursday,C. Allchin,09:00,Biology,Science,2.666667,22.0
5,Thursday,O. Feather,14:00,Basketball,Sports,4.0,13.0
6,Tuesday,C. Allchin,09:00,Chemistry,Science,2.666667,22.0
7,Tuesday,O. Feather,14:00,Rugby,Sports,3.0,27.0
8,Wednesday,J. Martin,09:00,Algebra,Maths,2.5,18.0
9,Wednesday,T. Prowse,14:00,Poetry,English,1.0,14.0


### Join lookup table to original table to fill in the information

1. Drop the Lesson Name and Subject columns from the original df first
2. Drop the already averaged columns from the lookup table - they will not contain the correct answer due to removed data
3. Join tables to put corrected columns back in

In [13]:
# Remove both partially-blank columns in a single call.
df = df.drop(columns=['Lesson Name', 'Subject'])
df

Unnamed: 0,Weekday,Week,Teacher,Lesson Time,Attendance
0,Monday,1,J. Martin,09:00,24
1,Monday,2,J. Martin,09:00,12
2,Monday,3,J. Martin,09:00,20
3,Monday,4,J. Martin,09:00,24
4,Monday,1,T. Prowse,14:00,21
5,Monday,2,T. Prowse,14:00,15
6,Monday,3,T. Prowse,14:00,13
7,Monday,4,T. Prowse,14:00,17
8,Tuesday,1,C. Allchin,09:00,13
9,Tuesday,2,C. Allchin,09:00,30


In [17]:
# Discard the averaged by-product columns so only the key columns and the
# Lesson Name / Subject values remain in the lookup table.
lookup = lookup.drop(columns=['Week', 'Attendance'])
lookup

Unnamed: 0,Weekday,Teacher,Lesson Time,Lesson Name,Subject
0,Friday,J. Martin,09:00,Discrete,Maths
1,Friday,T. Prowse,14:00,Prose,English
2,Monday,J. Martin,09:00,Statistics,Maths
3,Monday,T. Prowse,14:00,Literature,English
4,Thursday,C. Allchin,09:00,Biology,Science
5,Thursday,O. Feather,14:00,Basketball,Sports
6,Tuesday,C. Allchin,09:00,Chemistry,Science
7,Tuesday,O. Feather,14:00,Rugby,Sports
8,Wednesday,J. Martin,09:00,Algebra,Maths
9,Wednesday,T. Prowse,14:00,Poetry,English


In [18]:
# Attach Lesson Name and Subject to every attendance row via the triple key.
# how='left' keeps attendance rows whose key has no lookup entry (their
# Lesson Name / Subject stay NaN) instead of silently dropping them, which is
# what the default inner join would do.
# validate='many_to_one' makes pandas raise a MergeError if a key occurs more
# than once in the lookup table; such duplicates would otherwise multiply
# attendance rows.
joined = pd.merge(
    df,
    lookup,
    on=['Weekday', 'Teacher', 'Lesson Time'],
    how='left',
    validate='many_to_one',
)
joined

Unnamed: 0,Weekday,Week,Teacher,Lesson Time,Attendance,Lesson Name,Subject
0,Monday,1,J. Martin,09:00,24,Statistics,Maths
1,Monday,2,J. Martin,09:00,12,Statistics,Maths
2,Monday,3,J. Martin,09:00,20,Statistics,Maths
3,Monday,4,J. Martin,09:00,24,Statistics,Maths
4,Monday,1,T. Prowse,14:00,21,Literature,English
5,Monday,2,T. Prowse,14:00,15,Literature,English
6,Monday,3,T. Prowse,14:00,13,Literature,English
7,Monday,4,T. Prowse,14:00,17,Literature,English
8,Tuesday,1,C. Allchin,09:00,13,Chemistry,Science
9,Tuesday,2,C. Allchin,09:00,30,Chemistry,Science


### Find average attendance per lesson and subject per weekday

In [20]:
# Average the numeric columns per (Weekday, Lesson Name, Subject).
# numeric_only=True is required because `joined` still carries the string
# columns Teacher and Lesson Time: pandas 2.x raises TypeError when asked to
# average them, whereas older versions silently dropped them. The resulting
# columns (Week, Attendance) are unchanged.
avg = (
    joined
    .groupby(by=['Weekday', 'Lesson Name', 'Subject'])
    .mean(numeric_only=True)
    .reset_index()
)
avg

Unnamed: 0,Weekday,Lesson Name,Subject,Week,Attendance
0,Friday,Discrete,Maths,2.5,19.0
1,Friday,Prose,English,2.5,20.75
2,Monday,Literature,English,2.5,16.5
3,Monday,Statistics,Maths,2.5,20.0
4,Thursday,Basketball,Sports,2.5,20.0
5,Thursday,Biology,Science,2.5,20.75
6,Tuesday,Chemistry,Science,2.5,24.0
7,Tuesday,Rugby,Sports,2.5,28.25
8,Wednesday,Algebra,Maths,2.5,14.25
9,Wednesday,Poetry,English,2.5,20.75


### Join average back to filled in data

1. Drop 'Week' Column from avg df
2. Rename Attendance column for clarity
3. Join df on Weekday, Lesson Name, and Subject

In [21]:
# The averaged Week value is not needed in the result; remove it.
avg = avg.drop(columns='Week')

In [22]:
# Give the averaged column a distinct name so it does not collide with the
# per-row Attendance column when the two frames are merged.
avg = avg.rename(
    {'Attendance': 'Avg. Attendance per Subject and Lesson'},
    axis='columns',
)
avg

Unnamed: 0,Weekday,Lesson Name,Subject,Avg. Attendance per Subject and Lesson
0,Friday,Discrete,Maths,19.0
1,Friday,Prose,English,20.75
2,Monday,Literature,English,16.5
3,Monday,Statistics,Maths,20.0
4,Thursday,Basketball,Sports,20.0
5,Thursday,Biology,Science,20.75
6,Tuesday,Chemistry,Science,24.0
7,Tuesday,Rugby,Sports,28.25
8,Wednesday,Algebra,Maths,14.25
9,Wednesday,Poetry,English,20.75


In [23]:
# Bring the per-group average back onto each filled-in attendance row.
joined2 = joined.merge(
    avg,
    on=['Weekday', 'Lesson Name', 'Subject'],
)
joined2

Unnamed: 0,Weekday,Week,Teacher,Lesson Time,Attendance,Lesson Name,Subject,Avg. Attendance per Subject and Lesson
0,Monday,1,J. Martin,09:00,24,Statistics,Maths,20.0
1,Monday,2,J. Martin,09:00,12,Statistics,Maths,20.0
2,Monday,3,J. Martin,09:00,20,Statistics,Maths,20.0
3,Monday,4,J. Martin,09:00,24,Statistics,Maths,20.0
4,Monday,1,T. Prowse,14:00,21,Literature,English,16.5
5,Monday,2,T. Prowse,14:00,15,Literature,English,16.5
6,Monday,3,T. Prowse,14:00,13,Literature,English,16.5
7,Monday,4,T. Prowse,14:00,17,Literature,English,16.5
8,Tuesday,1,C. Allchin,09:00,13,Chemistry,Science,24.0
9,Tuesday,2,C. Allchin,09:00,30,Chemistry,Science,24.0


### Output to csv

In [24]:
# Write the final table. index=False keeps the positional RangeIndex out of
# the file; otherwise it is written as an extra, header-less first column that
# reads back as "Unnamed: 0".
joined2.to_csv('pd2022w11_solution.csv', index=False)