In [205]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#### assessments.csv — this file contains information about scores in tests. Typically each subject includes a series of grades followed by a final exam.

- code_module — module identification code.
- code_presentation — semester identification code.
- id_assessment — assessment identification number.
- assessment_type — test type. There are three types of assessment:
    teacher assessment (TMA), computer-aided assessment (CMA), course exam (Exam).
- date — information about the final date of the test. 
    Calculated as the number of days since the beginning of the semester. 
    The start date of the semester is numbered 0 (zero).
- weight — assessment weight in %. Usually exams are considered separately and have a weight of 100%;
    the sum of all other scores is 100%.
    
#### courses.csv — the file contains a list of all available modules (courses) and their presentations.

- code_module - module identification code.
- code_presentation — semester identification code.
- module_presentation_length — semester length in days.

#### studentAssessment.csv — this file contains student test scores. If a student does not submit an assessment (does not turn in the work or send the result), no result is written to the table. Final exams are not accepted if the results of the preceding assessments are missing from the system.

- id_assessment — assessment identification number.
- id_student — student identification number.
- date_submitted — the date the student submitted the assessment, measured as the number of days since the beginning of the semester.
- is_banked — a flag indicating that the test was passed in a previous semester (the result was carried over).
- score — the student's score on this test. The range is 0 to 100. A score below 40 is interpreted as a failure.

#### studentRegistration.csv — this file contains timing information about when a student registered for a course within a semester.

- code_module - module identification code.
- code_presentation — semester identification code.
- id_student — student identification number.
- date_registration — the date the student registered, given as the number of days
    measured from the beginning of the semester (for example, a value of -30 means that the student registered for the course 30 days before the start of the course).
- date_unregistration — the date on which the student's enrollment in the module was canceled. For students who have completed the course, this field is left blank.

In [206]:
# Load the four source tables in one pass:
# assessments, courses, student assessment results and student registrations.
a, c, st_a, st_reg = [
    pd.read_csv(csv_path)
    for csv_path in (
        'data/assessments.csv',
        'data/courses.csv',
        'data/studentAssessment.csv',
        'data/studentRegistration.csv',
    )
]

#### 1. How many students have successfully passed only one course? 
#### ("Successful passing" is having a successfully passed course exam)

To find out how many students successfully passed only ONE course, we take all the assessments of type "Exam" and check which of them received a score of at least 40. (A score below 40 is interpreted as a failure.)

In [207]:
# Attach the assessment metadata (module, presentation, type, date, weight)
# to every student submission, using id_assessment as the lookup key.

assessments_by_id = a.set_index('id_assessment')
st_a_joined = st_a.join(assessments_by_id, on='id_assessment')
st_a_joined

Unnamed: 0,id_assessment,id_student,date_submitted,is_banked,score,code_module,code_presentation,assessment_type,date,weight
0,1752,11391,18,0,78.0,AAA,2013J,TMA,19.0,10.0
1,1752,28400,22,0,70.0,AAA,2013J,TMA,19.0,10.0
2,1752,31604,17,0,72.0,AAA,2013J,TMA,19.0,10.0
3,1752,32885,26,0,69.0,AAA,2013J,TMA,19.0,10.0
4,1752,38053,19,0,79.0,AAA,2013J,TMA,19.0,10.0
...,...,...,...,...,...,...,...,...,...,...
173907,37443,527538,227,0,60.0,GGG,2014J,CMA,229.0,0.0
173908,37443,534672,229,0,100.0,GGG,2014J,CMA,229.0,0.0
173909,37443,546286,215,0,80.0,GGG,2014J,CMA,229.0,0.0
173910,37443,546724,230,0,100.0,GGG,2014J,CMA,229.0,0.0


In [208]:
# A course counts as passed when the student has an "Exam" submission with a score of at least 40.
# Keep only those rows, count them per student,
# and report how many students have exactly one passed exam.

passed_exams = st_a_joined.query('assessment_type == "Exam" & score >= 40')
passed_exams_per_student = passed_exams.groupby('id_student')['date_submitted'].count()
one_course_succeeded = int((passed_exams_per_student == 1).sum())
print('ANSWER: {} students successfully passed only one course'.format(one_course_succeeded))

ANSWER: 3802 students successfully passed only one course


#### 2. Identify the hardest and easiest exams: find courses and exams within a course, which have the lowest and highest completion rates.

Completion = number of successfully passed exams / number of all attempts to pass the exam

In [209]:
# From the joined dataframe take the assessments of type "Exam",
# group them by module, semester and assessment id,
# and count how many submissions each exam received.
#
# Named aggregation is used together with as_index=False so that the group keys
# come back as regular columns in a single step. The previous form
# (.agg(['count']) followed by .reset_index()) is version-dependent: on pandas versions
# that honour as_index=False for list aggregations, the extra reset_index()
# adds a spurious 'index' column to the result.

exams_grouped = (
    st_a_joined
    .query('assessment_type == "Exam"')
    .groupby(['code_module', 'code_presentation', 'id_assessment', 'assessment_type'], as_index=False)
    .agg(total_submitted=('date_submitted', 'count'))
)
exams_grouped

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,total_submitted
0,CCC,2014B,24290,Exam,747
1,CCC,2014J,24299,Exam,1168
2,DDD,2013B,25340,Exam,602
3,DDD,2013J,25354,Exam,968
4,DDD,2014B,25361,Exam,524
5,DDD,2014J,25368,Exam,950


In [210]:
# Count the passed submissions per exam from the same joined dataframe.
# An exam submission is a pass when its score is at least 40.

exams_passed = (
    st_a_joined
    .query('assessment_type == "Exam"')
    .loc[lambda submissions: submissions['score'] >= 40]
    .groupby('id_assessment')['date_submitted']
    .count()
    .rename('passed')
    .reset_index()
)
exams_passed

Unnamed: 0,id_assessment,passed
0,24290,664
1,24299,1019
2,25340,504
3,25354,878
4,25361,485
5,25368,842


In [211]:
# Join the two resulting dataframes so that 'total_submitted' and 'passed' sit side by side.
# An exam that nobody passed has no row in exams_passed, so the left join yields NaN for it;
# such exams must count as 0 passed (completion rate 0), otherwise the NaN would be
# skipped when looking for the hardest exam.

exams_results = (
    exams_grouped
    .join(exams_passed.set_index('id_assessment'), on='id_assessment')
    .fillna({'passed': 0})
    .astype({'passed': 'int64'})
)
exams_results

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,total_submitted,passed
0,CCC,2014B,24290,Exam,747,664
1,CCC,2014J,24299,Exam,1168,1019
2,DDD,2013B,25340,Exam,602,504
3,DDD,2013J,25354,Exam,968,878
4,DDD,2014B,25361,Exam,524,485
5,DDD,2014J,25368,Exam,950,842


In [212]:
# Completion rate = passed exams / all exam submissions, rounded to 4 decimal places.

exams_results['completion_rate'] = (
    exams_results['passed']
    .div(exams_results['total_submitted'])
    .round(4)
)
exams_results

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,total_submitted,passed,completion_rate
0,CCC,2014B,24290,Exam,747,664,0.8889
1,CCC,2014J,24299,Exam,1168,1019,0.8724
2,DDD,2013B,25340,Exam,602,504,0.8372
3,DDD,2013J,25354,Exam,968,878,0.907
4,DDD,2014B,25361,Exam,524,485,0.9256
5,DDD,2014J,25368,Exam,950,842,0.8863


In [213]:
# The easiest exam is the one with the highest 'completion_rate'.
# Locate its row once and read the id, module, semester and rate from it
# for the formatted output.
# Note: idxmax() is called without an axis argument; a Series has only one axis,
# and passing axis=1 raises an error on current pandas versions.

easiest_row = exams_results.loc[exams_results['completion_rate'].idxmax()]
easy_exam = easiest_row['id_assessment']
easy_module = str(easiest_row['code_module'])
easy_semester = str(easiest_row['code_presentation'])
easy_completion_rate = easiest_row['completion_rate']

In [214]:
# Report the easiest exam in a single formatted sentence.

easiest_exam_message = (
    'The easiest exam was the one with id {}, '
    'as a part of the module {} in the semester {}. '
    'Its completion rate was {}'
)
print(easiest_exam_message.format(easy_exam, easy_module, easy_semester, easy_completion_rate))

The easiest exam was the one with id 25361, as a part of the module DDD in the semester 2014B. Its completion rate was 0.9256


In [215]:
# The hardest exam is the one with the lowest 'completion_rate'.
# Locate its row once and read the id, module, semester and rate from it
# for the formatted output.
# Note: idxmin() is called without an axis argument; a Series has only one axis,
# and passing axis=1 raises an error on current pandas versions.

hardest_row = exams_results.loc[exams_results['completion_rate'].idxmin()]
hard_exam = hardest_row['id_assessment']
hard_module = str(hardest_row['code_module'])
hard_semester = str(hardest_row['code_presentation'])
hard_completion_rate = hardest_row['completion_rate']

In [284]:
# Report the hardest exam in a single formatted sentence.
hardest_exam_message = (
    'The hardest exam was the one with id {}, '
    'as a part of the module {} in the semester {}. '
    'Its completion rate was {}'
)
print(hardest_exam_message.format(hard_exam, hard_module, hard_semester, hard_completion_rate))

The hardest exam was the one with id 25340, as a part of the module DDD in the semester 2013B. Its completion rate was 0.8372


#### 3. For each module, determine the average time for passing exams (by passing we mean the last successful passing of the exam by a student).

In [283]:
# Average submission day of successfully passed exams.
# Only "Exam" rows with a score of at least 40 are used; they are grouped by
# module, semester and assessment id, and 'date_submitted' is averaged per group.
# date_submitted is the number of days since the beginning of the semester.

# The mean is rounded and stored as an integer, since a number of days reads better that way.

submission_term = (
    st_a_joined
    .query('assessment_type == "Exam" & score >= 40')
    .groupby(['code_module', 'code_presentation', 'id_assessment', 'assessment_type'], as_index=False)
    .agg(mean_submission_term=('date_submitted', 'mean'))
    .assign(mean_submission_term=lambda terms: terms['mean_submission_term'].round().astype('int32'))
)
submission_term

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,mean_submission_term
0,CCC,2014B,24290,Exam,232
1,CCC,2014J,24299,Exam,244
2,DDD,2013B,25340,Exam,230
3,DDD,2013J,25354,Exam,240
4,DDD,2014B,25361,Exam,235
5,DDD,2014J,25368,Exam,243


In [282]:
# Print one formatted line per row of the submission_term dataframe.

for exam in submission_term.itertuples(index=False):
    print('The mean exam submission term for module {} in the {} semester was {} days'.format(
        exam.code_module, exam.code_presentation, exam.mean_submission_term))

The mean exam submission term for module CCC in the 2014B semester was 232 days
The mean exam submission term for module CCC in the 2014J semester was 244 days
The mean exam submission term for module DDD in the 2013B semester was 230 days
The mean exam submission term for module DDD in the 2013J semester was 240 days
The mean exam submission term for module DDD in the 2014B semester was 235 days
The mean exam submission term for module DDD in the 2014J semester was 243 days


#### 4. Identify the most popular courses (TOP-3) by the number of registrations for them and courses with the largest outflow (TOP-3).

In [325]:
# Popularity is measured as the mean number of registrations per semester of a module,
# so that the same students appearing in different semesters are not summed together.

# Cleaning: drop rows without a 'date_registration' value,
# then drop repeated students within the same module and semester.

st_reg_no_nan = st_reg.dropna(subset=['date_registration'])
st_reg_cleared = st_reg_no_nan.drop_duplicates(subset=['code_module', 'code_presentation', 'id_student'])

# Step 1: number of registered students for every semester of every module.
registrations_per_semester = (
    st_reg_cleared
    .groupby(['code_module', 'code_presentation'], as_index=False)
    .agg(number_of_registrations=('id_student', 'count'))
)

# Step 2: average those counts per module, order from most to least popular,
# round to whole students and keep the three leading modules.
st_reg_top_3 = (
    registrations_per_semester
    .groupby('code_module', as_index=False)
    .agg({'number_of_registrations': 'mean'})
    .sort_values('number_of_registrations', ascending=False)
    .round()
    .reset_index(drop=True)
    .head(3)
)
st_reg_top_3

Unnamed: 0,code_module,number_of_registrations
0,CCC,2213.0
1,BBB,1975.0
2,FFF,1938.0


In [332]:
# Unpack the three module codes and their (integer) mean registration counts,
# then print them in one formatted answer.

module_1, module_2, module_3 = st_reg_top_3['code_module'].to_numpy()[:3]
number_1, number_2, number_3 = (
    st_reg_top_3['number_of_registrations'].astype('int32').to_numpy()[:3]
)

top_3_message = (
    'ANSWER: The top-3 most popular courses are {}, {} and {}. '
    '\nEach of their semesters was attended on average by {}, {} and {} students respectively'
)
print(top_3_message.format(module_1, module_2, module_3, number_1, number_2, number_3))

ANSWER: The top-3 most popular courses are CCC, BBB and FFF. 
Each of their semesters was attended on average by 2213, 1975 and 1938 students respectively
