In [8]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pandahouse as ph

In [12]:
# Load the four CSV tables used in this analysis.
# The comment block after each load describes that table and its columns.
# NOTE(review): the short names (a, c, st_a, st_reg) are kept as-is because later cells reference them.

a = pd.read_csv('data/assessments.csv')

# assessments.csv — describes the tests (assessments) of each course.
# Typically a course consists of a series of graded tests followed by a final exam.

    # code_module — module identification code.

    # code_presentation — semester identification code.

    # id_assessment — assessment (test) identification number.

    # assessment_type — type of the test. There are three types of assessment:
    # teacher assessment (TMA), computer-aided assessment (CMA), course exam (Exam).

    # date — the final (due) date of the test,
    # calculated as the number of days since the beginning of the semester.
    # The start date of the semester is numbered 0 (zero).

    # weight — weight of the assessment in %. Exams are usually treated separately and have a weight of 100%;
    # the weights of all other assessments sum to 100%.

c = pd.read_csv('data/courses.csv')

# courses.csv — lists all available modules (courses) and their presentations (semesters).

    # code_module — module identification code.

    # code_presentation — semester identification code.

    # module_presentation_length — length of the semester in days.

st_a = pd.read_csv('data/studentAssessment.csv')

# studentAssessment.csv — contains the students' test results.
# If a student does not take a test (does not turn in the work, does not send the result),
# no result is written to the table. Final exam results are not accepted
# if the results of the preceding tests are missing from the system.

    # id_assessment — assessment (test) identification number.

    # id_student — student identification number.

    # date_submitted — the date the student submitted the test,
    # measured as the number of days since the beginning of the semester.

    # is_banked — flag indicating that the test was passed in a previous semester.

    # score — the student's score on this test. The range is 0 to 100. A score below 40 is interpreted as a failure.

st_reg = pd.read_csv('data/studentRegistration.csv')

# studentRegistration.csv — contains information about when a student
# registered for (and unregistered from) a course within a semester.

    # code_module — module identification code.

    # code_presentation — semester identification code.

    # id_student — student identification number.

    # date_registration — the date the student registered, as the number of days
    # measured from the beginning of the semester (for example, a value of -30 means
    # that the student registered for the course 30 days before it started).

    # date_unregistration — the date on which the student's enrollment in the module was canceled.
    # For students who have completed the course, this field is left blank.

In [None]:
# 1. How many students have successfully passed exactly one course?
# (A course counts as successfully passed if the student passed its exam.)

In [38]:
# Goal: find the students who successfully passed exactly ONE course.
# A course is passed when its "Exam"-type assessment is passed, so every score
# has to be matched with the type of the assessment it belongs to.
# (A score below 40 is interpreted as a failure, i.e. 40 or more is a pass.)

# The scores live in st_a and the assessment descriptions in a; both carry
# id_assessment. Index a by that key and attach its columns to every row of
# st_a, so each score row also shows its course, semester and assessment type.

assessment_info = a.set_index('id_assessment')
st_a_joined = st_a.join(assessment_info, on='id_assessment')
st_a_joined

Unnamed: 0,id_assessment,id_student,date_submitted,is_banked,score,code_module,code_presentation,assessment_type,date,weight
0,1752,11391,18,0,78.0,AAA,2013J,TMA,19.0,10.0
1,1752,28400,22,0,70.0,AAA,2013J,TMA,19.0,10.0
2,1752,31604,17,0,72.0,AAA,2013J,TMA,19.0,10.0
3,1752,32885,26,0,69.0,AAA,2013J,TMA,19.0,10.0
4,1752,38053,19,0,79.0,AAA,2013J,TMA,19.0,10.0
...,...,...,...,...,...,...,...,...,...,...
173907,37443,527538,227,0,60.0,GGG,2014J,CMA,229.0,0.0
173908,37443,534672,229,0,100.0,GGG,2014J,CMA,229.0,0.0
173909,37443,546286,215,0,80.0,GGG,2014J,CMA,229.0,0.0
173910,37443,546724,230,0,100.0,GGG,2014J,CMA,229.0,0.0


In [75]:
# Count the students who passed the exam of exactly one course.
#
# Steps:
#   1. Keep only "Exam" assessments that were passed. A score below 40 is a
#      failure, so the pass condition is score >= 40 (a score of exactly 40 passes).
#   2. Reduce the rows to one per (student, module, semester), so that a repeated
#      exam record for the same course cannot be counted as a second course.
#   3. Count the passed courses per student and keep the students with exactly 1.
#
# The previous version counted non-null date_submitted values per student as a
# stand-in for "number of courses"; counting distinct courses states the intent
# directly and does not depend on date_submitted being filled in.

PASS_SCORE = 40  # minimum score that counts as a pass

passed_exams = st_a_joined.query('assessment_type == "Exam" and score >= @PASS_SCORE')

courses_passed_per_student = (
    passed_exams
    .drop_duplicates(subset=['id_student', 'code_module', 'code_presentation'])
    .groupby('id_student')
    .size()
)

# int(...) keeps the result a plain Python integer, as .shape[0] was before.
one_course_succeeded = int((courses_passed_per_student == 1).sum())
print('ANSWER: {} students successfully passed only one course'.format(one_course_succeeded))

ANSWER: 3802 students successfully passed only one course
