In [2]:
%%capture
from functions import *

@register_cell_magic
def markdown(line, cell):
    return md(cell.format(**globals()))

# Student Registration Dataframe

---
The student registration dataframe contains information about the dates that students registered and,if applicable, unregistered from the module.

### Contents

* **code_module**: The code module represents the course which the sutdent registered for.
* **code_presentation**: The code presentation represents the time of year the course which the student registered for began.
* **id_student**: The student ID is the unique identifier for each student.
* **date_registration**: The registration date is the date that the student registered for the module relative to the start of the module. A negative value indicates that many days before the module began.
* **date_unregistration**: The unregistration date is the date that the student unregistered from the course module in relation to the start date of the course, if applicable.


In [3]:
# looking at the student_registration dataframe
student_registration.head()

Unnamed: 0,code_module,code_presentation,id_student,date_registration,date_unregistration
0,AAA,2013J,11391,-159.0,
1,AAA,2013J,28400,-53.0,
2,AAA,2013J,30268,-92.0,12.0
3,AAA,2013J,31604,-52.0,
4,AAA,2013J,32885,-176.0,


In [4]:
md(f'''

**Size**
    
* Number of Rows: {len(student_registration)}
* Number of Columns: {len(student_registration.columns)}

**Data Types**
''')



<b>Size</b>
    
* Number of Rows: 32593
* Number of Columns: 5

<b>Data Types</b>


In [6]:
# show student info data types
student_registration.dtypes

code_module             object
code_presentation       object
id_student               int64
date_registration      float64
date_unregistration    float64
dtype: object

* id_student is currently an int64 datatype, but would be more appropriate as an object data type since it is categorical.

In [7]:
# changing id_student to the object data type
student_registration['id_student'] = student_registration['id_student'].astype(object)

**Null Values:**

In [8]:
student_registration.isnull().sum()

code_module                0
code_presentation          0
id_student                 0
date_registration         45
date_unregistration    22521
dtype: int64

In [15]:
null_registration = student_registration['date_registration'].isnull().sum()
null_unregistration = student_registration['date_unregistration'].isnull().sum()

In [16]:
%%markdown 

* We have {null_registration} null values for date_registration, and no mention of this in the dataset documentation, so we will treat this as missing data.
* There are {null_unregistration} null values for date_unregistration which represent the students that did not withdraw from the course.


* We have 45 null values for date_registration, and no mention of this in the dataset documentation, so we will treat this as missing data.
* There are 22521 null values for date_unregistration which represent the students that did not withdraw from the course.


**Unique Counts:**

In [17]:
student_registration.nunique()

code_module                7
code_presentation          4
id_student             28785
date_registration        332
date_unregistration      416
dtype: int64

**Unique Categorical Values**

In [18]:
unique_vals(student_registration)

code_module: ['AAA' 'BBB' 'CCC' 'DDD' 'EEE' 'FFF' 'GGG']

code_presentation: ['2013J' '2014J' '2013B' '2014B']

id_student: [11391 28400 30268 ... 2648187 2679821 2684003]



In imd_band the % sign is missing in 10-20. We will add that for consistency and clarity

In [71]:
# changing all 10-20 values in student_info imd_band to 10-20% for consistency's sake
student_info.loc[student_info['imd_band'] == '10-20', 'imd_band'] = '10-20%'
print(student_info['imd_band'].explode().unique())

['90-100%' '20-30%' '30-40%' '50-60%' '80-90%' '70-80%' nan '60-70%'
 '40-50%' '10-20%' '0-10%']


**Duplicate Values**

In [56]:
analyze_df(student_registration, dupes=True)

'No Duplicate Values'

In [57]:
md(f'''* The Student info dataframe is {len(student_registration)} rows, but there are only {student_registration['id_student'].nunique()} unique student ID's.
* This suggests that there are some students who took multiple modules since we eliminated those who have taken the same course more than once.
        ''')

* The Student info dataframe is 28421 rows, but there are only 26096 unique student ID's.
* This suggests that there are some students who took multiple modules since we eliminated those who have taken the same course more than once.
        

In [67]:
student_registration[student_registration['id_student'].duplicated()].head()

Unnamed: 0,code_module,code_presentation,id_student,region,imd_band,age_band,gender,highest_education,disability,final_result,date_registration,date_unregistration
11212,CCC,2014J,533512,East Anglian Region,40-50%,0-35,F,Lower Than A Level,N,Fail,-23.0,
12450,CCC,2014J,675116,London Region,30-40%,0-35,F,Lower Than A Level,N,Withdrawn,-98.0,-96.0
13110,DDD,2013B,86047,Wales,20-30%,0-35,F,HE Qualification,N,Pass,-60.0,
13137,DDD,2013B,131145,South West Region,40-50%,0-35,M,A Level or Equivalent,N,Pass,-103.0,
13140,DDD,2013B,134025,London Region,60-70%,0-35,M,A Level or Equivalent,N,Distinction,-58.0,


**Duplicate Student ID's**

In [61]:
# finding student records with duplicate ID's
pd.concat(x for _, x in student_registration.groupby("id_student") if len(x) > 1).head()

Unnamed: 0,code_module,code_presentation,id_student,region,imd_band,age_band,gender,highest_education,disability,final_result,date_registration,date_unregistration
10598,CCC,2014J,29411,East Midlands Region,80-90%,0-35,M,A Level or Equivalent,N,Withdrawn,-135.0,100.0
14399,DDD,2013J,29411,East Midlands Region,80-90%,0-35,M,A Level or Equivalent,N,Pass,-96.0,
10600,CCC,2014J,29639,North Region,,0-35,M,Lower Than A Level,N,Pass,-24.0,
20417,EEE,2014B,29639,North Region,,0-35,M,Lower Than A Level,N,Pass,-26.0,
8659,CCC,2014B,29820,East Anglian Region,40-50%,0-35,M,HE Qualification,N,Pass,-57.0,


In [68]:
duped_sids = student_registration[student_registration['id_student'].duplicated()]
total_sid_dupes = pd.concat(x for _, x in student_registration.groupby("id_student") if len(x) > 1)

In [70]:
md(f'''We have {len(duped_sids)} students whose ID is listed more than once and a total of {len(total_sid_dupes)} duplicate records. These students do seem to be in different courses, and so we will leave them''')

We have 2325 students whose ID is listed more than once and a total of 4636 duplicate records. These students do seem to be in different courses, and so we will leave them

**Statistics:**

In [40]:
student_registration.describe().astype(int)

Unnamed: 0,date_registration,date_unregistration
count,28383,8612
mean,-68,49
std,48,81
min,-321,-274
25%,-100,-2
50%,-56,27
75%,-29,107
max,167,444


* There are 8,612 values for the count of date_unregistration which represents the number of students who withdrew from the course.
* The earliest date_unregistration date is 274 days before the course began, which means these students did not make it to the first day. We are only interested in students who took the course so we must eliminate students who did not attend.

In [45]:
# removing students who withdrew on or before the first day
student_registration = student_registration.drop(student_registration[(student_registration['date_unregistration'] <= 0)].index)
student_registration.reset_index(drop=True).head()

Unnamed: 0,code_module,code_presentation,id_student,region,imd_band,age_band,gender,highest_education,disability,final_result,date_registration,date_unregistration
0,AAA,2013J,11391,East Anglian Region,90-100%,55<=,M,HE Qualification,N,Pass,-159.0,
1,AAA,2013J,28400,Scotland,20-30%,35-55,F,HE Qualification,N,Pass,-53.0,
2,AAA,2013J,30268,North Western Region,30-40%,35-55,F,A Level or Equivalent,Y,Withdrawn,-92.0,12.0
3,AAA,2013J,31604,South East Region,50-60%,35-55,F,A Level or Equivalent,N,Pass,-52.0,
4,AAA,2013J,32885,West Midlands Region,50-60%,0-35,F,Lower Than A Level,N,Pass,-176.0,


In [46]:
# finds the longest module length in courses and prints it
longest_course = courses['module_presentation_length'].max()
longest_unreg = student_registration['date_unregistration'].max().astype(int)
md(f'''* The longest course from module_presentation length in the courses dataframe was {longest_course} days, yet we see here the latest unregistration date is {longest_unreg} days, which is longer than any course went on.
    ''')

* The longest course from module_presentation length in the courses dataframe was 269 days, yet we see here the latest unregistration date is 444 days, which is longer than any course went on.
    

**All Students with an unregistration point after 269 days:**

In [47]:
# finding students whose courses went on for longer than the maximum course length
student_registration.loc[student_registration['date_unregistration'] > 269]

Unnamed: 0,code_module,code_presentation,id_student,region,imd_band,age_band,gender,highest_education,disability,final_result,date_registration,date_unregistration
25249,FFF,2013J,586851,Wales,0-10%,0-35,M,Lower Than A Level,N,Withdrawn,-22.0,444.0


* It seems to be just this one student is an outlier, but should not affect our overall analysis so we will leave this intact