In [1]:
from functions import *
from ipynb.fs.full.Student_Info import student_info
from ipynb.fs.full.Student_Registration import student_registration

<a id='StudentInfo'></a>

# Student Info and Student Registration

---

```{note}
* The student registration dataframe matches 1:1 with the student_info dataframe, adding only the date the student registered and, if applicable, the date they unregistered, so we will merge these two dataframes.
* Though the number of previous attempts may be interesting to analyze on its own (to see the relationship between students who had to take the course multiple times and the differences in their behavior on the second or higher attempt), here we are only interested in students on their first attempt. The reason is that familiarity with course content is a confounding variable. Because of this, we will remove students on their second or higher attempt. We will then remove num_prev_attempts, since it will no longer contain any interesting data.
* The dataframe columns can then be reordered to keep relevant data together.
```

In [22]:
# Outer-join student_info with student_registration on the module / presentation /
# student keys; indicator=True adds a `_merge` column recording where each row came from.
merge_keys = ['code_module', 'code_presentation', 'id_student']
student_info_reg = student_info.merge(
    student_registration,
    on=merge_keys,
    how='outer',
    indicator=True,
)

# Rows present in student_info only (no matching student_registration record).
unmatched_mask = student_info_reg['_merge'] == 'left_only'
only_student_info = student_info_reg.loc[unmatched_mask]

# NOTE(review): the filter restricting the frame to first attempts is currently disabled:
# student_info_reg = student_info_reg[student_info_reg['num_of_prev_attempts'] == 0]



**Updated Dataframe**

We removed the students that did not attend from student registration, so there will be students in student info who no longer map to a registration date in student registration.

In [23]:
# preview the first rows of the merged frame, including the _merge indicator column
student_info_reg.head()

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,prev_attempts,studied_credits,disability,final_result,date_registration,date_unregistration,_merge
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,False,Pass,-159,,both
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,False,Pass,-53,,both
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,True,Withdrawn,-92,12.0,both
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,False,Pass,-52,,both
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,False,Pass,-176,,both


In [24]:
# preview the student_info rows that found no match in student_registration (_merge == 'left_only')
only_student_info.head()

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,prev_attempts,studied_credits,disability,final_result,date_registration,date_unregistration,_merge
125,AAA,2013J,292923,F,South East Region,A Level or Equivalent,90-100%,35-55,0,180,False,Withdrawn,,,left_only
136,AAA,2013J,305539,F,Wales,Lower Than A Level,80-90%,0-35,0,120,False,Withdrawn,,,left_only
198,AAA,2013J,405961,M,Scotland,A Level or Equivalent,90-100%,0-35,0,240,True,Withdrawn,,,left_only
256,AAA,2013J,1763015,F,Scotland,A Level or Equivalent,10-20%,35-55,0,60,False,Withdrawn,,,left_only
298,AAA,2013J,2318055,M,Wales,A Level or Equivalent,90-100%,35-55,0,60,False,Withdrawn,,,left_only


In [25]:
# Number of student_info rows without a registration match, with thousands separators.
unmatched_total = f"{len(only_student_info):,}"

# md is not defined in the visible cells (presumably from the functions module);
# the cell output shows the text rendered as markdown.
md(f'''
* Here is a dataframe of the {unmatched_total} students who we eliminated from student registration for not having attended the course.
* Here we will eliminate those with left_only as a _merge value from the dataframe
''')


* Here is a dataframe of the 3,097 students who we eliminated from student registration for not having attended the course.
* Here we will eliminate those with left_only as a _merge value from the dataframe


In [26]:
# Keep every merged row except those flagged 'left_only'
# (student_info rows that had no student_registration match).
not_left_only = student_info_reg['_merge'] != 'left_only'
student_info_reg = student_info_reg[not_left_only]

In [27]:
# Select and reorder columns so module, student and registration fields sit together.
# Columns not listed here (prev_attempts, studied_credits and _merge in the preview above)
# are dropped by this selection.
ordered_columns = [
    'code_module', 'code_presentation', 'id_student',
    'region', 'imd_band', 'age_band', 'gender', 'highest_education', 'disability',
    'final_result', 'date_registration', 'date_unregistration',
]
student_info_reg = student_info_reg[ordered_columns]

In [28]:
# preview the merged dataframe after the unmatched rows were removed and the columns reordered
student_info_reg.head()

Unnamed: 0,code_module,code_presentation,id_student,region,imd_band,age_band,gender,highest_education,disability,final_result,date_registration,date_unregistration
0,AAA,2013J,11391,East Anglian Region,90-100%,55<=,M,HE Qualification,False,Pass,-159,
1,AAA,2013J,28400,Scotland,20-30%,35-55,F,HE Qualification,False,Pass,-53,
2,AAA,2013J,30268,North Western Region,30-40%,35-55,F,A Level or Equivalent,True,Withdrawn,-92,12.0
3,AAA,2013J,31604,South East Region,50-60%,35-55,F,A Level or Equivalent,False,Pass,-52,
4,AAA,2013J,32885,West Midlands Region,50-60%,0-35,F,Lower Than A Level,False,Pass,-176,


In [29]:
# get_size is not defined in the visible cells (presumably from the functions module);
# its output below reports the number of columns and rows
get_size(student_info_reg)

Unnamed: 0,Count
Columns,12
Rows,29496


In [13]:
# changing id_student to the object data type
# student_info_reg was produced by boolean and column-list indexing, so assigning a
# column on it directly can raise SettingWithCopyWarning; assign() returns a new frame
# with the converted column and leaves the column order unchanged.
student_info_reg = student_info_reg.assign(
    id_student=student_info_reg['id_student'].astype(object)
)

**Null Values:**

In [31]:
# count the missing values in each column
student_info_reg.isnull().sum()

code_module                0
code_presentation          0
id_student                 0
region                     0
imd_band                1054
age_band                   0
gender                     0
highest_education          0
disability                 0
final_result               0
date_registration         10
date_unregistration    22521
dtype: int64

* The imd_band variable has 1,054 null values which we may have to work around.
* There are 22,521 null values for date_unregistration which represent the students that did not withdraw from the course.
* We have 10 null values for date_registration, and no mention of this in the dataset documentation, so we will treat this as missing data.

**Unique Counts:**

In [34]:
# count_unique is not defined in the visible cells (presumably from the functions module);
# its output below lists a count per column
count_unique(student_info_reg)

index,Count
code_module,7
code_presentation,4
id_student,26358
region,13
imd_band,10
age_band,3
gender,2
highest_education,5
disability,2
final_result,4


**Unique Categorical Values**

In [35]:
# unique_vals is not defined in the visible cells (presumably from the functions module);
# its output below lists the distinct values found in each categorical column
unique_vals(student_info_reg)

index,Values
code_module,"['AAA', 'BBB', 'CCC', 'DDD', 'EEE', 'FFF', 'GGG']"
code_presentation,"['2013J', '2014J', '2013B', '2014B']"
region,"['East Anglian Region', 'Scotland', 'North Western Region', 'South East Region', 'West Midlands Region', 'Wales', 'North Region', 'South Region', 'Ireland', 'South West Region', 'East Midlands Region', 'Yorkshire Region', 'London Region']"
imd_band,"['90-100%', '20-30%', '30-40%', '50-60%', '80-90%', '70-80%', , '60-70%', '40-50%', '10-20%', '0-10%']"
age_band,"['55<=', '35-55', '0-35']"
gender,"['M', 'F']"
highest_education,"['HE Qualification', 'A Level or Equivalent', 'Lower Than A Level', 'Post Graduate Qualification', 'No Formal quals']"
final_result,"['Pass', 'Withdrawn', 'Fail', 'Distinction']"


**Duplicate Values**

In [36]:
# get_dupes is not defined in the visible cells (presumably from the functions module);
# its output below reports whether duplicate values were found
get_dupes(student_info_reg)

There are no Duplicate Values

**Statistics:**

In [41]:
# summary statistics for the dataframe, rounded to one decimal place
student_info_reg.describe().round(1)

Unnamed: 0,disability,date_registration,date_unregistration
count,29496,29486.0,6975.0
unique,2,,
top,False,,
freq,26662,,
mean,,-66.6,86.9
std,,47.7,67.4
min,,-311.0,1.0
25%,,-95.0,26.0
50%,,-53.0,73.0
75%,,-29.0,142.0


In [18]:
# Remove students whose unregistration day is 0 or earlier, i.e. who withdrew on or
# before the first day. Rows with a missing date_unregistration fail the <= test and are kept.
withdrew_by_first_day = student_info_reg['date_unregistration'] <= 0
student_info_reg = student_info_reg[~withdrew_by_first_day]

# The re-indexed frame is only displayed, not stored, so student_info_reg keeps its index labels.
student_info_reg.reset_index(drop=True).head()

Unnamed: 0,code_module,code_presentation,id_student,region,imd_band,age_band,gender,highest_education,disability,final_result,date_registration,date_unregistration
0,AAA,2013J,11391,East Anglian Region,90-100%,55<=,M,HE Qualification,N,Pass,-159.0,
1,AAA,2013J,28400,Scotland,20-30%,35-55,F,HE Qualification,N,Pass,-53.0,
2,AAA,2013J,30268,North Western Region,30-40%,35-55,F,A Level or Equivalent,Y,Withdrawn,-92.0,12.0
3,AAA,2013J,31604,South East Region,50-60%,35-55,F,A Level or Equivalent,N,Pass,-52.0,
4,AAA,2013J,32885,West Midlands Region,50-60%,0-35,F,Lower Than A Level,N,Pass,-176.0,


In [22]:
# Latest unregistration day recorded for any student, as a whole number.
longest_unreg = int(student_info_reg['date_unregistration'].max())

# Longest presentation length found in the courses dataframe.
# NOTE(review): `courses` is not defined in the visible cells; presumably it comes
# from `from functions import *`. Confirm it exists on a fresh kernel.
longest_course = courses['module_presentation_length'].max()

# Render the comparison of the two values as markdown.
comparison_note = f'''* The longest course from module_presentation length in the courses dataframe was {longest_course} days, yet we see here the latest unregistration date is {longest_unreg} days, which is longer than any course went on.
    '''
md(comparison_note)

* The longest course from module_presentation length in the courses dataframe was 269 days, yet we see here the latest unregistration date is 444 days, which is longer than any course went on.
    

**All Students with an unregistration point after 269 days:**

In [23]:
# finding students whose unregistration day falls after the maximum course length
# Uses longest_course (computed in the previous cell from the courses dataframe) instead of
# a hard-coded 269, so the threshold follows the data if the course lengths change.
student_info_reg.loc[student_info_reg['date_unregistration'] > longest_course]

Unnamed: 0,code_module,code_presentation,id_student,region,imd_band,age_band,gender,highest_education,disability,final_result,date_registration,date_unregistration
25249,FFF,2013J,586851,Wales,0-10%,0-35,M,Lower Than A Level,N,Withdrawn,-22.0,444.0


* This one student appears to be the only outlier; it should not affect our overall analysis, so we will leave the record intact.