In [2]:
from functions import *
from ipynb.fs.full.Student_Info import student_info
from ipynb.fs.full.Assessments import assessments

# Student Assessment

---

The Student Assessments dataframe contains information about each student and the assessments they took during the module

In [3]:
student_assessment.head()

Unnamed: 0,id_assessment,id_student,date_submitted,is_banked,score
0,1752,11391,18,0,78.0
1,1752,28400,22,0,70.0
2,1752,31604,17,0,72.0
3,1752,32885,26,0,69.0
4,1752,38053,19,0,79.0


---

## Student Assessment Contents

* **id_assessment**: The assessment ID is the unique identifier for the assessment the student took.
* **id_student**: The student ID is the unique identifier for the student who took the assessment.
* **date_submitted**: The date submitted is the date the student submitted the exam relevant to the start date of the module.
* **is_banked**: Whether the score for the assessment is banked indicates wheter the assessment result was transferred from a previous presentation.
    - is_banked has no relevant information to our analysis and can be removed
* **score**: The score the student received for the assessment. 40 or above is considered a passing score.

---

## Student Assessment Information

**Size**

In [5]:
# get the size of student_assessment
get_size(student_assessment)

Unnamed: 0,Count
Columns,4
Rows,173912


In [6]:
# store the size of student_assessment's columns
sa_cols = len(student_assessment.columns)
# store the size of student_assessment's rows
sa_rows = len(student_assessment)
md(f'''
Student Assessment has {sa_cols} columns and {"{:,}".format(sa_rows)} rows, which is how many exams we have data for.
''')


Student Assessment has 4 columns and 173,912 rows, which is how many exams we have data for.


**Data Types**

In [7]:
# get student_assessment column datatypes
get_dtypes(student_assessment)

index,Type
id_assessment,int64
id_student,int64
date_submitted,int64
score,float64


* `id_student` and `id_assessments` are both categorical values and so should be converted from `int64` to `string`

In [8]:
# converting the data types
student_assessment = student_assessment.astype({'id_assessment': str, 'id_student': str})
# change student_assessment datatypes to values pandas supports better
student_assessment = student_assessment.convert_dtypes()
student_assessment.dtypes

id_assessment     string
id_student        string
date_submitted     Int64
score              Int64
dtype: object

**Null Values**

In [9]:
# get null values if any
null_vals(student_assessment)

index,Null Values
id_assessment,0
id_student,0
date_submitted,0
score,173


In [10]:
null_score = student_assessment['score'].isnull().sum()
md(f'''
* We have {null_score} null values for score, which is important as it is a value we will be trying to predict.
''')


* We have 173 null values for score, which is important as it is a value we will be trying to predict.


In [11]:
# make a dataframe of students with a score of 0 and display it
zero_scores = student_assessment.loc[student_assessment['score'] == 0]
zero_scores.head()

Unnamed: 0,id_assessment,id_student,date_submitted,score
785,1754,2456480,123,0
4322,14984,554986,24,0
4730,14985,141823,46,0
5391,14985,542259,46,0
5509,14985,549078,48,0


In [12]:
md(f'''
The first thing to check would be whether there are students with a 0 for a score to see if the NaNs represent 0's.
We see here that there are {len(zero_scores)} records of assessments with a 0 score, so the NaNs are not necessarily 0's.
''')


The first thing to check would be whether there are students with a 0 for a score to see if the NaNs represent 0's.
We see here that there are 329 records of assessments with a 0 score, so the NaNs are not necessarily 0's.


In [13]:
# make a dataframe of all assessments with NaN scores
NaN_scores = student_assessment.loc[student_assessment['score'].isnull() == True]
NaN_scores.head()

Unnamed: 0,id_assessment,id_student,date_submitted,score
215,1752,721259,22,
937,1754,260355,127,
2364,1760,2606802,180,
3358,14984,186780,77,
3914,14984,531205,26,


Here is a dataframe of the assessments which are missing scores

In [14]:
# make a dataframe of students whose score is NaN from student info
#initiate dataframe to store students with NaN scores
students_w_NaN_scores = pd.DataFrame()

# iterate through NaN_scores
for index, row in NaN_scores.iterrows():
    # if student_id from NaN scores is found in student_info append that students information to a new dataframe students_w_NaN_scores
    students_w_NaN_scores = students_w_NaN_scores.append(student_info.loc[student_info['id_student'] == row['id_student']])

In [15]:
# display the new dataframe
students_w_NaN_scores.head()

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,prev_attempts,studied_credits,disability,final_result
227,AAA,2013J,721259,F,South Region,Lower Than A Level,50-60%,55<=,0,120,False,Withdrawn
638,AAA,2014J,721259,F,South Region,Lower Than A Level,50-60%,55<=,1,60,False,Withdrawn
108,AAA,2013J,260355,F,London Region,A Level or Equivalent,80-90%,35-55,0,60,False,Withdrawn
466,AAA,2014J,260355,F,London Region,A Level or Equivalent,80-90%,35-55,1,120,False,Withdrawn
733,AAA,2014J,2606802,M,North Region,A Level or Equivalent,60-70%,0-35,0,60,False,Fail


This dataframe contains the students which are missing scores for their exams

In [16]:
# get the counts of each student result within the NaN scores dataframe
dataframe(students_w_NaN_scores['final_result'].value_counts())

Unnamed: 0,final_result
Withdrawn,104
Fail,82
Pass,40
Distinction,1


For students which withdrew or failed it makes sense that some of their test scores would be missing. For the passed students, it is possible that they still made it by without passing an exam. The student with distinction is of note. Let's check their record first.

In [17]:
# locate the student in students_w_NaN scores whose final_result was Distinction
students_w_NaN_scores.loc[students_w_NaN_scores['final_result'] == 'Distinction']

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,prev_attempts,studied_credits,disability,final_result
11401,CCC,2014J,571765,M,South East Region,No Formal quals,20-30%,0-35,0,60,False,Distinction


Above we have the student in question with ID 571765. Now let's see the rest of their test scores

In [18]:
# locate the other test scores of the student with Distinction
student_assessment.loc[student_assessment['id_student'] == '571765'].fillna(0)

Unnamed: 0,id_assessment,id_student,date_submitted,score
54125,24291,571765,31,86
55788,24292,571765,99,95
56257,24293,571765,156,97
57455,24294,571765,176,88
59287,24295,571765,21,78
60075,24296,571765,69,86
61964,24297,571765,147,92
63525,24298,571765,219,83
63995,24299,571765,245,98
75592,25348,571765,24,98


According to the data source, a score of 40 or less is interpreted as failure. This student receieved excellent marks on their exams aside from the NaN value which we have filled with a 0 here. It is very possible that this student still received distinction with a 0 on one exam. Also of note is that the exam was submitted late into the module, and possibly defaulted to a 0. Let's do another test case with the first student in the dataframe of students with NaN scores who still passed.

In [19]:
# locate the students who passed with NaN test scores
students_w_NaN_scores.loc[students_w_NaN_scores['final_result'] == 'Pass'].head()

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,prev_attempts,studied_credits,disability,final_result
1361,BBB,2013B,502717,F,Wales,Lower Than A Level,80-90%,0-35,1,240,False,Pass
1458,BBB,2013B,515497,F,London Region,A Level or Equivalent,0-10%,0-35,0,120,False,Pass
3059,BBB,2013J,501208,F,South East Region,Lower Than A Level,60-70%,35-55,1,60,False,Pass
2760,BBB,2013J,342685,F,Scotland,A Level or Equivalent,30-40%,0-35,0,60,True,Pass
3778,BBB,2013J,583473,M,South Region,Lower Than A Level,90-100%,35-55,0,120,False,Pass


Here is the dataframe of students with a NaN value test score who still passed.
We will check the scores of the first student, 502717.

In [20]:
# locate the test scores of the first passing student with NaN scores
student_assessment.loc[student_assessment['id_student'] == '502717'].fillna(0)

Unnamed: 0,id_assessment,id_student,date_submitted,score
3762,14984,502717,19,67
5116,14985,502717,46,76
6088,14986,502717,96,60
7187,14987,502717,138,77
8847,14989,502717,216,0
9899,14991,502717,60,100
11080,14992,502717,95,100
11959,14993,502717,126,80
13976,14995,502717,189,100


Here we can see the test scores of the student with 502717. Once again we see that it was possible for them to have passed with a 0, and the exam was submitted late into the module.

With this information we will fill the NA values with 0's under the assumption that these exams were not turned in.

In [21]:
# putting 0 for the NA scores in student_assessment
student_assessment = student_assessment.fillna(0)

**Duplicate Values**

In [22]:
# gives a dataframe of duplicate values if any
get_dupes(student_assessment)

There are no Duplicate Values

**Unique Value Counts**

In [23]:
# gives a dataframe of counts of unique values per column
count_unique(student_assessment)

index,Count
id_assessment,188
id_student,23369
date_submitted,312
score,101


In [24]:
assmnt_count = student_assessment['id_assessment'].nunique()
total_assmnts = assessments['id_assessment'].nunique()
md(f'''
* There are {assmnt_count} unique assessments that students took.
* This is less than the {total_assmnts} assessments we observed in the assessments dataframe meaning that there are some assessments on record that students did not take.
''')


* There are 188 unique assessments that students took.
* This is less than the 206 assessments we observed in the assessments dataframe meaning that there are some assessments on record that students did not take.


**Numerical Values**

In [26]:
student_assessment.describe().round(1)

Unnamed: 0,date_submitted,score
count,173912.0,173912.0
mean,116.0,75.7
std,71.5,18.9
min,-11.0,0.0
25%,51.0,65.0
50%,116.0,80.0
75%,173.0,90.0
max,608.0,100.0


In [59]:
mean_score = student_assessment['score'].mean().round(1)
date_max = student_assessment['date_submitted'].max()
date_min = student_assessment['date_submitted'].min()
max_course_length = courses['module_presentation_length'].max()
md(f'''
* The average test score is {mean_score} so most students are passing handily if 40 is considered a failing score.
* The minimum date_submitted is {date_min} so it is possible the students had access to the first exam early.
* The maximum date submitted is {date_max}, which is around 2.5 times longer than any course went on for.
* Let's check for records that are over the maximum course length of {max_course_length} days.
''')


* The average test score is 75.7 so most students are passing handily if 40 is considered a failing score.
* The minimum date_submitted is -11 so it is possible the students had access to the first exam early.
* The maximum date submitted is 608, which is around 2.5 times longer than any course went on for.
* Let's check for records that are over the maximum course length of 269 days.


In [46]:
late_tests = student_assessment.loc[student_assessment['date_submitted'] > 269].sort_values(by='date_submitted').reset_index(drop=True)
late_tests

Unnamed: 0,id_assessment,id_student,date_submitted,score
0,15022,1723749,270,0
1,30722,691701,274,0
2,25368,2341830,279,49
3,24299,555498,285,58
4,34879,595935,287,96
...,...,...,...,...
68,34881,325750,608,66
69,34880,325750,608,68
70,34878,325750,608,74
71,34882,325750,608,68


In [53]:
late_test_count = len(late_tests['date_submitted'])
late_test_min = late_tests['date_submitted'].min()
late_test_max = late_tests['date_submitted'].max()
late_test_avg = late_tests['date_submitted'].mean().round(1)
md(f'''
* There are {late_test_count} records of students handing in their exams well after the end of the module.
* These dates range from {late_test_min} days after the course began and {late_test_max} days after the course began with an average of {late_test_avg} days.
* The data source makes no mention of these, and they should not affect our analysis, so though strage, we will leave these records
''')


* There are 73 records of students handing in their exams well after the end of the module.
* These dates range from 270 days after the course began and 608 days after the course began with an average of 470.2 days.
* The data source makes no mention of these, and they should not affect our analysis, so though strage, we will leave these records


In [57]:
early_tests = student_assessment.loc[student_assessment['date_submitted'] < 0].sort_values('date_submitted').head(10)
early_tests

Unnamed: 0,id_assessment,id_student,date_submitted,score
28783,15008,559381,-11,63
76237,25348,2472145,-10,85
15866,14996,610700,-9,75
109487,34865,2389267,-8,84
110172,34865,539759,-8,98
104635,34860,514913,-8,66
110202,34865,542386,-8,94
110217,34865,543599,-8,100
110317,34865,550813,-8,88
110358,34865,553649,-8,95


In [75]:
student_assessment.loc[student_assessment['id_student'] == '559381']

Unnamed: 0,id_assessment,id_student,date_submitted,score
28783,15008,559381,-11,63
30301,15009,559381,38,75
31431,15010,559381,80,60
31985,15011,559381,107,80
33040,15012,559381,138,87
33877,15013,559381,190,75
34282,15015,559381,49,80
35631,15016,559381,84,80
36673,15017,559381,119,80
37187,15018,559381,154,100


In [81]:
for i in early_tests.iter:
    print()
    if student_assessment.loc[student_assessment['id_student'] == ['id_student']].head(1) == r:
        print(r)

i


ValueError: ('Lengths must match to compare', (173912,), (1,))

**Merged Assessment/Student_info dataframes**

In order to remove the students that we removed for the number of previous attempts, we must merge assessments and student info and find the difference

In [None]:
# merged 'student info/assessments' with a full outer join on their common columns
merged_si_assm = student_assessment.merge(student_info, how='outer', on=['id_student', 'code_module', 'code_presentation'], indicator=True)
merged_si_assm.head()

For this merge column the right side would be the student info dataframe and the left side would be assessments. If an entry receives the label of right_only there is a student who has no assessments, if the label is left_only, there is an assessment that doesn't match up with a student.

In [None]:
# variable for where merge is left_only, and only found on the 
only_assessments = merged_si_assm.loc[merged_si_assm['_merge']=='left_only']
only_student_info = merged_si_assm.loc[merged_si_assm['_merge']=='right_only']

**Assessments that do not map to students**:

In [None]:
only_assessments.head()

**Students without any test scores**:

In [None]:
only_student_info.head()

In [None]:
md(f'''
    We have {len(only_assessments)} values in only assessments, which map to students who had made previous attempts which we eliminated, and {len(only_student_info)} values in only student_info, which means we have students for whom we have no test scores.
    We can drop both of these which are missing values for the purpose of this dataframe since we are just analyzing test scores
    ''')

In [None]:
# merging assessments with the original student data dataframe to make sure that the missing students are the ones we removed.
merged_test = student_assessment.merge(student_info, how='outer', on=['id_student', 'code_module', 'code_presentation'], indicator=True)

# removing entries where num_prev_attempts == 0
merged_test = merged_test[merged_test['num_of_prev_attempts'] == 0]

# checking if any in only the student info dataframe remain (left_only). No output means all of the tests without students map to a student where num_prev_attempts == 0
merged_test.loc[merged_test['_merge']=='left_only']

In [None]:
# removing any student with NaN values in id_assessment or region
merged_si_assm = merged_si_assm.dropna(subset=['id_assessment', 'region'])

In [None]:
# reordering dataframe columns to group like data
merged_si_assm = merged_si_assm[['code_module', 'code_presentation', 'id_student', 'region', 'imd_band', 'age_band', 'gender', 'highest_education', 'disability', 'final_result', 'id_assessment', 'assessment_type', 'date_submitted', 'date', 'weight', 'score']]

In [None]:
# converting the data types back
merged_si_assm = merged_si_assm.astype({'id_assessment': int, 'id_student': int})
merged_si_assm = merged_si_assm.astype({'id_assessment': object, 'id_student': object})

In [None]:
# reset the index
merged_si_assm.reset_index(drop=True).head()

In [None]:
student_assessment = merged_si_assm

**Unique Counts**

In [None]:
student_assessment.nunique()

**Unique Categorical Values**

In [None]:
unique_vals(student_assessment)

**Duplicate Values:**

In [None]:
duplicate_vals(student_assessment)

**Statistics**

In [None]:
student_assessment.describe()