In [19]:
from functions import *

# Assessments

The assessments dataframe contains information about the unique assessments in each code module and presentation.

In [2]:
# Preview the first rows of the assessments dataframe to see its columns and typical values.
assessments.head()

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,date,weight
0,AAA,2013J,1752,TMA,19.0,10.0
1,AAA,2013J,1753,TMA,54.0,20.0
2,AAA,2013J,1754,TMA,117.0,20.0
3,AAA,2013J,1755,TMA,166.0,20.0
4,AAA,2013J,1756,TMA,215.0,30.0


---

## Assessments Contents

* **code_module**: The code module represents the code name of the course the assessment was held for.
* **code_presentation**: The code name of the presentation (the specific run of the module, e.g. `2013J`) in which the assessment was held.
* **id_assessment**: The assessment ID is the unique identifier for each assessment.
* **assessment_type**: The assessment type represents the kind of assessment it was.
    - There are three assessment types:
        * TMA: Tutor Marked Assessment
        * CMA: Computer Marked Assessment
        * Exam: The Final Exam
* **date**: The date is how many days from the start of the course the assessment took place
* **weight**: The weight is the weighted value of the assessment. Exams should have a weight of 100, while the rest of the assessments in a module should add up to 100 in total.

---

## Assessments Information

**Size**

In [3]:
# Report the dimensions of the dataframe (the helper's output lists a column count and a row count).
get_size(assessments)

Unnamed: 0,Count
Columns,6
Rows,206


In [4]:
# Render a one-sentence Markdown summary of the dataframe's dimensions.
n_rows, n_cols = assessments.shape
md(f'''
Assessments has {n_cols} columns and {n_rows} rows representing unique exams.
''')


Assessments has 6 columns and 206 rows representing unique exams.


**Data Types**

In [5]:
# List the dtype of every column so we can decide which ones need converting.
get_dtypes(assessments)

index,Type
code_module,object
code_presentation,object
id_assessment,int64
assessment_type,object
date,float64
weight,float64


* `id_assessment` is a categorical value and so should be converted to `string`
* `object` types should be converted to strings
* Both of the `float64` typed variables are whole numbers and should be converted to `int64`

In [16]:
# converting the data types
# Assessment IDs are identifiers, not quantities, so they are stored as strings.
assessments['id_assessment'] = assessments['id_assessment'].astype(str)
# convert_dtypes() moves columns to pandas' nullable extension dtypes.
# convert_integer=False turns off conversion to the integer extension types.
# NOTE(review): the notes preceding this cell say the float64 columns should
# become int64, but this flag prevents that -- confirm which is intended.
assessments = assessments.convert_dtypes(convert_integer=False)

**Null Values**

In [7]:
# Show the number of null values in each column.
null_vals(assessments)

index,Null Values
code_module,0
code_presentation,0
id_assessment,0
assessment_type,0
date,11
weight,0


In [8]:
# Explain how many assessment dates are missing and how they will be filled in.
# The count is computed once and named so the sentence below reads cleanly.
missing_dates = assessments['date'].isnull().sum()
md(f'''
* We have {missing_dates} null data points for assessment date. 
* The documentation of this dataset states that if the exam date is missing then it is at the end of the last presentation week. 
* We can find this information in the courses dataframe, and add them in to get rid of the NaNs.
''')


* We have 11 null data points for assessment date. 
* The documentation of this dataset states that if the exam date is missing then it is as the end of the last presentation week. 
* We can find this information in the courses dataframe, and add them in to get rid of the NaNs.


In [9]:
# adding the dates for the null test dates
# A missing date means the assessment falls at the end of its presentation, so
# look up the presentation length in `courses`, matched on module + presentation.
# This is done with one vectorised merge instead of assigning a one-element
# Series into each cell row by row.
key_cols = ['code_module', 'code_presentation']
presentation_end = assessments[key_cols].merge(
    courses[key_cols + ['module_presentation_length']],
    on=key_cols,
    how='left',
    validate='many_to_one',  # fail loudly if courses has duplicate module/presentation keys
)['module_presentation_length']
# merge() returns a fresh RangeIndex; restore the original row labels so that
# fillna lines each presentation length up with its own assessment row.
presentation_end.index = assessments.index
# Only the null dates are replaced; existing dates are left untouched.
assessments['date'] = assessments['date'].fillna(presentation_end)

# reprinting to ensure it worked
dataframe(assessments.isnull().sum(), columns=['Null Values'])

Unnamed: 0,Null Values
code_module,0
code_presentation,0
id_assessment,0
assessment_type,0
date,0
weight,0


**Unique Counts**

In [10]:
# Count the distinct values found in each column.
count_unique(assessments)

index,Count
code_module,7
code_presentation,4
id_assessment,206
assessment_type,3
date,78
weight,24


In [11]:
# State how many distinct assessment IDs the table holds.
unique_id_count = assessments['id_assessment'].nunique()
md(f'''
There are {unique_id_count} unique assessment ID's
''')


There are 206 unique assessment ID's


**Unique Categorical Values**

In [12]:
# List the distinct values of the categorical columns to check them against the data description.
unique_vals(assessments)

index,Values
code_module,"['AAA', 'BBB', 'CCC', 'DDD', 'EEE', 'FFF', 'GGG']"
code_presentation,"['2013J', '2014J', '2013B', '2014B']"
assessment_type,"['TMA', 'Exam', 'CMA']"


Everything here matches what the data's description tells us to expect.

In [21]:
# Tabulate how many assessments there are of each assessment type.
dataframe(assessments['assessment_type'].value_counts())

Unnamed: 0,assessment_type
TMA,106
CMA,76
Exam,24


In [30]:
# Look the counts up by label rather than by position: value_counts() is
# indexed by assessment type and ordered by frequency, so positions 0/1/2 only
# correspond to TMA/CMA/Exam by coincidence of the current data. Integer keys
# on a label-indexed Series are also deprecated in recent pandas.
type_counts = assessments['assessment_type'].value_counts()
# .get(..., 0) reports zero instead of raising if a type is absent from the data.
TMA_count = type_counts.get('TMA', 0)
CMA_count = type_counts.get('CMA', 0)
exam_count = type_counts.get('Exam', 0)
md(f'''
There are {TMA_count} Tutor Marked Assessments, {CMA_count} Computer Marked Assessments and {exam_count} final exams in our data.
''')


There are 106 Tutor Marked Assessements, 76 Computer Marked Assessments and 24 final exams in our data.


In [None]:
# Total weight per presentation and module, leaving out every row whose weight
# is exactly 100 (the weight the narrative attributes to final exams).
# The string alias 'sum' is used because passing the NumPy callable as aggfunc
# is deprecated in recent pandas releases; the result is the same.
pd.pivot_table(
    assessments[assessments['weight'] != 100.0],
    index=['code_presentation', 'code_module'],
    values='weight',
    aggfunc='sum',
)

In [None]:
# Narrative note on how assessment weights are expected to add up.
weights_note = '''
Our data tells us that final exams are weighted 100 and the weights of the rest of the exams in a module should amount to 100.
'''
md(weights_note)

In [44]:
# Total assessment weight per presentation and module, across all assessment types.
# The string alias 'sum' is used because passing the NumPy callable as aggfunc
# is deprecated in recent pandas releases; the result is the same.
pd.pivot_table(
    assessments,
    index=['code_presentation', 'code_module'],
    values='weight',
    aggfunc='sum',
)

Unnamed: 0_level_0,Unnamed: 1_level_0,weight
code_presentation,code_module,Unnamed: 2_level_1
2013B,BBB,200.0
2013B,DDD,200.0
2013B,FFF,200.0
2013J,AAA,200.0
2013J,BBB,200.0
2013J,DDD,200.0
2013J,EEE,200.0
2013J,FFF,200.0
2013J,GGG,100.0
2014B,BBB,200.0


This pivot table shows the module presentation, the module and the total of the weights of the assessments in the module.
Because the final exam is weighted 100 and the other assessments should add up to another 100, we should only have 200 points in each module. We see here that CCC modules have 300 in total weight and GGG modules have 100 in total weight.

In [60]:
# Inspect the assessments of the two modules singled out above (CCC and GGG).
modules_of_interest = ['CCC', 'GGG']
assessments.loc[assessments['code_module'].isin(modules_of_interest)]

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,date,weight
54,CCC,2014B,24286,CMA,18.0,2.0
55,CCC,2014B,24287,CMA,67.0,7.0
56,CCC,2014B,24288,CMA,137.0,8.0
57,CCC,2014B,24289,CMA,207.0,8.0
58,CCC,2014B,24282,TMA,32.0,9.0
59,CCC,2014B,24283,TMA,102.0,22.0
60,CCC,2014B,24284,TMA,151.0,22.0
61,CCC,2014B,24285,TMA,200.0,22.0
62,CCC,2014B,24290,Exam,,100.0
63,CCC,2014B,40087,Exam,,100.0


We can see here that CCC Modules had two final exams, and the GGG modules consisted only of a final exam. 
In the student_assessment dataframe we find that most students are missing final exam scores. In order to normalize our data, and make the scores roughly

In [None]:
# Total weight per presentation and module, leaving out every row whose weight
# is exactly 100 (the weight the narrative attributes to final exams).
# The string alias 'sum' is used because passing the NumPy callable as aggfunc
# is deprecated in recent pandas releases; the result is the same.
pd.pivot_table(
    assessments[assessments['weight'] != 100.0],
    index=['code_presentation', 'code_module'],
    values='weight',
    aggfunc='sum',
)

In [41]:
# Show every assessment that belongs to module GGG.
is_ggg = assessments['code_module'].eq('GGG')
assessments.loc[is_ggg]

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,date,weight
176,GGG,2013J,37418,CMA,229.0,0.0
177,GGG,2013J,37419,CMA,229.0,0.0
178,GGG,2013J,37420,CMA,229.0,0.0
179,GGG,2013J,37421,CMA,229.0,0.0
180,GGG,2013J,37422,CMA,229.0,0.0
181,GGG,2013J,37423,CMA,229.0,0.0
182,GGG,2013J,37415,TMA,61.0,0.0
183,GGG,2013J,37416,TMA,124.0,0.0
184,GGG,2013J,37417,TMA,173.0,0.0
185,GGG,2013J,37424,Exam,229.0,100.0


**Duplicate Values:**

In [13]:
# Check the dataframe for duplicate values; the helper reports whether any were found.
get_dupes(assessments)

There are no Duplicate Values

**Numerical Values**

In [17]:
# Summary statistics for the numeric columns, rounded to two decimal places.
assessments.describe().round(2)

Unnamed: 0,date,weight
count,206.0,206.0
mean,150.97,20.87
std,78.16,30.38
min,12.0,0.0
25%,81.25,0.0
50%,159.0,12.5
75%,227.0,24.25
max,269.0,100.0
