In [128]:
from ipynb.fs.full.Student_Info import student_info_reg
from ipynb.fs.full.Assessments import cleaned_assessments
from functions import *

@register_cell_magic
def markdown(line, cell):
    return md(cell.format(**globals()))
;

''

---

<h2>VLE and Student VLE Dataframes</h2>

---

<h3>VLE</h3>

The VLE dataframe contains information about materials available on the Virtual Learning Environment.

In [129]:
vle.head()

Unnamed: 0,id_site,code_module,code_presentation,activity_type,week_from,week_to
0,546943,AAA,2013J,resource,,
1,546712,AAA,2013J,oucontent,,
2,546998,AAA,2013J,resource,,
3,546888,AAA,2013J,url,,
4,547035,AAA,2013J,resource,,


---

<h4>VLE Contents</h4>

* <b>id_site</b>: The site ID is the unique identifier for the online resource.
* <b>code_module</b>: The code module is the module the resource is associated with.
* <b>code_presentation</b>: The code presentation represents the time the module was held at.
* <b>activity_type</b>: The activity type is the type of online material.
* <b>week_from</b>: The week from is the week the material was intended to be used from.
    - week_from will not be used in our analysis due to it being irrelevant information and will be dropped.
* <b>week_to</b>: The week to is the week the material was intended to be used until.
    - week_to will not be used in our analysis due to it being irrelevant information and will be dropped.

In [130]:
# dropping week_to and week_from from VLE dataframe
vle = vle.drop(columns=['week_from', 'week_to'])

---

<h4>Student VLE</h4>

The Student VLE Dataframe contains information about student interactions with the online resources in the Virtual Learning Environment.

In [131]:
student_vle.head()

Unnamed: 0,code_module,code_presentation,id_student,id_site,date,sum_click
0,AAA,2013J,28400,546652,-10,4
1,AAA,2013J,28400,546652,-10,1
2,AAA,2013J,28400,546652,-10,1
3,AAA,2013J,28400,546614,-10,11
4,AAA,2013J,28400,546714,-10,1


---

<h4>Student VLE Contents</h4>

* <b>code_module</b>: The code module is the module the resource and student are associated with.
* <b>code_presentation</b>: The code presentation represents the time the module was held at.
* <b>id_site</b>: The site ID is the unique identifier for the online resource with which the student engaged.
* <b>date</b>: The date represents the date that the student engaged with the material relevant to the start date of the module.
* <b>sum_click</b>: The sum click represents the number of clicks the student made on that day.

```{note}
Since we are only interested in information that is pertinent to the student, we will be merging the VLE and Student VLE dataframes to have only the relevant information of each.
```

<h4>Merged VLE and Student VLE Dataframe</h4>

In [132]:
# merging vle & student vle with a full outer join on common columns
merged_vle = student_vle.merge(vle, how='outer', on=['id_site', 'code_module', 'code_presentation'],indicator=True)
merged_vle.head()

Unnamed: 0,code_module,code_presentation,id_student,id_site,date,sum_click,activity_type,_merge
0,AAA,2013J,28400.0,546652,-10.0,4.0,forumng,both
1,AAA,2013J,28400.0,546652,-10.0,1.0,forumng,both
2,AAA,2013J,28400.0,546652,-10.0,1.0,forumng,both
3,AAA,2013J,28400.0,546652,-10.0,8.0,forumng,both
4,AAA,2013J,30268.0,546652,-10.0,3.0,forumng,both


The added merge column tells us if the data maps perfectly to both dataframes, or if it is only found on the right or left side, the right side in this case being the VLE dataframe and the left side being the student_VLE dataframe

In [133]:
# makes a dataframe containing only entries where _merge value is not both.
vle_only = merged_vle.loc[merged_vle['_merge'] != 'both']
vle_only.head()

Unnamed: 0,code_module,code_presentation,id_student,id_site,date,sum_click,activity_type,_merge
10655280,AAA,2013J,,546897,,,url,right_only
10655281,AAA,2013J,,546872,,,subpage,right_only
10655282,AAA,2014J,,1032910,,,url,right_only
10655283,AAA,2014J,,1072237,,,url,right_only
10655284,AAA,2014J,,1027118,,,url,right_only


In [134]:
# checking the unique values of the dataframe. Only right_only
vle_only['_merge'].unique()

['right_only']
Categories (3, object): ['left_only', 'right_only', 'both']

In this case the data either maps perfectly to both or is only found on the right hand side, or the VLE dataframe. This represents materials which we have no student activity associated with which can be dropped along with the _merge column which will have no more interesting information.

In [135]:
# drop rows which have NaN values for id_student
merged_vle = merged_vle.dropna(subset=['id_student'])

# drop _merge column
merged_vle = merged_vle.drop(columns=['_merge'])

# reset index
merged_vle.reset_index(drop=True).head()

Unnamed: 0,code_module,code_presentation,id_student,id_site,date,sum_click,activity_type
0,AAA,2013J,28400.0,546652,-10.0,4.0,forumng
1,AAA,2013J,28400.0,546652,-10.0,1.0,forumng
2,AAA,2013J,28400.0,546652,-10.0,1.0,forumng
3,AAA,2013J,28400.0,546652,-10.0,8.0,forumng
4,AAA,2013J,30268.0,546652,-10.0,3.0,forumng


<b>Aggregating Clicks</b>

* For this analysis we will only be using the sum of the students clicks throughout the course, and so we must add each days clicks per student.

<b>Number of activity types</b>

* We are going to remove activity_type for now. If sum_clicks overall ends up being a good predictor of how a student does, we will add it back.
* We will remove id_site for now since it does not add any information to the resource it maps to.

In [138]:
# removing activity_type and id_site columns
merged_vle = merged_vle.drop(columns=['activity_type', 'id_site'])

KeyError: "['activity_type' 'id_site'] not found in axis"

<b>VLE with clicks per student per module aggregated</b>

In [143]:
# gets sum click as total for the whol module. Removes date since no longer relevant.
aggregates = {'sum_click':'sum', 'code_module':'first', 'code_presentation':'first'}
merged_vle = merged_vle.groupby(['id_student']).aggregate(aggregates).reset_index()

# change id_student to int and then object to remove the .0
merged_vle = merged_vle.astype({'id_student': int})
merged_vle = merged_vle.astype({'id_student': object})

In [144]:
merged_vle = merged_vle[['code_module', 'code_presentation', 'id_student', 'sum_click']]

<b>Merge with Student Info Dataframe</b>

Finally, we will merge the merged VLE dataframe with the Student info dataframe to ensure wwe are only working with students who were not previously eliminated due to dropping out before the first day or for being on higher than their first attempt

<b>Merged VLE and Student Info Dataframes</b>

In [145]:
# outer merge of stud_info and vle dataframes on common columns
merged_vle_si = stud_info.merge(merged_vle, how='outer', on=['id_student', 'code_presentation', 'code_module'],indicator=True)

# show head of resulting dataframe
merged_vle_si

Unnamed: 0,code_module,code_presentation,id_student,region,imd_band,age_band,gender,highest_education,disability,final_result,date_registration,date_unregistration,sum_click,_merge
0,AAA,2013J,11391.0,East Anglian Region,90-100%,55<=,M,HE Qualification,N,Pass,-159.0,,934.0,both
1,AAA,2013J,28400.0,Scotland,20-30%,35-55,F,HE Qualification,N,Pass,-53.0,,1435.0,both
2,AAA,2013J,30268.0,North Western Region,30-40%,35-55,F,A Level or Equivalent,Y,Withdrawn,-92.0,12.0,281.0,both
3,AAA,2013J,31604.0,South East Region,50-60%,35-55,F,A Level or Equivalent,N,Pass,-52.0,,2158.0,both
4,AAA,2013J,32885.0,West Midlands Region,50-60%,0-35,F,Lower Than A Level,N,Pass,-176.0,,1034.0,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28878,FFF,2013J,2694680,,,,,,,,,,48.0,right_only
28879,DDD,2014B,2696376,,,,,,,,,,282.0,right_only
28880,FFF,2013J,2697608,,,,,,,,,,26.0,right_only
28881,FFF,2014B,2697630,,,,,,,,,,1109.0,right_only


For the _merge column for this dataframe, left_only tells us that the data is only found in student info, and right only tells us the data is only found in VLE.

In [146]:
only_vle = merged_vle_si.loc[merged_vle_si['_merge'] == 'right_only']
only_vle.head()

Unnamed: 0,code_module,code_presentation,id_student,region,imd_band,age_band,gender,highest_education,disability,final_result,date_registration,date_unregistration,sum_click,_merge
25760,BBB,2013B,23629,,,,,,,,,,161.0,right_only
25761,DDD,2014B,24213,,,,,,,,,,1992.0,right_only
25762,DDD,2014J,25572,,,,,,,,,,113.0,right_only
25763,BBB,2014B,25629,,,,,,,,,,16.0,right_only
25764,BBB,2014B,25997,,,,,,,,,,13.0,right_only


In [147]:
only_stud_info= merged_vle_si.loc[merged_vle_si['_merge'] == 'left_only']
only_stud_info.head()

Unnamed: 0,code_module,code_presentation,id_student,region,imd_band,age_band,gender,highest_education,disability,final_result,date_registration,date_unregistration,sum_click,_merge
701,BBB,2013B,72070.0,South East Region,60-70%,35-55,M,A Level or Equivalent,N,Withdrawn,-24.0,10.0,,left_only
730,BBB,2013B,133531.0,Wales,30-40%,0-35,F,Lower Than A Level,N,Fail,-24.0,,,left_only
735,BBB,2013B,143854.0,West Midlands Region,10-20,35-55,F,Lower Than A Level,N,Withdrawn,-23.0,27.0,,left_only
800,BBB,2013B,322745.0,Scotland,90-100%,0-35,F,A Level or Equivalent,N,Fail,-85.0,,,left_only
802,BBB,2013B,323914.0,West Midlands Region,10-20,0-35,F,A Level or Equivalent,N,Fail,-136.0,,,left_only


In [148]:
md(f'''
    We have {len(only_vle)} values in only the merged vle, which map to students who had made previous attempts, and {len(only_stud_info)} values in only student_info, which means we have students for whom we have no click data.
    We can drop both of these which are missing values for the purpose of this dataframe since having no clicks gives us nothing to analyze.
    ''')


    We have 3123 values in only vle, which map to students who had made previous attempts, and 2809 values in only student_info, which means we have students for whom we have no click data.
    We can drop both of these which are missing values for the purpose of this dataframe since having no clicks gives us nothing to analyze.
    

In [149]:
# merging vle with the original student data dataframe to make sure that the missing students are the ones we removed.
merged_test = merged_vle.merge(student_info, how='outer', on=['id_student', 'code_module', 'code_presentation'], indicator=True)

# removing entries where num_prev_attempts == 0
merged_test = merged_test[merged_test['num_of_prev_attempts'] == 0]

# checking if any in only the student info dataframe remain (left_only). No output means all of the tests without students map to a student where num_prev_attempts == 0
merged_test.loc[merged_test['_merge']=='left_only']

Unnamed: 0,code_module,code_presentation,id_student,sum_click,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,_merge


In [165]:
# removing any entries where the region or sum click are NaN
merged_vle_si = merged_vle_si.dropna(subset=['region', 'sum_click'])

# reordering the data for clarity
cleaned_vle = merged_vle_si[['code_module', 'code_presentation',  'id_student', 'region', 'imd_band', 'age_band','gender','highest_education', 'disability', 'sum_click', 'final_result']]

In [166]:
cleaned_vle

Unnamed: 0,code_module,code_presentation,id_student,region,imd_band,age_band,gender,highest_education,disability,sum_click,final_result
0,AAA,2013J,11391.0,East Anglian Region,90-100%,55<=,M,HE Qualification,N,934.0,Pass
1,AAA,2013J,28400.0,Scotland,20-30%,35-55,F,HE Qualification,N,1435.0,Pass
2,AAA,2013J,30268.0,North Western Region,30-40%,35-55,F,A Level or Equivalent,Y,281.0,Withdrawn
3,AAA,2013J,31604.0,South East Region,50-60%,35-55,F,A Level or Equivalent,N,2158.0,Pass
4,AAA,2013J,32885.0,West Midlands Region,50-60%,0-35,F,Lower Than A Level,N,1034.0,Pass
...,...,...,...,...,...,...,...,...,...,...,...
25755,GGG,2014J,2640965.0,Wales,10-20,0-35,F,Lower Than A Level,N,41.0,Fail
25756,GGG,2014J,2645731.0,East Anglian Region,40-50%,35-55,F,Lower Than A Level,N,893.0,Distinction
25757,GGG,2014J,2648187.0,South Region,20-30%,0-35,F,A Level or Equivalent,Y,312.0,Pass
25758,GGG,2014J,2679821.0,South East Region,90-100%,35-55,F,Lower Than A Level,N,275.0,Withdrawn


<b>Merge with Assessments Dataframe</b>

Finally we will be creating a merged dataframe of the the merged vle and student info and assessments dataframes. This is so that we can attempt to predict scores based on number of clicks.

In [162]:
merged_vle_assm = assessments_final.merge(merged_vle_si, how='outer', on=['code_module', 'code_presentation', 'id_student', 'region', 'imd_band', 'age_band', 'gender', 'highest_education', 'disability', 'final_result'],indicator=True)
merged_vle_assm

Unnamed: 0,code_module,code_presentation,id_student,region,imd_band,age_band,gender,highest_education,disability,final_result,id_assessment,assessment_type,date_submitted,date,weight,score,sum_click,_merge
0,AAA,2013J,11391.0,East Anglian Region,90-100%,55<=,M,HE Qualification,N,Pass,1752,TMA,18.0,19.0,10.0,78.0,934.0,both
1,AAA,2013J,11391.0,East Anglian Region,90-100%,55<=,M,HE Qualification,N,Pass,1753,TMA,53.0,54.0,20.0,85.0,934.0,both
2,AAA,2013J,11391.0,East Anglian Region,90-100%,55<=,M,HE Qualification,N,Pass,1754,TMA,115.0,117.0,20.0,80.0,934.0,both
3,AAA,2013J,11391.0,East Anglian Region,90-100%,55<=,M,HE Qualification,N,Pass,1755,TMA,164.0,166.0,20.0,85.0,934.0,both
4,AAA,2013J,11391.0,East Anglian Region,90-100%,55<=,M,HE Qualification,N,Pass,1756,TMA,212.0,215.0,30.0,82.0,934.0,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155747,GGG,2014J,2282141.0,Wales,0-10%,35-55,M,A Level or Equivalent,N,Withdrawn,,,,,,,208.0,right_only
155748,GGG,2014J,2338614.0,Scotland,0-10%,35-55,F,A Level or Equivalent,Y,Withdrawn,,,,,,,51.0,right_only
155749,GGG,2014J,2475886.0,East Anglian Region,40-50%,35-55,F,Lower Than A Level,N,Fail,,,,,,,9.0,right_only
155750,GGG,2014J,2608143.0,East Midlands Region,60-70%,35-55,M,HE Qualification,N,Withdrawn,,,,,,,37.0,right_only


This dataframe is a full outer merge on our cleaned assessments dataframe and our cleaned VLE dataframe

In [160]:
merged_vle_assm.loc[merged_vle_assm['_merge'] == 'left_only']

Unnamed: 0,code_module,code_presentation,id_student,region,imd_band,age_band,gender,highest_education,disability,final_result,id_assessment,assessment_type,date_submitted,date,weight,score,sum_click,_merge


In [161]:
merged_vle_assm.loc[merged_vle_assm['_merge'] == 'right_only']

Unnamed: 0,code_module,code_presentation,id_student,region,imd_band,age_band,gender,highest_education,disability,final_result,id_assessment,assessment_type,date_submitted,date,weight,score,sum_click,_merge


In [159]:
merged_vle_assm = merged_vle_assm.dropna(subset=['sum_click', 'id_assessment'])
merged_vle_assm = merged_vle_assm.drop(columns=['_merge'])

In [167]:
cleaned_assessments = merged_vle_assm[['code_module', 'code_presentation','id_student', 'region', 'imd_band', 'age_band', 'gender', 'highest_education', 'disability', 'sum_click', ]]

In [196]:
aggregates = {'score':'sum', 'code_module':'first', 'code_presentation':'first', 'weight': 'sum'}
score_test = cleaned_assessments.groupby(['id_student']).aggregate(aggregates).reset_index()
score_test.loc[score_test['weight']>=200]

Unnamed: 0,id_student,score,code_module,code_presentation,weight
3,23698.0,670.0,CCC,2014J,200.0
16,27116.0,840.0,CCC,2014J,200.0
23,28046.0,346.0,DDD,2013J,200.0
33,29411.0,533.0,CCC,2014J,211.0
35,29639.0,1089.0,CCC,2014J,300.0
...,...,...,...,...,...
23051,2693243.0,1253.0,DDD,2013B,200.0
23056,2694886.0,493.0,DDD,2014B,200.0
23058,2694933.0,1052.0,DDD,2013B,200.0
23060,2695608.0,556.0,DDD,2013J,200.0


<b>Data Types:</b>

In [197]:
student_info.loc[student_info['id_student'] ==8462.0]

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
14395,DDD,2013J,8462,M,London Region,HE Qualification,30-40%,55<=,0,90,N,Withdrawn
17560,DDD,2014J,8462,M,London Region,HE Qualification,30-40%,55<=,1,60,N,Withdrawn


In [179]:
student_assessment.loc[student_assessment['id_student'] ==8462.0]

Unnamed: 0,id_assessment,id_student,date_submitted,is_banked,score
76301,25348,8462,29,0,93.0
76985,25349,8462,51,0,83.0
78452,25350,8462,85,0,87.0
88494,25362,8462,-1,1,93.0
90368,25363,8462,-1,1,83.0
90712,25364,8462,-1,1,83.0
92364,25365,8462,-1,1,87.0


In [182]:
assessments.loc[assessments['id_assessment'] == 25365]

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,date,weight
105,DDD,2014J,25365,TMA,111.0,25.0


In [120]:
vle.dtypes

code_module           object
code_presentation     object
id_student            object
sum_click            float64
dtype: object

In [198]:
assessments

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,date,weight
0,AAA,2013J,1752,TMA,19.0,10.0
1,AAA,2013J,1753,TMA,54.0,20.0
2,AAA,2013J,1754,TMA,117.0,20.0
3,AAA,2013J,1755,TMA,166.0,20.0
4,AAA,2013J,1756,TMA,215.0,30.0
...,...,...,...,...,...,...
201,GGG,2014J,37443,CMA,229.0,0.0
202,GGG,2014J,37435,TMA,61.0,0.0
203,GGG,2014J,37436,TMA,124.0,0.0
204,GGG,2014J,37437,TMA,173.0,0.0
