In [1]:
from functions import *

# VLE
---
## VLE
---

The VLE dataframe contains information about materials available on the Virtual Learning Environment.

In [26]:
# preview the first five rows of the vle dataframe (head() defaults to 5 rows)
vle.head()

Unnamed: 0,id_site,code_module,code_presentation,activity_type
0,546943,AAA,2013J,resource
1,546712,AAA,2013J,oucontent
2,546998,AAA,2013J,resource
3,546888,AAA,2013J,url
4,547035,AAA,2013J,resource


---

#### VLE Information

**Size**

In [2]:
## return a dataframe holding vle's column count and row count
get_size(vle)

Unnamed: 0,Count
Columns,6
Rows,6364


In [3]:
## unpack vle's dimensions in one step: shape is (row count, column count)
vle_rows, vle_cols = vle.shape
## hand the summary sentence to md; the ":," format spec adds the thousands separator to the row count
md(f'''
VLE has {vle_cols} features and {vle_rows:,} rows which represent the online resources we have data for.
''')


VLE has 6 columns and 6,364 rows which represent the online resources we have data for.


---

#### VLE Contents

* **id_site**: The site ID is the unique identifier for the online resource.
* **code_module**: The code module is the module the resource is associated with.
* **code_presentation**: The code presentation identifies when the module was held.
* **activity_type**: The activity type is the type of online material.
* **week_from**: The week from is the week the material was intended to be used from.
    - week_from will be dropped because it is not relevant to our analysis.
* **week_to**: The week to is the week the material was intended to be used until.
    - week_to will be dropped because it is not relevant to our analysis.

In [3]:
## dropping week_to and week_from from VLE dataframe
## errors='ignore' keeps this cell safe to re-run: without it a second execution
## raises a KeyError because the columns have already been removed
vle = vle.drop(columns=['week_from', 'week_to'], errors='ignore')

# Student VLE
---

The Student VLE dataframe contains information about student interactions with the online resources in the Virtual Learning Environment.

In [5]:
# show the first 5 rows of the student_vle dataframe
student_vle.head()

Unnamed: 0,code_module,code_presentation,id_student,id_site,date,sum_click
0,AAA,2013J,28400,546652,-10,4
1,AAA,2013J,28400,546652,-10,1
2,AAA,2013J,28400,546652,-10,1
3,AAA,2013J,28400,546614,-10,11
4,AAA,2013J,28400,546714,-10,1


**Size**

In [3]:
# return a dataframe holding student_vle's column count and row count
get_size(student_vle)

Unnamed: 0,Count
Columns,6
Rows,10655280


In [6]:
# unpack student_vle's dimensions in one step: shape is (row count, column count)
stud_vle_rows, stud_vle_cols = student_vle.shape
# hand the summary sentence to md; the ":," format spec adds the thousands separator to the row count
md(f'''
Student VLE has {stud_vle_cols} features and {stud_vle_rows:,} rows which represent the student resource interactions we have data for.
''')


Student VLE has 6 features and 10,655,280 rows which represent the student resource interactions we have data for.


---

## Student VLE Contents

* **code_module**: The code module is the module the resource and student are associated with.
* **code_presentation**: The code presentation identifies when the module was held.
* **id_student**: The student ID is the unique identifier for the student.
* **id_site**: The site ID is the unique identifier for the online resource with which the student engaged.
* **date**: The date is the day on which the student engaged with the material, expressed relative to the start date of the module (negative values fall before the start).
* **sum_click**: The sum click represents the number of clicks the student made on that day.

---

## VLE Information

**Data Types**

In [7]:
# return a dataframe listing the data type of each vle column
get_dtypes(vle)

index,Type
id_site,int64
code_module,object
code_presentation,object
activity_type,object
week_from,float64
week_to,float64


* `id_site` is categorical and will need to be changed into a `string`
* `object` datatypes will again be turned into strings

In [8]:
# id_site is an identifier, so cast it to str, then let pandas choose the ideal
# dtype for every other column; astype and convert_dtypes both return new frames,
# so the existing vle frame is never modified in place and the cell can be re-run safely
vle = vle.astype({'id_site': str}).convert_dtypes()
# show new datatypes dataframe
get_dtypes(vle)

index,Type
id_site,string
code_module,string
code_presentation,string
activity_type,string
week_from,Int64
week_to,Int64


**Null Values**

In [9]:
# return a dataframe of null values if any
null_vals(vle)

index,Null Values
id_site,0
code_module,0
code_presentation,0
activity_type,0
week_from,5243
week_to,5243


**Duplicate Values**

In [10]:
# return a dataframe of duplicate rows if any
get_dupes(vle)

There are no Duplicate Values

**Unique Value Counts**

In [11]:
# return a dataframe of counts of unique values per column
count_unique(vle)

index,Count
id_site,6364
code_module,7
code_presentation,4
activity_type,20
week_from,30
week_to,30


**Unique Categorical Values**

In [12]:
# return a dataframe of unique categorical variables' values
unique_vals(vle)

index,Values
code_module,"['AAA', 'BBB', 'CCC', 'DDD', 'EEE', 'FFF', 'GGG']"
code_presentation,"['2013J', '2014J', '2013B', '2014B']"
activity_type,"['resource', 'oucontent', 'url', 'homepage', 'subpage', 'glossary', 'forumng', 'oucollaborate', 'dataplus', 'quiz', 'ouelluminate', 'sharedsubpage', 'questionnaire', 'page', 'externalquiz', 'ouwiki', 'dualpane', 'repeatactivity', 'folder', 'htmlactivity']"


---

## Student VLE Information

**Data Types**

In [5]:
# return a dataframe listing the data type of each student_vle column
get_dtypes(student_vle)

index,Type
code_module,object
code_presentation,object
id_student,int64
id_site,int64
date,int64
sum_click,int64


In [6]:
# id_site and id_student are identifiers rather than quantities, so cast them to str,
# then move the remaining columns to pandas' preferred dtypes; convert_integer=False
# opts out of the conversion to pandas' integer extension types
student_vle = (
    student_vle
    .astype({'id_site': str, 'id_student': str})
    .convert_dtypes(convert_integer=False)
)

**Null Values**

In [7]:
# return a dataframe with the number of null values in each student_vle column
null_vals(student_vle)

index,Null Values
code_module,0
code_presentation,0
id_student,0
id_site,0
date,0
sum_click,0


**Duplicate Values**

In [8]:
# collect student_vle's duplicated rows; kept in a variable so they can also be counted
duplicates = get_dupes(student_vle)
# preview the first five duplicated rows
duplicates.head()

Unnamed: 0,code_module,code_presentation,id_student,id_site,date,sum_click
2,AAA,2013J,28400,546652,-10,1
63,AAA,2013J,45462,546652,-10,1
180,AAA,2013J,77367,546652,-10,4
193,AAA,2013J,94961,546652,-10,2
442,AAA,2013J,248270,546652,-10,4


In [13]:
# report how many duplicated rows were found; the ":," format spec adds the thousands separator
md(f'''
There are {len(duplicates):,} duplicated rows which are merely the same students on the same day, interacting with the same material in the same way, which is to be expected.
''')


There are 787,170 duplicated rows which are merely the same students on the same day, interacting with the same material in the same way, which is to be expected.


**Unique Value Counts**

In [10]:
# return a dataframe with the number of unique values in each student_vle column
count_unique(student_vle)

index,Count
code_module,7
code_presentation,4
id_student,26074
id_site,6268
date,295
sum_click,498


In [15]:
# number of distinct students in each frame
unique_students = student_vle['id_student'].nunique()
si_unique_students = student_info['id_student'].nunique()
# Count the student_info students that never appear in student_vle with an explicit
# set difference. Subtracting the two counts is only correct when every student_vle id
# also exists in student_info; the set difference holds either way.
# ids are compared as strings so the two frames match whatever dtype each one stores them in
vle_student_ids = set(map(str, student_vle['id_student'].dropna().unique()))
si_student_ids = set(map(str, student_info['id_student'].dropna().unique()))
students_without_vle = len(si_student_ids - vle_student_ids)
md(f'''
There are {unique_students:,} in the student_vle out of the {si_unique_students:,} 
students we have in student info. So {students_without_vle:,} students from student info do not have online interaction data.
''')


There are 26,074 in the student_vle out of the 28,785 
students we have in student info. So 2,711 students from student info do not have online interaction data.


**Numerical Values**

In [11]:
# summary statistics for student_vle's numeric columns, rounded to one decimal place
student_vle.describe().round(1)

Unnamed: 0,date,sum_click
count,10655280.0,10655280.0
mean,95.2,3.7
std,76.1,8.8
min,-25.0,1.0
25%,25.0,1.0
50%,86.0,2.0
75%,156.0,3.0
max,269.0,6977.0
