# Engagement

### Analyzing data from 2017/2018

* Measuring whether students clicked on the resources suggested to them

Importing libraries:

In [1]:
import os
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
font = { 'family': 'DejaVu Sans', 'weight': 'bold', 'size': 22 }
plt.rc('font', **font)

Data path:

In [3]:
data_path = '../data'

Grades:

In [4]:
df_grades = pd.read_json(os.path.join(data_path, 'grades.json'))

In [5]:
len(df_grades)

3217

Web events:

In [6]:
df_web = pd.read_csv('../data/web_data.csv')

In [7]:
len(df_web)

25500773

In [8]:
df_web['date'] = pd.to_datetime(df_web['date'], format='%y%m%d-%H:%M.%S')

In [9]:
len(df_web)

25500773

### Gain Index

In [10]:
def gain_index(e1, e2):
    return float(e2 - e1) / e1

In [11]:
def normalized_gain_index(e1, e2):
    if e1 == 0 and e2 == 0:
        return 0
    elif e1 == 0:
        return 1
    gi = gain_index(e1, e2)
    if gi > 1:
        return 1
    return gi

## Course: CA117, Academic year: 2017/2018

In [12]:
_course = 'ca117'

In [13]:
_academic_year = (2017, 2018)

In [14]:
def get_grades(course, academic_year):
    return df_grades[(df_grades['module'] == course) & 
                     (df_grades['academic_year_0'] == academic_year[0]) &
                     (df_grades['academic_year_1'] == academic_year[1])].index

In [15]:
grades_ca117 = df_grades.iloc[get_grades(_course, _academic_year)]

In [16]:
len(grades_ca117)

296

In [17]:
student_names_ca117 = grades_ca117.user.unique()

In [18]:
len(student_names_ca117)

148

In [19]:
exam_weeks_ca117 = sorted(grades_ca117.exam_week.unique())

In [20]:
exam_weeks_ca117

[7, 12]

In [21]:
start = datetime(year=2017, month=9, day=1)
end = datetime(year=2018, month=5, day=1)

def get_web_events(course):
    return df_web[(df_web['module'] == course) & 
                  (df_web['date'] > start) &
                  (df_web['date'] < end) & 
                  (df_web['resource'].str.contains('predictcs=1'))].index

In [22]:
web_ca117 = df_web.iloc[get_web_events(_course)]

In [23]:
web_ca117.head()

Unnamed: 0,date,module,ip,resource,user
16294432,2018-03-20 18:57:39,ca117,129.219.21.2,/html/week06/lab02/02_lab.html?predictcs=1,dazcona
16294443,2018-03-20 18:57:40,ca117,129.219.21.2,/einstein/get-credentials?predictcs=1,dazcona
16294444,2018-03-20 18:57:40,ca117,129.219.21.2,/einstein/get-task-status?predictcs=1,dazcona
16294457,2018-03-20 18:57:49,ca117,129.219.21.2,/html/week02/lecture03/03_references.html?pred...,dazcona
16294473,2018-03-20 18:57:50,ca117,129.219.21.2,/einstein/get-credentials?predictcs=1,dazcona


In [24]:
len(web_ca117)

82

In [25]:
# web_ca117['user'].unique()

In [26]:
len(web_ca117['user'].unique())

20

In [27]:
clicked_ca117 = [ username for username in web_ca117['user'].unique() if username in student_names_ca117 ]

In [28]:
not_clicked_ca117 = [ username for username in student_names_ca117 if username not in clicked_ca117 ]

In [29]:
len(clicked_ca117), len(not_clicked_ca117)

(19, 129)

In [30]:
click_dict_ca117 = {
    'Clicked': clicked_ca117,
    'Did-not-click': not_clicked_ca117,
}

In [31]:
THRESHOLD = 40

def get_pass_fail_cohorts(student_names, exam_weeks, grades):

    passing, failing = [], []
    
    first_exam = exam_weeks[0]

    for student_name in student_names:

        grade = 0
        grade_index = grades[(grades['user'] == student_name) &
                             (grades['exam_week'] == first_exam)].index
        if len(grade_index) > 0:
            grade = df_grades.iloc[grade_index[0]]['grade']

        if grade >= THRESHOLD:
            passing.append(student_name)
        else:
            failing.append(student_name)

    return passing, failing

In [32]:
passing_ca117, failing_ca117 = get_pass_fail_cohorts(student_names_ca117, exam_weeks_ca117, grades_ca117)

In [33]:
pass_fail_dict_ca117 = {
    'Pass': passing_ca117,
    'Fail': failing_ca117,
}

In [34]:
def get_improvement(student_names, exam_weeks, grades, debug=False):
    
    marks = {}
    normgi = {}
    diff = {}
    
    for student_name in student_names:
        
        prev = None
        improvement = None
        
        for exam_week in exam_weeks:
            
            grade = 0
            grade_index = grades[(grades['user'] == student_name) &
                                 (grades['exam_week'] == exam_week)].index
            if len(grade_index) > 0:
                grade = df_grades.iloc[grade_index[0]]['grade']
                
            marks.setdefault(exam_week, {})
            marks[exam_week][student_name] = grade
            
            if debug: print('Student: {}, Exam week: {}, Grade: {}'.format(student_name, exam_week, grade))
            
            if prev is not None:
                improvement = normalized_gain_index(prev, grade)
                if debug: print('Student: {}, Improvement: {:.2f}'.format(student_name, improvement))
                normgi[student_name] = improvement
                diff[student_name] = grade - prev
                
            prev = grade
            
    return normgi, diff, marks

In [35]:
normgi_ca117, diff_ca117, marks_ca117 = get_improvement(student_names_ca117, exam_weeks_ca117, grades_ca117)

In [36]:
def evaluate(student_dict, normgi, diff, marks, exam_weeks):
    
    # Store improvements
    d = {}
    
    for name, students in student_dict.items():
        
        num_students = len(students)
        
        print('Group: {} (# {})'.format(name, num_students))
        
        for exam_week in exam_weeks:
            # Marks
            mrks = [ value for student_name, value in marks[exam_week].items() if student_name in students ]    
            # Avg
            mrks_avg = np.mean(mrks)
            # Std. deviation
            mrks_std = np.std(mrks)
            print('Marks Exam Week {}, Avg: {:.2f} ({:.2f})'.format(exam_week, mrks_avg, mrks_std))
        
        # Array of improvements
        improvements = [ value for student_name, value in normgi.items() if student_name in students ]
        d[name] = improvements
        
        # Avg
        improv_avg = np.mean(improvements)
        # Std. deviation
        improv_std = np.std(improvements)
        
        # Differences
        differences = [ value for student_name, value in diff.items() if student_name in students ]
        
        # Avg
        diff_avg = np.mean(differences)
        # Std. deviation
        diff_std = np.std(differences)
              
        print('* Improvement Avg: {:.2f} ({:.2f}), Diff Avg: {:.2f} ({:.2f})'.format(
            mrks_avg, mrks_std, improv_avg, improv_std, diff_avg, diff_std))
        
    # T-test
    keys = list(student_dict.keys())
    group_one = d[keys[0]]
    group_two = d[keys[1]]
    ttest = ttest_ind(group_one, group_two)
    print(ttest)


In [37]:
evaluate(pass_fail_dict_ca117, normgi_ca117, diff_ca117, marks_ca117, exam_weeks_ca117)

Group: Pass (# 90)
Marks Exam Week 7, Avg: 75.83 (20.90)
Marks Exam Week 12, Avg: 55.11 (28.53)
* Improvement Avg: 55.11 (28.53), Diff Avg: -0.28 (0.33)
Group: Fail (# 58)
Marks Exam Week 7, Avg: 12.93 (12.49)
Marks Exam Week 12, Avg: 25.00 (26.08)
* Improvement Avg: 25.00 (26.08), Diff Avg: 0.31 (0.67)
Ttest_indResult(statistic=-7.084959708965068, pvalue=5.471969072357101e-11)


In [38]:
evaluate(click_dict_ca117, normgi_ca117, diff_ca117, marks_ca117, exam_weeks_ca117)

Group: Clicked (# 19)
Marks Exam Week 7, Avg: 51.32 (39.30)
Marks Exam Week 12, Avg: 53.16 (24.72)
* Improvement Avg: 53.16 (24.72), Diff Avg: 0.22 (0.58)
Group: Did-not-click (# 129)
Marks Exam Week 7, Avg: 51.16 (35.06)
Marks Exam Week 12, Avg: 41.86 (31.86)
* Improvement Avg: 41.86 (31.86), Diff Avg: -0.09 (0.56)
Ttest_indResult(statistic=2.1795780020810502, pvalue=0.030891715742672465)


## Course: CA114, Academic year: 2017/2018

In [39]:
_course = 'ca114'

In [40]:
_academic_year = (2017, 2018)

In [41]:
grades_ca114 = df_grades.iloc[get_grades(_course, _academic_year)]

In [42]:
len(grades_ca114)

132

In [43]:
student_names_ca114 = grades_ca114.user.unique()

In [44]:
len(student_names_ca114)

69

In [45]:
exam_weeks_ca114 = sorted(grades_ca114.exam_week.unique())

In [46]:
exam_weeks_ca114

[6, 12]

In [47]:
web_ca114 = df_web.iloc[get_web_events(_course)]

In [48]:
len(web_ca114)

47

In [49]:
len(web_ca114['user'].unique())

6

In [50]:
clicked_ca114 = [ username for username in web_ca114['user'].unique() if username in student_names_ca114 ]

In [51]:
not_clicked_ca114 = [ username for username in student_names_ca114 if username not in clicked_ca114 ]

In [52]:
len(clicked_ca114), len(not_clicked_ca114)

(4, 65)

In [53]:
click_dict_ca114 = {
    'Clicked': clicked_ca114,
    'Did-not-click': not_clicked_ca114,
}

In [54]:
passing_ca114, failing_ca114 = get_pass_fail_cohorts(student_names_ca114, exam_weeks_ca114, grades_ca114)

In [55]:
pass_fail_dict_ca114 = {
    'Pass': passing_ca114,
    'Fail': failing_ca114,
}

In [56]:
normgi_ca114, diff_ca114, marks_ca114 = get_improvement(student_names_ca114, exam_weeks_ca114, grades_ca114)

In [57]:
evaluate(pass_fail_dict_ca114, normgi_ca114, diff_ca114, marks_ca114, exam_weeks_ca114)

Group: Pass (# 53)
Marks Exam Week 6, Avg: 83.02 (21.60)
Marks Exam Week 12, Avg: 79.62 (27.88)
* Improvement Avg: 79.62 (27.88), Diff Avg: 0.04 (0.47)
Group: Fail (# 16)
Marks Exam Week 6, Avg: 10.94 (12.40)
Marks Exam Week 12, Avg: 40.00 (36.74)
* Improvement Avg: 40.00 (36.74), Diff Avg: 0.42 (0.71)
Ttest_indResult(statistic=-2.505199313474727, pvalue=0.014675870144223974)


In [58]:
evaluate(click_dict_ca114, normgi_ca114, diff_ca114, marks_ca114, exam_weeks_ca114)

Group: Clicked (# 4)
Marks Exam Week 6, Avg: 50.00 (30.62)
Marks Exam Week 12, Avg: 90.00 (10.00)
* Improvement Avg: 90.00 (10.00), Diff Avg: 0.70 (0.52)
Group: Did-not-click (# 65)
Marks Exam Week 6, Avg: 67.31 (36.41)
Marks Exam Week 12, Avg: 69.23 (35.10)
* Improvement Avg: 69.23 (35.10), Diff Avg: 0.09 (0.55)
Ttest_indResult(statistic=2.1450540512839646, pvalue=0.03557755548452577)


## Course: CA278, Academic year: 2017/2018

In [59]:
_course = 'ca278'

In [60]:
_academic_year = (2017, 2018)

In [61]:
grades_ca278 = df_grades.iloc[get_grades(_course, _academic_year)]

In [62]:
len(grades_ca278)

160

In [63]:
student_names_ca278 = grades_ca278.user.unique()

In [64]:
len(student_names_ca278)

80

In [65]:
exam_weeks_ca278 = sorted(grades_ca278.exam_week.unique())

In [66]:
exam_weeks_ca278

[6, 12]

In [67]:
web_ca278 = df_web.iloc[get_web_events(_course)]

In [68]:
len(web_ca278)

24

In [69]:
len(web_ca278['user'].unique())

5

In [70]:
clicked_ca278 = [ username for username in web_ca278['user'].unique() if username in student_names_ca278 ]

In [71]:
not_clicked_ca278 = [ username for username in student_names_ca278 if username not in clicked_ca278 ]

In [72]:
len(clicked_ca278), len(not_clicked_ca278)

(5, 75)

In [73]:
click_dict_ca278 = {
    'Clicked': clicked_ca278,
    'Did-not-click': not_clicked_ca278,
}

In [74]:
passing_ca278, failing_ca278 = get_pass_fail_cohorts(student_names_ca278, exam_weeks_ca278, grades_ca278)

In [75]:
pass_fail_dict_ca278 = {
    'Pass': passing_ca278,
    'Fail': failing_ca278,
}

In [76]:
normgi_ca278, diff_ca278, marks_ca278 = get_improvement(student_names_ca278, exam_weeks_ca278, grades_ca278)

In [77]:
evaluate(pass_fail_dict_ca278, normgi_ca278, diff_ca278, marks_ca278, exam_weeks_ca278)

Group: Pass (# 70)
Marks Exam Week 6, Avg: 69.06 (16.34)
Marks Exam Week 12, Avg: 65.99 (17.24)
* Improvement Avg: 65.99 (17.24), Diff Avg: -0.01 (0.27)
Group: Fail (# 10)
Marks Exam Week 6, Avg: 31.90 (10.71)
Marks Exam Week 12, Avg: 50.00 (22.03)
* Improvement Avg: 50.00 (22.03), Diff Avg: 0.47 (0.37)
Ttest_indResult(statistic=-5.0117275577021765, pvalue=3.2916885808771503e-06)


In [78]:
evaluate(click_dict_ca278, normgi_ca278, diff_ca278, marks_ca278, exam_weeks_ca278)

Group: Clicked (# 5)
Marks Exam Week 6, Avg: 66.40 (21.14)
Marks Exam Week 12, Avg: 60.00 (17.99)
* Improvement Avg: 60.00 (17.99), Diff Avg: -0.08 (0.14)
Group: Did-not-click (# 75)
Marks Exam Week 6, Avg: 64.28 (19.89)
Marks Exam Week 12, Avg: 64.25 (18.68)
* Improvement Avg: 64.25 (18.68), Diff Avg: 0.05 (0.33)
Ttest_indResult(statistic=-0.8522952755177757, pvalue=0.39665963396981385)
