# Engagement

### Analyzing data from 2016/2017

* Introducing the Normalized Gain Index
* Measuring whether students fix their programs or not

Importing libraries:

In [1]:
import os
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
font = { 'family': 'DejaVu Sans', 'weight': 'bold', 'size': 22 }
plt.rc('font', **font)

Data path:

In [3]:
data_path = '../data'

Grades:

In [4]:
df_grades = pd.read_json(os.path.join(data_path, 'grades.json'))

In [5]:
'{:,}'.format(len(df_grades))

'3,217'

### List of students that fixed / did not fix their programs:

In [6]:
from students import data

### Gain Index

In [7]:
def gain_index(e1, e2):
    return float(e2 - e1) / e1

In [8]:
def normalized_gain_index(e1, e2):
    if e1 == 0 and e2 == 0:
        return 0
    elif e1 == 0:
        return 1
    gi = gain_index(e1, e2)
    if gi > 1:
        return 1
    return gi

In [9]:
normalized_gain_index(0, 100)

1

In [10]:
normalized_gain_index(100, 0)

-1.0

In [11]:
normalized_gain_index(0, 50)

1

In [12]:
normalized_gain_index(50, 60)

0.2

## Course: CA117, Academic year: 2016/2017

In [13]:
_course = 'ca117'

In [14]:
_academic_year = (2016, 2017)

In [15]:
def get_grades(course, academic_year):
    return df_grades[(df_grades['module'] == course) & 
                     (df_grades['academic_year_0'] == academic_year[0]) &
                     (df_grades['academic_year_1'] == academic_year[1])].index

In [16]:
grades_ca117 = df_grades.iloc[get_grades(_course, _academic_year)]

In [17]:
student_names_ca117 = grades_ca117.user.unique()

In [18]:
len(student_names_ca117)

140

In [19]:
exam_weeks_ca117 = sorted(grades_ca117.exam_week.unique())

In [20]:
exam_weeks_ca117

[6, 12]

In [21]:
# df_grades.iloc[2536]

In [22]:
def get_improvement(student_names, exam_weeks, grades, debug=False):
    
    marks = {}
    normgi = {}
    diff = {}
    
    for student_name in student_names:
        
        prev = None
        improvement = None
        
        for exam_week in exam_weeks:
            
            grade = 0
            grade_index = grades[(grades['user'] == student_name) &
                                 (grades['exam_week'] == exam_week)].index
            if len(grade_index) > 0:
                grade = df_grades.iloc[grade_index[0]]['grade']
                
            marks.setdefault(exam_week, {})
            marks[exam_week][student_name] = grade
            
            if debug: print('Student: {}, Exam week: {}, Grade: {}'.format(student_name, exam_week, grade))
            
            if prev is not None:
                improvement = normalized_gain_index(prev, grade)
                if debug: print('Student: {}, Improvement: {:.2f}'.format(student_name, improvement))
                normgi[student_name] = improvement
                diff[student_name] = grade - prev
                
            prev = grade
            
    return normgi, diff, marks

In [23]:
normgi_ca117, diff_ca117, marks_ca117  = get_improvement(student_names_ca117, exam_weeks_ca117, grades_ca117)

In [24]:
opt_dict_ca117 = {
    'Opted-IN': data.CA117_2016_2017_STUDENTS_OPTED_IN,
    'Opted-OUT': data.CA117_2016_2017_STUDENTS_OPTED_OUT,
}

In [25]:
fix_dict_ca117 = {
    'Fixed': data.CA117_2016_2017_STUDENTS_FIXED_ANY_PROGRAM,
    'Did-not-fix': data.CA117_2016_2017_STUDENTS_DID_NOT_FIX,
}

In [26]:
THRESHOLD = 40

def get_pass_fail_cohorts(student_names, exam_weeks, grades):

    passing, failing = [], []
    
    first_exam = exam_weeks[0]

    for student_name in student_names:

        grade = 0
        grade_index = grades[(grades['user'] == student_name) &
                             (grades['exam_week'] == first_exam)].index
        if len(grade_index) > 0:
            grade = df_grades.iloc[grade_index[0]]['grade']

        if grade >= THRESHOLD:
            passing.append(student_name)
        else:
            failing.append(student_name)

    return passing, failing

In [27]:
passing_ca117, failing_ca117 = get_pass_fail_cohorts(student_names_ca117, exam_weeks_ca117, grades_ca117)

In [28]:
pass_fail_dict_ca117 = {
    'Pass': passing_ca117,
    'Fail': failing_ca117,
}

In [29]:
def evaluate(student_dict, normgi, diff, marks, exam_weeks):
    
    # Store improvements
    d = {}
    
    for name, students in student_dict.items():
        
        num_students = len(students)
        
        print('Group: {} (# {})'.format(name, num_students))
        
        for exam_week in exam_weeks:
            # Marks
            mrks = [ value for student_name, value in marks[exam_week].items() if student_name in students ]    
            # Avg
            mrks_avg = np.mean(mrks)
            # Std. deviation
            mrks_std = np.std(mrks)
            print('Marks Exam Week {}, Avg: {:.2f} ({:.2f})'.format(exam_week, mrks_avg, mrks_std))
        
        # Array of improvements
        improvements = [ value for student_name, value in normgi.items() if student_name in students ]
        d[name] = improvements
        
        # Avg
        improv_avg = np.mean(improvements)
        # Std. deviation
        improv_std = np.std(improvements)
        
        # Differences
        differences = [ value for student_name, value in diff.items() if student_name in students ]
        
        # Avg
        diff_avg = np.mean(differences)
        # Std. deviation
        diff_std = np.std(differences)
              
        print('* Improvement Avg: {:.2f} ({:.2f}), Diff Avg: {:.2f} ({:.2f})'.format(
            mrks_avg, mrks_std, improv_avg, improv_std, diff_avg, diff_std))
        
    # T-test
    keys = list(student_dict.keys())
    group_one = d[keys[0]]
    group_two = d[keys[1]]
    ttest = ttest_ind(group_one, group_two)
    print(ttest)


In [30]:
evaluate(pass_fail_dict_ca117, normgi_ca117, diff_ca117, marks_ca117, exam_weeks_ca117)

Group: Pass (# 82)
Marks Exam Week 6, Avg: 76.22 (21.70)
Marks Exam Week 12, Avg: 47.85 (26.42)
* Improvement Avg: 47.85 (26.42), Diff Avg: -0.38 (0.30)
Group: Fail (# 58)
Marks Exam Week 6, Avg: 8.62 (11.88)
Marks Exam Week 12, Avg: 12.02 (14.64)
* Improvement Avg: 12.02 (14.64), Diff Avg: 0.18 (0.53)
Ttest_indResult(statistic=-7.90377188715549, pvalue=7.728688226495374e-13)


In [31]:
evaluate(opt_dict_ca117, normgi_ca117, diff_ca117, marks_ca117, exam_weeks_ca117)

Group: Opted-IN (# 122)
Marks Exam Week 6, Avg: 54.51 (36.36)
Marks Exam Week 12, Avg: 37.75 (27.42)
* Improvement Avg: 37.75 (27.42), Diff Avg: -0.15 (0.52)
Group: Opted-OUT (# 11)
Marks Exam Week 6, Avg: 52.27 (34.47)
Marks Exam Week 12, Avg: 19.45 (17.07)
* Improvement Avg: 19.45 (17.07), Diff Avg: -0.33 (0.68)
Ttest_indResult(statistic=1.0651208342711622, pvalue=0.2887804940758073)


In [32]:
evaluate(fix_dict_ca117, normgi_ca117, diff_ca117, marks_ca117, exam_weeks_ca117)

Group: Fixed (# 16)
Marks Exam Week 6, Avg: 32.81 (39.25)
Marks Exam Week 12, Avg: 27.62 (25.45)
* Improvement Avg: 27.62 (25.45), Diff Avg: 0.23 (0.68)
Group: Did-not-fix (# 53)
Marks Exam Week 6, Avg: 45.19 (31.79)
Marks Exam Week 12, Avg: 34.42 (27.46)
* Improvement Avg: 34.42 (27.46), Diff Avg: -0.14 (0.45)
Ttest_indResult(statistic=2.5215273559147287, pvalue=0.01410673108419503)


In [33]:
df_students_ca117 = pd.read_csv('../data/students-ca117-2017.csv')

In [34]:
def compare_characteristics(dic, df_students):

    for option in dic:
        
        # Students
        student_names = dic[option]
        
        # Group
        df_group = df_students[df_students['Username'].isin(student_names)]

        # Age
        age_mean = np.mean(df_group['Age'])
        
        # CAO Points
        points = df_group[df_group['Route'] == 'Leaving Cert.']['CAO Points']
        cao_mean = np.mean(points)
        
        print('{} - # students: {}, Age: {:.2f}, CAO ({}): {:.2f}'.format(
            option, len(df_group), age_mean, len(points), cao_mean))

In [35]:
compare_characteristics(pass_fail_dict_ca117, df_students_ca117)

Pass - # students: 81, Age: 18.88, CAO (62): 438.79
Fail - # students: 56, Age: 18.75, CAO (41): 399.15


In [36]:
compare_characteristics(opt_dict_ca117, df_students_ca117)

Opted-IN - # students: 119, Age: 18.82, CAO (91): 427.25
Opted-OUT - # students: 11, Age: 18.82, CAO (9): 446.67


In [37]:
compare_characteristics(fix_dict_ca117, df_students_ca117)

Fixed - # students: 16, Age: 18.62, CAO (12): 438.75
Did-not-fix - # students: 51, Age: 18.53, CAO (38): 398.16


## Course: CA114, Academic year: 2016/2017

In [38]:
_course = 'ca114'

In [39]:
_academic_year = (2016, 2017)

In [40]:
grades_ca114 = df_grades.iloc[get_grades(_course, _academic_year)]

In [41]:
len(grades_ca114)

136

In [42]:
student_names_ca114 = grades_ca114.user.unique()

In [43]:
len(student_names_ca114)

73

In [44]:
exam_weeks_ca114 = sorted(grades_ca114.exam_week.unique())

In [45]:
exam_weeks_ca114

[6, 12]

In [46]:
normgi_ca114, diff_ca114, marks_ca114 = get_improvement(student_names_ca114, exam_weeks_ca114, grades_ca114)

In [47]:
opt_dict_ca114 = {
    'Opted-IN': data.CA114_2016_2017_STUDENTS_OPTED_IN,
    'Opted-OUT': data.CA114_2016_2017_STUDENTS_OPTED_OUT,
}

In [48]:
fix_dict_ca114 = {
    'Fixed': data.CA114_2016_2017_STUDENTS_FIXED_ANY_PROGRAM,
    'Did-not-fix': data.CA114_2016_2017_STUDENTS_DID_NOT_FIX,
}

In [49]:
passing_ca114, failing_ca114 = get_pass_fail_cohorts(student_names_ca114, exam_weeks_ca114, grades_ca114)

In [50]:
pass_fail_dict_ca114 = {
    'Pass': passing_ca114,
    'Fail': failing_ca114,
}

In [51]:
evaluate(pass_fail_dict_ca114, normgi_ca114, diff_ca114, marks_ca114, exam_weeks_ca114)

Group: Pass (# 57)
Marks Exam Week 6, Avg: 72.81 (18.90)
Marks Exam Week 12, Avg: 55.44 (29.02)
* Improvement Avg: 55.44 (29.02), Diff Avg: -0.20 (0.46)
Group: Fail (# 16)
Marks Exam Week 6, Avg: 17.19 (11.59)
Marks Exam Week 12, Avg: 40.00 (29.15)
* Improvement Avg: 40.00 (29.15), Diff Avg: 0.35 (0.69)
Ttest_indResult(statistic=-3.7003152473786365, pvalue=0.0004216005544580895)


In [52]:
evaluate(opt_dict_ca114, normgi_ca114, diff_ca114, marks_ca114, exam_weeks_ca114)

Group: Opted-IN (# 63)
Marks Exam Week 6, Avg: 62.10 (28.31)
Marks Exam Week 12, Avg: 53.23 (29.17)
* Improvement Avg: 53.23 (29.17), Diff Avg: -0.11 (0.54)
Group: Opted-OUT (# 8)
Marks Exam Week 6, Avg: 57.14 (25.75)
Marks Exam Week 12, Avg: 42.86 (32.83)
* Improvement Avg: 42.86 (32.83), Diff Avg: -0.07 (0.74)
Ttest_indResult(statistic=-0.17503206243472802, pvalue=0.861582316613892)


In [53]:
evaluate(fix_dict_ca114, normgi_ca114, diff_ca114, marks_ca114, exam_weeks_ca114)

Group: Fixed (# 18)
Marks Exam Week 6, Avg: 62.50 (29.17)
Marks Exam Week 12, Avg: 56.67 (28.48)
* Improvement Avg: 56.67 (28.48), Diff Avg: -0.03 (0.51)
Group: Did-not-fix (# 35)
Marks Exam Week 6, Avg: 61.03 (22.84)
Marks Exam Week 12, Avg: 50.59 (29.99)
* Improvement Avg: 50.59 (29.99), Diff Avg: -0.15 (0.54)
Ttest_indResult(statistic=0.7505532156603627, pvalue=0.45643930507821384)


In [54]:
df_students_ca114 = pd.read_csv('../data/students-ca114-2017.csv')

In [55]:
compare_characteristics(pass_fail_dict_ca114, df_students_ca114)

Pass - # students: 56, Age: 18.34, CAO (47): 410.32
Fail - # students: 16, Age: 18.25, CAO (14): 396.79


In [56]:
compare_characteristics(opt_dict_ca114, df_students_ca114)

Opted-IN - # students: 62, Age: 18.31, CAO (51): 401.37
Opted-OUT - # students: 7, Age: 18.14, CAO (7): 457.14


In [57]:
compare_characteristics(fix_dict_ca114, df_students_ca114)

Fixed - # students: 18, Age: 18.44, CAO (13): 415.00
Did-not-fix - # students: 34, Age: 18.15, CAO (29): 405.86


## Course: CA278, Academic year: 2016/2017

In [58]:
_course = 'ca278'

In [59]:
_academic_year = (2016, 2017)

In [60]:
grades_ca278 = df_grades.iloc[get_grades(_course, _academic_year)]

In [61]:
len(grades_ca278)

114

In [62]:
student_names_ca278 = grades_ca278.user.unique()

In [63]:
len(student_names_ca278)

58

In [64]:
exam_weeks_ca278 = sorted(grades_ca278.exam_week.unique())

In [65]:
exam_weeks_ca278

[6, 12]

In [66]:
normgi_ca278, diff_ca278, marks_ca278 = get_improvement(student_names_ca278, exam_weeks_ca278, grades_ca278)

In [67]:
opt_dict_ca278 = {
    'Opted-IN': data.CA278_2016_2017_STUDENTS_OPTED_IN,
    'Opted-OUT': data.CA278_2016_2017_STUDENTS_OPTED_OUT,
}

In [68]:
fix_dict_ca278 = {
    'Fixed': data.CA278_2016_2017_STUDENTS_FIXED_ANY_PROGRAM,
    'Did-not-fix': data.CA278_2016_2017_STUDENTS_DID_NOT_FIX,
}

In [69]:
passing_ca278, failing_ca278 = get_pass_fail_cohorts(student_names_ca278, exam_weeks_ca278, grades_ca278)

In [70]:
pass_fail_dict_ca278 = {
    'Pass': passing_ca278,
    'Fail': failing_ca278,
}

In [71]:
evaluate(pass_fail_dict_ca278, normgi_ca278, diff_ca278, marks_ca278, exam_weeks_ca278)

Group: Pass (# 53)
Marks Exam Week 6, Avg: 62.57 (17.65)
Marks Exam Week 12, Avg: 73.91 (17.44)
* Improvement Avg: 73.91 (17.44), Diff Avg: 0.23 (0.36)
Group: Fail (# 5)
Marks Exam Week 6, Avg: 15.80 (12.94)
Marks Exam Week 12, Avg: 64.80 (8.13)
* Improvement Avg: 64.80 (8.13), Diff Avg: 0.96 (0.09)
Ttest_indResult(statistic=-4.365674244611575, pvalue=5.517289121006426e-05)


In [72]:
evaluate(opt_dict_ca278, normgi_ca278, diff_ca278, marks_ca278, exam_weeks_ca278)

Group: Opted-IN (# 42)
Marks Exam Week 6, Avg: 60.88 (23.24)
Marks Exam Week 12, Avg: 76.45 (14.91)
* Improvement Avg: 76.45 (14.91), Diff Avg: 0.31 (0.41)
Group: Opted-OUT (# 9)
Marks Exam Week 6, Avg: 58.12 (19.81)
Marks Exam Week 12, Avg: 71.38 (11.79)
* Improvement Avg: 71.38 (11.79), Diff Avg: 0.32 (0.32)
Ttest_indResult(statistic=-0.1107725427370544, pvalue=0.9122586020428451)


In [73]:
evaluate(fix_dict_ca278, normgi_ca278, diff_ca278, marks_ca278, exam_weeks_ca278)

Group: Fixed (# 7)
Marks Exam Week 6, Avg: 53.57 (27.49)
Marks Exam Week 12, Avg: 73.00 (14.90)
* Improvement Avg: 73.00 (14.90), Diff Avg: 0.41 (0.50)
Group: Did-not-fix (# 28)
Marks Exam Week 6, Avg: 58.25 (20.88)
Marks Exam Week 12, Avg: 73.93 (14.90)
* Improvement Avg: 73.93 (14.90), Diff Avg: 0.30 (0.39)
Ttest_indResult(statistic=0.6341914555286028, pvalue=0.5303279257047521)


In [74]:
df_students_ca278 = pd.read_csv('../data/students-ca278-2017.csv')

In [75]:
compare_characteristics(pass_fail_dict_ca278, df_students_ca278)

Pass - # students: 52, Age: 18.13, CAO (42): 411.79
Fail - # students: 5, Age: 18.00, CAO (5): 404.00


In [76]:
compare_characteristics(opt_dict_ca278, df_students_ca278)

Opted-IN - # students: 41, Age: 18.24, CAO (36): 414.17
Opted-OUT - # students: 9, Age: 17.78, CAO (6): 393.33


In [77]:
compare_characteristics(fix_dict_ca278, df_students_ca278)

Fixed - # students: 7, Age: 18.29, CAO (4): 450.00
Did-not-fix - # students: 27, Age: 18.30, CAO (25): 403.20


## Course: CA116, Academic year: 2016/2017

In [78]:
_course = 'ca116'

In [79]:
_academic_year = (2016, 2017)

In [80]:
grades_ca116 = df_grades.iloc[get_grades(_course, _academic_year)]

In [81]:
len(grades_ca116)

327

In [82]:
student_names_ca116 = grades_ca116.user.unique()

In [83]:
len(student_names_ca116)

126

In [84]:
exam_weeks_ca116 = sorted(grades_ca116.exam_week.unique())

In [85]:
exam_weeks_ca116

[4, 8, 12]

In [86]:
normgi_ca116, diff_ca116, marks_ca116 = get_improvement(student_names_ca116, exam_weeks_ca116, grades_ca116)

In [87]:
passing_ca116, failing_ca116 = get_pass_fail_cohorts(student_names_ca116, exam_weeks_ca116, grades_ca116)

In [88]:
pass_fail_dict_ca116 = {
    'Pass': passing_ca116,
    'Fail': failing_ca116,
}

In [89]:
evaluate(pass_fail_dict_ca116, normgi_ca116, diff_ca116, marks_ca116, exam_weeks_ca116)

Group: Pass (# 102)
Marks Exam Week 4, Avg: 86.27 (20.90)
Marks Exam Week 8, Avg: 47.11 (36.64)
Marks Exam Week 12, Avg: 47.10 (35.57)
* Improvement Avg: 47.10 (35.57), Diff Avg: 0.11 (0.57)
Group: Fail (# 24)
Marks Exam Week 4, Avg: 9.38 (12.10)
Marks Exam Week 8, Avg: 22.08 (16.89)
Marks Exam Week 12, Avg: 23.29 (20.62)
* Improvement Avg: 23.29 (20.62), Diff Avg: 0.03 (0.76)
Ttest_indResult(statistic=0.5625437082255612, pvalue=0.5747612370979556)


In [90]:
df_students_ca116 = pd.read_csv('../data/students-ca116-2017.csv')

In [91]:
compare_characteristics(pass_fail_dict_ca116, df_students_ca116)

Pass - # students: 99, Age: 18.89, CAO (75): 427.73
Fail - # students: 22, Age: 18.64, CAO (17): 402.35
