# Engagement

### Analyzing data from 2015/2016

* This will be our baseline
* This dataset was only used to train our models
* No notifications were sent to students that year

Importing libraries:

In [1]:
import os
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
font = { 'family': 'DejaVu Sans', 'weight': 'bold', 'size': 22 }
plt.rc('font', **font)

Data path:

In [3]:
data_path = '../data'

Grades:

In [4]:
df_grades = pd.read_json(os.path.join(data_path, 'grades.json'))

In [5]:
'{:,}'.format(len(df_grades))

'3,137'

In [6]:
### Gain Index

In [7]:
def gain_index(e1, e2):
    return float(e2 - e1) / e1

In [8]:
def normalized_gain_index(e1, e2):
    if e1 == 0 and e2 == 0:
        return 0
    elif e1 == 0:
        return 1
    gi = gain_index(e1, e2)
    if gi > 1:
        return 1
    return gi

## Course: CA117, Academic year: 2015/2016

In [9]:
_course = 'ca117'

In [10]:
_academic_year = (2015, 2016)

In [11]:
def get_grades(course, academic_year):
    return df_grades[(df_grades['module'] == course) & 
                     (df_grades['academic_year_0'] == academic_year[0]) &
                     (df_grades['academic_year_1'] == academic_year[1])].index

In [12]:
grades_ca117 = df_grades.iloc[get_grades(_course, _academic_year)]

In [13]:
len(grades_ca117)

298

In [14]:
student_names_ca117 = grades_ca117.user.unique()

In [15]:
len(student_names_ca117)

149

In [16]:
exam_weeks_ca117 = sorted(grades_ca117.exam_week.unique())

In [17]:
exam_weeks_ca117

[6, 12]

In [18]:
def get_improvement(student_names, exam_weeks, grades, debug=False):
    
    marks = {}
    normgi = {}
    diff = {}
    
    for student_name in student_names:
        
        prev = None
        improvement = None
        
        for exam_week in exam_weeks:
            
            grade = 0
            grade_index = grades[(grades['user'] == student_name) &
                                 (grades['exam_week'] == exam_week)].index
            if len(grade_index) > 0:
                grade = df_grades.iloc[grade_index[0]]['grade']
                
            marks.setdefault(exam_week, {})
            marks[exam_week][student_name] = grade
            
            if debug: print('Student: {}, Exam week: {}, Grade: {}'.format(student_name, exam_week, grade))
            
            if prev is not None:
                improvement = normalized_gain_index(prev, grade)
                if debug: print('Student: {}, Improvement: {:.2f}'.format(student_name, improvement))
                normgi[student_name] = improvement
                diff[student_name] = grade - prev
                
            prev = grade
            
    return normgi, diff, marks

In [19]:
normgi_ca117, diff_ca117, marks_ca117  = get_improvement(student_names_ca117, exam_weeks_ca117, grades_ca117)

In [20]:
THRESHOLD = 40

def get_pass_fail_cohorts(student_names, exam_weeks, grades):

    passing, failing = [], []
    
    first_exam = exam_weeks[0]

    for student_name in student_names:

        grade = 0
        grade_index = grades[(grades['user'] == student_name) &
                             (grades['exam_week'] == first_exam)].index
        if len(grade_index) > 0:
            grade = df_grades.iloc[grade_index[0]]['grade']

        if grade >= THRESHOLD:
            passing.append(student_name)
        else:
            failing.append(student_name)

    return passing, failing

In [21]:
passing_ca117, failing_ca117 = get_pass_fail_cohorts(student_names_ca117, exam_weeks_ca117, grades_ca117)

In [22]:
pass_fail_dict_ca117 = {
    'Pass': passing_ca117,
    'Fail': failing_ca117,
}

In [23]:
def evaluate(student_dict, normgi, diff, marks, exam_weeks):
    
    # Store improvements
    d = {}
    
    for name, students in student_dict.items():
        
        num_students = len(students)
        
        print('Group: {} (# {})'.format(name, num_students))
        
        for exam_week in exam_weeks:
            # Marks
            mrks = [ value for student_name, value in marks[exam_week].items() if student_name in students ]    
            # Avg
            mrks_avg = np.mean(mrks)
            # Std. deviation
            mrks_std = np.std(mrks)
            print('Marks Exam Week {}, Avg: {:.2f} ({:.2f})'.format(exam_week, mrks_avg, mrks_std))
        
        # Array of improvements
        improvements = [ value for student_name, value in normgi.items() if student_name in students ]
        d[name] = improvements
        
        # Avg
        improv_avg = np.mean(improvements)
        # Std. deviation
        improv_std = np.std(improvements)
        
        # Differences
        differences = [ value for student_name, value in diff.items() if student_name in students ]
        
        # Avg
        diff_avg = np.mean(differences)
        # Std. deviation
        diff_std = np.std(differences)
              
        print('* Improvement Avg: {:.2f} ({:.2f}), Diff Avg: {:.2f} ({:.2f})'.format(
            mrks_avg, mrks_std, improv_avg, improv_std, diff_avg, diff_std))
        
    # T-test
    keys = list(student_dict.keys())
    group_one = d[keys[0]]
    group_two = d[keys[1]]
    ttest = ttest_ind(group_one, group_two)
    print(ttest)


In [24]:
evaluate(pass_fail_dict_ca117, normgi_ca117, diff_ca117, marks_ca117, exam_weeks_ca117)

Group: Pass (# 66)
Marks Exam Week 6, Avg: 75.23 (20.08)
Marks Exam Week 12, Avg: 55.06 (29.94)
* Improvement Avg: 55.06 (29.94), Diff Avg: -0.28 (0.36)
Group: Fail (# 83)
Marks Exam Week 6, Avg: 14.70 (13.65)
Marks Exam Week 12, Avg: 24.40 (24.74)
* Improvement Avg: 24.40 (24.74), Diff Avg: 0.26 (0.71)
Ttest_indResult(statistic=-5.6492908116991485, pvalue=8.085760644466663e-08)


In [25]:
df_students_ca117 = pd.read_csv('../data/students-ca117-2016.csv')

In [26]:
def compare_characteristics(dic, df_students):

    for option in dic:
        
        # Students
        student_names = dic[option]
        
        # Group
        df_group = df_students[df_students['Username'].isin(student_names)]

        # Age
        age_mean = np.mean(df_group['Age'])
        
        # CAO Points
        points = df_group[df_group['Route'] == 'Leaving Cert.']['CAO Points']
        cao_mean = np.mean(points)
        
        print('{} - # students: {}, Age: {:.2f}, CAO ({}): {:.2f}'.format(
            option, len(df_group), age_mean, len(points), cao_mean))

In [27]:
compare_characteristics(pass_fail_dict_ca117, df_students_ca117)

Pass - # students: 66, Age: 18.77, CAO (49): 445.31
Fail - # students: 83, Age: 19.07, CAO (61): 430.08


## Course: CA114, Academic year: 2015/2016

In [28]:
_course = 'ca114'

In [29]:
_academic_year = (2015, 2016)

In [30]:
grades_ca114 = df_grades.iloc[get_grades(_course, _academic_year)]

In [31]:
len(grades_ca114)

143

In [32]:
student_names_ca114 = grades_ca114.user.unique()

In [33]:
len(student_names_ca114)

75

In [34]:
exam_weeks_ca114 = sorted(grades_ca114.exam_week.unique())

In [35]:
exam_weeks_ca114

[6, 12]

In [36]:
normgi_ca114, diff_ca114, marks_ca114 = get_improvement(student_names_ca114, exam_weeks_ca114, grades_ca114)

In [37]:
passing_ca114, failing_ca114 = get_pass_fail_cohorts(student_names_ca114, exam_weeks_ca114, grades_ca114)

In [38]:
pass_fail_dict_ca114 = {
    'Pass': passing_ca114,
    'Fail': failing_ca114,
}

In [39]:
evaluate(pass_fail_dict_ca114, normgi_ca114, diff_ca114, marks_ca114, exam_weeks_ca114)

Group: Pass (# 24)
Marks Exam Week 6, Avg: 64.17 (22.35)
Marks Exam Week 12, Avg: 64.17 (34.15)
* Improvement Avg: 64.17 (34.15), Diff Avg: 0.05 (0.59)
Group: Fail (# 51)
Marks Exam Week 6, Avg: 5.88 (9.11)
Marks Exam Week 12, Avg: 41.96 (31.75)
* Improvement Avg: 41.96 (31.75), Diff Avg: 0.65 (0.65)
Ttest_indResult(statistic=-3.744705693615501, pvalue=0.00035779664895052414)


In [40]:
df_students_ca114 = pd.read_csv('../data/students-ca114-2016.csv')

In [41]:
compare_characteristics(pass_fail_dict_ca114, df_students_ca114)

Pass - # students: 22, Age: 18.45, CAO (18): 387.50
Fail - # students: 49, Age: 18.31, CAO (42): 388.93


## Course: CA278, Academic year: 2015/2016

In [42]:
_course = 'ca278'

In [43]:
_academic_year = (2015, 2016)

In [44]:
grades_ca278 = df_grades.iloc[get_grades(_course, _academic_year)]

In [45]:
len(grades_ca278)

0

The course CA278 'Programming Fundamentals III' was taught for the first time in 2016/2017