# Engagement

### Analyzing data from 2018/2019

* Measuring whether students talked to the chatbot or not

Importing libraries:

In [1]:
import os
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
font = { 'family': 'DejaVu Sans', 'weight': 'bold', 'size': 22 }
plt.rc('font', **font)

Data path:

In [3]:
data_path = '../data'

Grades:

In [4]:
df_grades = pd.read_json(os.path.join(data_path, 'grades.json'))

In [5]:
len(df_grades)

3217

### List of students that talked to the chatbot

In [6]:
from students import data

### Gain Index

In [7]:
def gain_index(e1, e2):
    return float(e2 - e1) / e1

In [8]:
def normalized_gain_index(e1, e2):
    if e1 == 0 and e2 == 0:
        return 0
    elif e1 == 0:
        return 1
    gi = gain_index(e1, e2)
    if gi > 1:
        return 1
    return gi

## Course: CA116, Academic year: 2018/2019

In [9]:
_course = 'ca116'

In [10]:
_academic_year = (2018, 2019)

In [11]:
def get_grades(course, academic_year):
    return df_grades[(df_grades['module'] == course) & 
                     (df_grades['academic_year_0'] == academic_year[0]) &
                     (df_grades['academic_year_1'] == academic_year[1])].index

In [12]:
grades_ca116 = df_grades.iloc[get_grades(_course, _academic_year)]

In [13]:
len(grades_ca116)

363

In [14]:
student_names_ca116 = grades_ca116.user.unique()

In [15]:
len(student_names_ca116)

132

In [16]:
exam_weeks_ca116 = sorted(grades_ca116.exam_week.unique())

In [17]:
exam_weeks_ca116

[4, 8, 12]

In [18]:
talked_to_bot_ca116 = data.CA116_2018_2019_STUDENTS_VIRTUAL_ASSISTANT

In [19]:
len(talked_to_bot_ca116)

52

In [20]:
not_talked_to_bot_ca116 = [student for student in student_names_ca116 if student not in talked_to_bot_ca116]

In [21]:
len(not_talked_to_bot_ca116)

80

In [22]:
def evaluate(student_dict, normgi, diff, marks, exam_weeks):
    
    # Store improvements
    d = {}
    
    for name, students in student_dict.items():
        
        num_students = len(students)
        
        print('Group: {} (# {})'.format(name, num_students))
        
        for exam_week in exam_weeks:
            # Marks
            mrks = [ value for student_name, value in marks[exam_week].items() if student_name in students ]    
            # Avg
            mrks_avg = np.mean(mrks)
            # Std. deviation
            mrks_std = np.std(mrks)
            print('Marks Exam Week {}, Avg: {:.2f} ({:.2f})'.format(exam_week, mrks_avg, mrks_std))
        
        # Array of improvements
        improvements = [ value for student_name, value in normgi.items() if student_name in students ]
        d[name] = improvements
        
        # Avg
        improv_avg = np.mean(improvements)
        # Std. deviation
        improv_std = np.std(improvements)
        
        # Differences
        differences = [ value for student_name, value in diff.items() if student_name in students ]
        
        # Avg
        diff_avg = np.mean(differences)
        # Std. deviation
        diff_std = np.std(differences)
              
        print('* Improvement Avg: {:.2f} ({:.2f}), Diff Avg: {:.2f} ({:.2f})'.format(
            mrks_avg, mrks_std, improv_avg, improv_std, diff_avg, diff_std))
        
    # T-test
    keys = list(student_dict.keys())
    group_one = d[keys[0]]
    group_two = d[keys[1]]
    ttest = ttest_ind(group_one, group_two)
    print(ttest)


In [23]:
bot_dict_ca116 = {
    'Talked': talked_to_bot_ca116,
    'Not-talked': not_talked_to_bot_ca116,
}

In [24]:
def get_improvement(student_names, exam_weeks, grades, debug=False):
    
    marks = {}
    normgi = {}
    diff = {}
    
    for student_name in student_names:
        
        prev = None
        improvement = None
        
        for exam_week in exam_weeks:
            
            grade = 0
            grade_index = grades[(grades['user'] == student_name) &
                                 (grades['exam_week'] == exam_week)].index
            if len(grade_index) > 0:
                grade = df_grades.iloc[grade_index[0]]['grade']
                
            marks.setdefault(exam_week, {})
            marks[exam_week][student_name] = grade
            
            if debug: print('Student: {}, Exam week: {}, Grade: {}'.format(student_name, exam_week, grade))
            
            if prev is not None:
                improvement = normalized_gain_index(prev, grade)
                if debug: print('Student: {}, Improvement: {:.2f}'.format(student_name, improvement))
                normgi[student_name] = improvement
                diff[student_name] = grade - prev
                
            prev = grade
            
    return normgi, diff, marks

In [25]:
normgi_ca116, diff_ca116, marks_ca116 = get_improvement(student_names_ca116, exam_weeks_ca116, grades_ca116)

In [26]:
def evaluate(student_dict, normgi, diff, marks, exam_weeks):
    
    # Store improvements
    d = {}
    
    for name, students in student_dict.items():
        
        num_students = len(students)
        
        print('Group: {} (# {})'.format(name, num_students))
        
        for exam_week in exam_weeks:
            # Marks
            mrks = [ value for student_name, value in marks[exam_week].items() if student_name in students ]    
            # Avg
            mrks_avg = np.mean(mrks)
            # Std. deviation
            mrks_std = np.std(mrks)
            print('Marks Exam Week {}, Avg: {:.2f} ({:.2f})'.format(exam_week, mrks_avg, mrks_std))
        
        # Array of improvements
        improvements = [ value for student_name, value in normgi.items() if student_name in students ]
        d[name] = improvements
        
        # Avg
        improv_avg = np.mean(improvements)
        # Std. deviation
        improv_std = np.std(improvements)
        
        # Differences
        differences = [ value for student_name, value in diff.items() if student_name in students ]
        
        # Avg
        diff_avg = np.mean(differences)
        # Std. deviation
        diff_std = np.std(differences)
              
        print('* Improvement Avg: {:.2f} ({:.2f}), Diff Avg: {:.2f} ({:.2f})'.format(
            mrks_avg, mrks_std, improv_avg, improv_std, diff_avg, diff_std))
        
    # T-test
    keys = list(student_dict.keys())
    group_one = d[keys[0]]
    group_two = d[keys[1]]
    ttest = ttest_ind(group_one, group_two)
    print(ttest)

In [27]:
evaluate(bot_dict_ca116, normgi_ca116, diff_ca116, marks_ca116, exam_weeks_ca116)

Group: Talked (# 52)
Marks Exam Week 4, Avg: 70.19 (31.79)
Marks Exam Week 8, Avg: 47.12 (35.58)
Marks Exam Week 12, Avg: 33.65 (34.63)
* Improvement Avg: 33.65 (34.63), Diff Avg: -0.18 (0.50)
Group: Not-talked (# 80)
Marks Exam Week 4, Avg: 72.50 (33.45)
Marks Exam Week 8, Avg: 45.62 (37.24)
Marks Exam Week 12, Avg: 32.50 (35.66)
* Improvement Avg: 32.50 (35.66), Diff Avg: -0.21 (0.60)
Ttest_indResult(statistic=0.27004486080036505, pvalue=0.7875539555896455)
