In [2]:
import pandas as pd
import os
import sys

# The project root is taken to be the parent of the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from scripts import utils

In [3]:
# Course numbers for each of the two survey groups (CG and EE).
courses_cg = [202, 250]
courses_ee = [251, 424, 551]
# Combined list: CG courses first, then EE courses.
courses = [*courses_cg, *courses_ee]
# Semester tag used as the prefix of the per-course file names.
semester = "F2025"

Build the paths to all relevant files

In [4]:
# Directory layout: everything lives under <project_root>/data; the per-course
# data files are in data/student_emails_and_ids and the cleaned survey exports
# are in data/clean.
data_dir = os.path.join(project_root, 'data')
course_data_dir = os.path.join(data_dir, 'student_emails_and_ids')

# Derive the cleaned-survey file names from `semester` instead of hard-coding
# the semester tag, so changing the configuration updates every path at once.
ee_file = f"{semester}_ee_clean.csv"
cg_file = f"{semester}_cg_clean.csv"
# course number -> file name of that course's data file
courses_files = {course: f"{semester}_COMP{course}.csv" for course in courses}

ee_data_path = os.path.join(data_dir, 'clean', ee_file)
cg_data_path = os.path.join(data_dir, 'clean', cg_file)
# course number -> full path of that course's data file
courses_data_path = {
    course: os.path.join(course_data_dir, file_name)
    for course, file_name in courses_files.items()
}

Open the cleaned data files

In [5]:
# Both cleaned exports are read the same way: the first two rows form a
# two-level column header and the first column becomes the row index.
read_options = {"header": [0, 1], "index_col": 0}
ee_data = pd.read_csv(ee_data_path, **read_options)
cg_data = pd.read_csv(cg_data_path, **read_options)
data_all = [ee_data, cg_data]

# The helper's return value is not used here, so it presumably adjusts each
# frame's column MultiIndex in place — TODO confirm against scripts/utils.
for frame in data_all:
    utils.rebuild_multiindex(frame)

Open the course data file and extract IDs

In [6]:
# course number -> list of numeric student IDs read from that course's data file
student_ids = {}
for course in courses:
    id_column = pd.read_csv(courses_data_path[course])["OrgDefinedId"]
    # Remove any leading/trailing '#' before converting the IDs to numbers.
    student_ids[course] = list(pd.to_numeric(id_column.str.strip("#")))

### Duplicates
Check if any student answered the survey twice

In [7]:
# Check duplicate students within EE or CG.
# A student counts as a duplicate when the same StudentID appears on more than
# one row of the same group's data.
no_duplicates = True
for group_name, group_data in (("EE", ee_data), ("CG", cg_data)):
    group_ids = group_data["StudentID"]
    # keep=False marks every occurrence of a repeated ID, not only the later ones.
    repeated_ids = group_ids[group_ids.duplicated(keep=False)]
    if not repeated_ids.empty:
        print("One or more students responded in the survey multiple times within the same group.")
        # Name the group and the affected IDs so the duplicates can be looked up.
        # Note: this writes student IDs into the cell output; clear outputs before sharing.
        print(f"  {group_name} group, duplicated StudentID values: {repeated_ids.unique().tolist()}")

        # TODO: code for handling duplicate
        no_duplicates = False

if no_duplicates:
    print("No duplicate students found within the EE or CG groups.")

No duplicate students found within the EE or CG groups.


In [8]:
# Check duplicate students across EE and CG.
ee_ids = set(ee_data["StudentID"])
cg_ids = set(cg_data["StudentID"])
all_students = ee_ids | cg_ids
# Intersect the two groups' IDs. Comparing the size of the union with the total
# row count would also fire when an ID is repeated inside a single group, which
# is a different problem from answering both surveys.
students_in_both = ee_ids & cg_ids
no_duplicates = not students_in_both
if students_in_both:
    print("One or more students responded in both EE and CG surveys")
    # Note: this writes student IDs into the cell output; clear outputs before sharing.
    print(f"  StudentID values present in both groups: {sorted(students_in_both)}")

    # TODO: code for handling duplicate students

if no_duplicates:
    print("No duplicate students found across the EE and CG groups.")

No duplicate students found across the EE and CG groups.


### Student validation
Verify that students who answered the survey are enrolled in the courses they selected in the survey.

1. EE data

In [9]:
# For every EE respondent, check each EE course they selected (value 1) against
# that course's list of student IDs and report anyone missing from it.
for row_label, row in ee_data.iterrows():
    # Named student_id rather than `id` so the built-in id() is not shadowed
    # for the rest of the kernel session.
    student_id = row["StudentID"][""]
    for course in courses_ee:
        course_name = f"COMP{course}"
        selected = row["EE course"][course_name] == 1
        if selected and student_id not in student_ids[course]:
            # Say who and which course so the mismatch can be followed up.
            # Note: this writes student IDs into the cell output; clear outputs before sharing.
            print(f"Student {student_id} (row {row_label}) doesn't exist in {course_name}")

Student doesn't exist


2. CG data

In [10]:
# For every CG respondent, check each CG course they selected (value 1) against
# that course's list of student IDs and report anyone missing from it.
for row_label, row in cg_data.iterrows():
    # Named student_id rather than `id` so the built-in id() is not shadowed
    # for the rest of the kernel session.
    student_id = row["StudentID"][""]
    for course in courses_cg:
        course_name = f"COMP{course}"
        selected = row["CG course"][course_name] == 1
        if selected and student_id not in student_ids[course]:
            # Say who and which course so the mismatch can be followed up.
            # Note: this writes student IDs into the cell output; clear outputs before sharing.
            print(f"Student {student_id} (row {row_label}) doesn't exist in {course_name}")

Student doesn't exist


### Check how many students attended more than one lecture

In [11]:
# Row-wise total of the "EE course" columns for each respondent.
sum_of_courses_ee = ee_data["EE course"].sum(axis=1)
# Count the respondents whose total is greater than one.
selected_multiple = sum_of_courses_ee > 1
len(sum_of_courses_ee[selected_multiple])

2

### Check if selected courses match survey times

In [13]:
# Convert the survey timestamp columns of both groups to datetime values so
# they can be compared against datetime objects.
for frame in (ee_data, cg_data):
    for column in ("Start time", "Completion time"):
        frame[column] = pd.to_datetime(frame[column])

In [None]:
import datetime

# Reference timestamp for each EE course; the survey start times are compared
# against these in the cells below.
# NOTE(review): hours 1 and 2 are early morning on a 24-hour clock — confirm
# these were not meant to be 13:30 / 14:30.
time_551 = datetime.datetime(year=2025, month=11, day=24, hour=2, minute=30)
time_424 = datetime.datetime(year=2025, month=12, day=1, hour=1, minute=30)
time_251 = datetime.datetime(year=2025, month=12, day=2, hour=2, minute=30)

In [25]:
# Respondents who selected COMP251: True where their survey started before time_251.
selected_251 = ee_data[("EE course", "COMP251")] == 1
ee_data[selected_251]["Start time"] < time_251

3      True
7     False
28    False
31    False
34    False
39    False
42    False
43    False
47    False
48    False
50    False
Name: Start time, dtype: bool

In [27]:
# Respondents who selected COMP424: True where their survey started before time_424.
selected_424 = ee_data[("EE course", "COMP424")] == 1
ee_data[selected_424]["Start time"] < time_424

3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
16    False
17    False
19    False
24    False
25    False
26    False
27    False
29    False
30    False
32    False
33    False
52    False
Name: Start time, dtype: bool

In [26]:
# Respondents who selected COMP551: True where their survey started before time_551.
selected_551 = ee_data[("EE course", "COMP551")] == 1
ee_data[selected_551]["Start time"] < time_551

1     False
2     False
11    False
13    False
14    False
18    False
20    False
21    False
23    False
35    False
36    False
37    False
38    False
40    False
41    False
45    False
46    False
49    False
51    False
Name: Start time, dtype: bool