In [1]:
import numpy as np
import pandas as pd
import scipy.stats

pd.options.mode.chained_assignment = None
from matplotlib import pyplot as plt, rcParams
# import cv2
import seaborn as sns

sns.set(style="white", context="paper")
from cycler import cycler
import os, sys
import glob
from datetime import datetime, timedelta
from itertools import combinations
import base64
from PIL import Image
from io import BytesIO as _BytesIO
import requests
import json
import pickle
from datetime import datetime
from IPython.display import display, Markdown, Latex
from sklearn.metrics import *
import collections
from copy import deepcopy
import traceback
# import plotly
# from pandas_profiling import ProfileReport

pd.options.display.max_columns = None


def printm(s): return display(Markdown(s))



# Figure out how many sessions are there, and how many frames for each session

In [2]:
track_analysis_meta_cache = '../cache/analysis_tracking/meta_info'
os.makedirs(track_analysis_meta_cache,exist_ok=True)

In [3]:
base_dir = '/mnt/ci-nas-cache/edulyzeV2/track/'
frame_file_data = {}
for course_idx, course_dir in enumerate(glob.glob(f"{base_dir}/*")):
    course_name = course_dir.split("/")[-1]
    course_cache_file = f"{track_analysis_meta_cache}/{course_name}"
    if os.path.exists(course_cache_file):
        frame_file_data[course_name] = pickle.load(open(course_cache_file,"rb"))
        continue
    frame_file_data[course_name]={}
        
    for session_idx, session_dir in enumerate(glob.glob(f"{course_dir}/*")):
        session_name = session_dir.split("/")[-1]
        frame_file_data[course_name][session_name] = {}
        frame_files = glob.glob(f"{session_dir}/*")
        frame_file_names = [xr.split("/")[-1] for xr in frame_files]
        if 'end.pb' in frame_file_names:
            frame_file_data[course_name][session_name]['is_completed']=True
        else:
            frame_file_data[course_name][session_name]['is_completed']=False            
        frame_ids = [int(xr.split(".")[0]) for xr in frame_file_names if not (xr=='end.pb')]
        frame_file_data[course_name][session_name]['frame_ids'] = sorted(frame_ids)
        frame_file_data[course_name][session_name]['dir_location'] = session_dir
        print(f"Got metadata for course: {course_idx}-{course_name}, session:{session_idx}-{session_name}")
    pickle.dump(frame_file_data[course_name],open(course_cache_file,"wb")) 
        
        
        

In [4]:
frame_file_data.keys()

dict_keys(['17214A', '17214B', '17214C', '05391A', '05410A', '6485A', '17214E', '17214D', '21127J', '21127L', '15819AA', '18715A', '79240A', '82119A', '24352A', '24352B', '24352C', '6705A', '9403A', '19603A', '79388A', '86375A', '9214L', '09217S', '15424A', '15740A', '19646B1', '82235A', '12411B', '17213E', '05682A', '09105S', '21122S', '21260S', '36200S', '57173S', '73102S', '76270S', '79201S', '82101S', '82271S', '05410B', '05418A', '05681A', '05748A', '05772A', '09105A', '09105C', '09105D', '15251C', '15251I', '17346A', '17356A', '21127H', '17356L1'])

In [5]:
# course_name, session_name, is_completed, min_frame, max_frame_number, num_frames, session_duration_mins, 
# Next step: Get how many sessions are completed, and 
session_infos = []
for course_name in frame_file_data:
    session_meta_data = {'course':course_name}
    for session_name in frame_file_data[course_name]:
        # print(session_name)
        session_meta_data['session']=session_name.split("-front")[0]
        try:
            session_meta_data['start_time'] = pd.to_datetime(session_meta_data['session'].split("_")[-1], format="%Y%m%d%H%M")
        except:
            session_meta_data['start_time'] = pd.to_datetime(session_meta_data['session'].split("_")[-1], format="%Y%m%d%H%M%S")
        session_meta_data['start_hour'] = session_meta_data['start_time'].strftime('%H:%M')
        session_meta_data['classroom'] = "_".join(session_meta_data['session'].split("_")[-3:-1])
        meta_data = frame_file_data[course_name][session_name]
        session_meta_data['completed'] = meta_data['is_completed']
        session_meta_data['num_frames'] = len(meta_data['frame_ids'])
        if len(meta_data['frame_ids']) > 0:
            session_meta_data['max_frame'] = max(meta_data['frame_ids'])
            session_meta_data['duration_in_mins'] = max(meta_data['frame_ids'])//(15*60)
        session_infos.append(session_meta_data)
        # print(course_name, session_name)
df_session_info = pd.DataFrame(session_infos)   

In [6]:
session_infos[-1]

{'course': '17356L1',
 'session': 'classinsight-cmu_17356L1_weh_5302_201905021030',
 'start_time': Timestamp('2019-05-02 10:30:00'),
 'start_hour': '10:30',
 'classroom': 'weh_5302',
 'completed': True,
 'num_frames': 23999,
 'max_frame': 71994,
 'duration_in_mins': 79}

In [7]:
df_session_info.head()

Unnamed: 0,course,session,start_time,start_hour,classroom,completed,num_frames,max_frame,duration_in_mins
0,17214A,classinsight-cmu_17214A_dh_2105_201905010930,2019-05-01 09:30:00,09:30,dh_2105,False,0,,
1,17214A,classinsight-cmu_17214A_dh_2105_201905010930,2019-05-01 09:30:00,09:30,dh_2105,False,0,,
2,17214A,classinsight-cmu_17214A_dh_2105_201905010930,2019-05-01 09:30:00,09:30,dh_2105,False,0,,
3,17214A,classinsight-cmu_17214A_dh_2105_201905010930,2019-05-01 09:30:00,09:30,dh_2105,False,0,,
4,17214A,classinsight-cmu_17214A_dh_2105_201905010930,2019-05-01 09:30:00,09:30,dh_2105,False,0,,


In [8]:
df_session_info = df_session_info[df_session_info.completed]
df_session_info['course_num'] = df_session_info.course.apply(lambda x: int(x[:5]) if len(x)>5 else int(x[:4]))
df_session_info['course_num'].value_counts()

9105     104
21127     85
5410      56
6705      41
24352     40
17356     37
17214     36
5748      28
5418      27
5391      27
15251     26
76270     20
82271     17
21260     16
5681      15
73102     14
79201     14
9214      13
15424     13
57173     13
36200     10
15740     10
9403      10
82235      9
5772       8
6485       6
79388      6
86375      5
17346      5
12411      4
19646      3
17213      1
Name: course_num, dtype: int64

# Include course meta info 

In [9]:
df_course_info = pd.read_csv('course_descriptions_1.csv')
df_course_info2 = pd.read_csv('course_descriptions_2.csv')
df_course_info = pd.concat([df_course_info,df_course_info2],ignore_index=True)
df_course_info = df_course_info[~df_course_info.course_name.isnull()]
df_course_info.info()
df_course_info.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63 entries, 0 to 64
Data columns (total 19 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   S.No                                                 63 non-null     int64  
 1   Semester                                             51 non-null     object 
 2   Division                                             51 non-null     object 
 3   Dept                                                 51 non-null     object 
 4   course_num                                           63 non-null     int64  
 5   course_name                                          63 non-null     object 
 6   Course Level                                         51 non-null     object 
 7   Hrs Per Week                                         51 non-null     float64
 8   course_category                                      62 non-null     obj

Unnamed: 0,S.No,Semester,Division,Dept,course_num,course_name,Course Level,Hrs Per Week,course_category,description,Interest in student learning,Clearly explain course requirements,Clear learning objectives & goals,Instructor provides feedback to students to improve,Demonstrate importance of subject matter,Explains subject matter of course,Show respect for all students,Overall teaching rate,Overall course rate
0,1,Fall,Carnegie Institute of Technology,CHE,6705,ADV CHE THRMODYNMCS,Graduate,8.16,theory,Advanced application of the general thermodyna...,4.42,4.63,4.74,4.05,4.58,4.63,4.95,4.47,4.37
1,2,Fall,Mellon College of Science,CMY,9214,PHYSICAL CHEMISTRY,Undergraduate,6.5,theory,This is a one-semester course intended primari...,5.0,4.5,4.5,4.5,5.0,4.5,5.0,4.5,4.5
2,3,Fall,Mellon College of Science,CMY,9403,CMY ADDIC,Undergraduate,5.75,theory,What makes us need something so much that it e...,5.0,4.75,5.0,5.0,5.0,4.88,5.0,5.0,4.88
3,4,Fall,Carnegie Institute of Technology,CEE,12411,PRJ MANGMT CONSTRCTN,Undergraduate,7.46,applied,"Through planning and management, and optimizat...",4.34,3.98,4.05,3.89,4.48,4.02,4.69,3.98,3.72
4,5,Fall,School of Computer Science,CS,15251,GRT IDEAS THERTCL CS,Undergraduate,14.27,theory,This course is about how to use theoretical id...,4.71,4.69,4.72,4.53,4.72,4.72,4.81,4.67,4.72


In [10]:
df_session_info = pd.merge(df_session_info, df_course_info, on=['course_num'])

In [11]:
df_session_info.head()

Unnamed: 0,course,session,start_time,start_hour,classroom,completed,num_frames,max_frame,duration_in_mins,course_num,S.No,Semester,Division,Dept,course_name,Course Level,Hrs Per Week,course_category,description,Interest in student learning,Clearly explain course requirements,Clear learning objectives & goals,Instructor provides feedback to students to improve,Demonstrate importance of subject matter,Explains subject matter of course,Show respect for all students,Overall teaching rate,Overall course rate
0,17214B,classinsight-cmu_17214B_ph_a21_201905011030,2019-05-01 10:30:00,10:30,ph_a21,True,14999,44994.0,49.0,17214,9,Fall,School of Computer Science,ISR,PRIN 0-0 SFTWR CONST,Undergraduate,14.51,applied,Software engineers today are less likely to de...,4.27,4.24,4.38,4.04,4.36,4.16,4.4,4.13,4.13
1,17214B,classinsight-cmu_17214B_ph_a21_201905011030,2019-05-01 10:30:00,10:30,ph_a21,True,14999,44994.0,49.0,17214,28,Spring,School of Computer Science,SE,PRIN 0-0 SFTWR CONST,Undergraduate,13.09,applied,Software engineers today are less likely to de...,4.39,4.18,4.48,3.88,4.51,4.38,4.86,4.26,4.26
2,17214B,classinsight-cmu_17214B_ph_a21_201905011030,2019-05-01 10:30:00,10:30,ph_a21,True,14999,44994.0,49.0,17214,9,Fall,School of Computer Science,ISR,PRIN 0-0 SFTWR CONST,Undergraduate,14.51,applied,Software engineers today are less likely to de...,4.27,4.24,4.38,4.04,4.36,4.16,4.4,4.13,4.13
3,17214B,classinsight-cmu_17214B_ph_a21_201905011030,2019-05-01 10:30:00,10:30,ph_a21,True,14999,44994.0,49.0,17214,28,Spring,School of Computer Science,SE,PRIN 0-0 SFTWR CONST,Undergraduate,13.09,applied,Software engineers today are less likely to de...,4.39,4.18,4.48,3.88,4.51,4.38,4.86,4.26,4.26
4,17214B,classinsight-cmu_17214B_ph_a21_201905011030,2019-05-01 10:30:00,10:30,ph_a21,True,14999,44994.0,49.0,17214,9,Fall,School of Computer Science,ISR,PRIN 0-0 SFTWR CONST,Undergraduate,14.51,applied,Software engineers today are less likely to de...,4.27,4.24,4.38,4.04,4.36,4.16,4.4,4.13,4.13


In [20]:
df_session_info[df_session_info.course=='21127L']

Unnamed: 0,course,session,start_time,start_hour,classroom,completed,num_frames,max_frame,duration_in_mins,course_num,S.No,Semester,Division,Dept,course_name,Course Level,Hrs Per Week,course_category,description,Interest in student learning,Clearly explain course requirements,Clear learning objectives & goals,Instructor provides feedback to students to improve,Demonstrate importance of subject matter,Explains subject matter of course,Show respect for all students,Overall teaching rate,Overall course rate
239,21127L,classinsight-cmu_21127L_ph_a22_201905021630,2019-05-02 16:30:00,16:30,ph_a22,True,14999,44994.0,49.0,21127,13,Fall,Mellon College of Science,MSC,CONCEPTS OF MATHMTCS,Undergraduate,12.60,theory,"This course introduces the basic concepts, ide...",4.04,4.27,4.25,3.73,4.05,4.16,4.15,4.06,3.92
240,21127L,classinsight-cmu_21127L_ph_a22_201905021630,2019-05-02 16:30:00,16:30,ph_a22,True,14999,44994.0,49.0,21127,31,Spring,Mellon College of Science,MSC,CONCEPTS OF MATHMTCS,Undergraduate,11.17,theory,"This course introduces the basic concepts, ide...",4.34,4.42,4.37,3.98,4.33,4.31,4.56,4.29,4.18
241,21127L,classinsight-cmu_21127L_ph_a22_201905021630,2019-05-02 16:30:00,16:30,ph_a22,True,14999,44994.0,49.0,21127,42,Summer,Mellon College of Science,MSC,CONCEPTS OF MATHMTCS,Undergraduate,17.12,theory,"This course introduces the basic concepts, ide...",4.76,4.76,4.82,4.41,4.79,4.79,4.68,4.76,4.62
242,21127L,classinsight-cmu_21127L_ph_a22_201905021630,2019-05-02 16:30:00,16:30,ph_a22,True,14999,44994.0,49.0,21127,13,Fall,Mellon College of Science,MSC,CONCEPTS OF MATHMTCS,Undergraduate,12.60,theory,"This course introduces the basic concepts, ide...",4.04,4.27,4.25,3.73,4.05,4.16,4.15,4.06,3.92
243,21127L,classinsight-cmu_21127L_ph_a22_201905021630,2019-05-02 16:30:00,16:30,ph_a22,True,14999,44994.0,49.0,21127,31,Spring,Mellon College of Science,MSC,CONCEPTS OF MATHMTCS,Undergraduate,11.17,theory,"This course introduces the basic concepts, ide...",4.34,4.42,4.37,3.98,4.33,4.31,4.56,4.29,4.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,21127L,classinsight-cmu_21127L_ph_a22_201905021630,2019-05-02 16:30:00,16:30,ph_a22,True,14999,44994.0,49.0,21127,31,Spring,Mellon College of Science,MSC,CONCEPTS OF MATHMTCS,Undergraduate,11.17,theory,"This course introduces the basic concepts, ide...",4.34,4.42,4.37,3.98,4.33,4.31,4.56,4.29,4.18
319,21127L,classinsight-cmu_21127L_ph_a22_201905021630,2019-05-02 16:30:00,16:30,ph_a22,True,14999,44994.0,49.0,21127,42,Summer,Mellon College of Science,MSC,CONCEPTS OF MATHMTCS,Undergraduate,17.12,theory,"This course introduces the basic concepts, ide...",4.76,4.76,4.82,4.41,4.79,4.79,4.68,4.76,4.62
320,21127L,classinsight-cmu_21127L_ph_a22_201905021630,2019-05-02 16:30:00,16:30,ph_a22,True,14999,44994.0,49.0,21127,13,Fall,Mellon College of Science,MSC,CONCEPTS OF MATHMTCS,Undergraduate,12.60,theory,"This course introduces the basic concepts, ide...",4.04,4.27,4.25,3.73,4.05,4.16,4.15,4.06,3.92
321,21127L,classinsight-cmu_21127L_ph_a22_201905021630,2019-05-02 16:30:00,16:30,ph_a22,True,14999,44994.0,49.0,21127,31,Spring,Mellon College of Science,MSC,CONCEPTS OF MATHMTCS,Undergraduate,11.17,theory,"This course introduces the basic concepts, ide...",4.34,4.42,4.37,3.98,4.33,4.31,4.56,4.29,4.18


In [12]:
df_course_info = df_session_info.groupby(['course','Semester','Course Level','course_category','Division','start_hour']).agg({
    'completed':'count',
    'duration_in_mins':['min','mean','max'],
    'num_frames':['min','mean','max'],}).reset_index()
df_course_info.columns = ['course','Semester','Course Level','course_category','Division','start_hour','num_sessions','dur_min','dur_mean','dur_max','frame_count_min','frame_count_mean','frame_count_max']
df_course_info= df_course_info[df_course_info.dur_mean > 25]
df_course_info.sort_values(by='start_hour',ascending=False).reset_index(drop=True)

Unnamed: 0,course,Semester,Course Level,course_category,Division,start_hour,num_sessions,dur_min,dur_mean,dur_max,frame_count_min,frame_count_mean,frame_count_max
0,21127L,Summer,Undergraduate,theory,Mellon College of Science,16:30,28,49.0,49.0,49.0,14999,14999.0,14999
1,21127L,Spring,Undergraduate,theory,Mellon College of Science,16:30,28,49.0,49.0,49.0,14999,14999.0,14999
2,21127L,Fall,Undergraduate,theory,Mellon College of Science,16:30,28,49.0,49.0,49.0,14999,14999.0,14999
3,24352C,Spring,Undergraduate,applied,Carnegie Institute of Technology,15:30,14,49.0,49.0,49.0,14997,14997.0,14997
4,24352C,Fall,Undergraduate,applied,Carnegie Institute of Technology,15:30,14,49.0,49.0,49.0,14997,14997.0,14997
5,24352B,Spring,Undergraduate,applied,Carnegie Institute of Technology,14:30,14,49.0,49.0,49.0,14999,14999.0,14999
6,24352B,Fall,Undergraduate,applied,Carnegie Institute of Technology,14:30,14,49.0,49.0,49.0,14999,14999.0,14999
7,24352A,Spring,Undergraduate,applied,Carnegie Institute of Technology,13:30,12,49.0,49.0,49.0,14998,14998.0,14998
8,17214E,Fall,Undergraduate,applied,School of Computer Science,13:30,14,49.0,49.0,49.0,14999,14999.0,14999
9,17214E,Spring,Undergraduate,applied,School of Computer Science,13:30,14,49.0,49.0,49.0,14999,14999.0,14999


In [13]:
# 15 courses: 1.5 hours
# 18 courses: 1 hour
# 2 courses: half hour


# Cache single session tracking for id consistency

In [14]:
# sample_course = '17214B'
# sample_session = 'classinsight-cmu_17214B_ph_a21_201905011030-front'
# sample_session_dir = frame_file_data[sample_course][sample_session]['dir_location']
# sample_frame_ids = frame_file_data[sample_course][sample_session]['frame_ids']
# sample_session_dir

In [15]:
# session_tracking_ids = {}
# for frame_id in sample_frame_ids:
#     frame_data = pickle.load(open(f'{sample_session_dir}/{frame_id}.pb','rb'))
#     frame_tracking_ids = [xr['track_id'] for xr in frame_data[1]]
#     # print(frame_id, frame_tracking_ids)
#     session_tracking_ids[frame_id] = {int(xr):1 for xr in frame_tracking_ids}
# df_session_ids = pd.DataFrame.from_dict(session_tracking_ids)

In [16]:
# df_session_ids.head()

In [17]:
# np.unique(np.nansum(df_session_ids.values,axis=1) // 5, return_counts=True)

In [18]:
session_tracking_cache = '../cache/analysis_tracking/session_tracking_info'


In [19]:
unfinished_sessions = []
for course_idx, course in enumerate(frame_file_data):
    for session_idx, session_id in enumerate(frame_file_data[course]):
        session_tracking_cache_file = f"{session_tracking_cache}/{session_id}.pb"
        try:
            if not os.path.exists(session_tracking_cache_file):
                session_dir = frame_file_data[course][session_id]['dir_location']
                frame_ids = frame_file_data[course][session_id]['frame_ids']
                session_tracking_ids = {}
                for frame_id in frame_ids:
                    frame_data = pickle.load(open(f'{session_dir}/{frame_id}.pb','rb'))
                    frame_tracking_ids = [xr['track_id'] for xr in frame_data[1]]
                    # print(frame_id, frame_tracking_ids)
                    session_tracking_ids[frame_id] = {int(xr):1 for xr in frame_tracking_ids}
                df_session_ids = pd.DataFrame.from_dict(session_tracking_ids)
                pickle.dump(df_session_ids, open(session_tracking_cache_file,'wb'))
                print(f"Got tracking info for session: {course_idx}-{course}, session:{session_idx}-{session_id}")
            else:
                ...
                # print(f"FILE EXISTS: tracking info for session: {course_idx}-{course}, session:{session_idx}-{session_id}")
        except:
            print(f"ERROR: Unable to get session tracking for: {course_idx}-{course}, session:{session_idx}-{session_id}")
            unfinished_sessions.append((course, session_id))
            print(traceback.format_exc())
        

Got tracking info for session: 45-05772A, session:1-classinsight-cmu_05772A_ghc_4301_201902051030-front
Got tracking info for session: 45-05772A, session:3-classinsight-cmu_05772A_ghc_4301_201902121030-front
Got tracking info for session: 46-09105A, session:0-classinsight-cmu_09105A_ghc_4102_201901291830-front
Got tracking info for session: 46-09105A, session:11-classinsight-cmu_09105A_ghc_4102_201903071830-front
Got tracking info for session: 46-09105A, session:12-classinsight-cmu_09105A_ghc_4102_201903121830-front
Got tracking info for session: 46-09105A, session:15-classinsight-cmu_09105A_ghc_4102_201903211830-front
Got tracking info for session: 46-09105A, session:27-classinsight-cmu_09105A_ghc_4102_201905021830-front
Got tracking info for session: 47-09105C, session:1-classinsight-cmu_09105C_ghc_5222_201905021830-front
Got tracking info for session: 47-09105C, session:3-classinsight-cmu_09105C_ghc_5222_201902051830-front
Got tracking info for session: 47-09105C, session:5-classins

# Setup tracking id experiment by random selection of courses, sessions and snippets

In [26]:
df_session_info.head()

Unnamed: 0,course,session,start_time,start_hour,classroom,completed,num_frames,max_frame,duration_in_mins,course_num,S.No,Semester,Division,Dept,course_name,Course Level,Hrs Per Week,course_category,description,Interest in student learning,Clearly explain course requirements,Clear learning objectives & goals,Instructor provides feedback to students to improve,Demonstrate importance of subject matter,Explains subject matter of course,Show respect for all students,Overall teaching rate,Overall course rate
0,17214B,classinsight-cmu_17214B_ph_a21_201905011030,2019-05-01 10:30:00,10:30,ph_a21,True,14999,44994.0,49.0,17214,9,Fall,School of Computer Science,ISR,PRIN 0-0 SFTWR CONST,Undergraduate,14.51,applied,Software engineers today are less likely to de...,4.27,4.24,4.38,4.04,4.36,4.16,4.4,4.13,4.13
1,17214B,classinsight-cmu_17214B_ph_a21_201905011030,2019-05-01 10:30:00,10:30,ph_a21,True,14999,44994.0,49.0,17214,28,Spring,School of Computer Science,SE,PRIN 0-0 SFTWR CONST,Undergraduate,13.09,applied,Software engineers today are less likely to de...,4.39,4.18,4.48,3.88,4.51,4.38,4.86,4.26,4.26
2,17214B,classinsight-cmu_17214B_ph_a21_201905011030,2019-05-01 10:30:00,10:30,ph_a21,True,14999,44994.0,49.0,17214,9,Fall,School of Computer Science,ISR,PRIN 0-0 SFTWR CONST,Undergraduate,14.51,applied,Software engineers today are less likely to de...,4.27,4.24,4.38,4.04,4.36,4.16,4.4,4.13,4.13
3,17214B,classinsight-cmu_17214B_ph_a21_201905011030,2019-05-01 10:30:00,10:30,ph_a21,True,14999,44994.0,49.0,17214,28,Spring,School of Computer Science,SE,PRIN 0-0 SFTWR CONST,Undergraduate,13.09,applied,Software engineers today are less likely to de...,4.39,4.18,4.48,3.88,4.51,4.38,4.86,4.26,4.26
4,17214B,classinsight-cmu_17214B_ph_a21_201905011030,2019-05-01 10:30:00,10:30,ph_a21,True,14999,44994.0,49.0,17214,9,Fall,School of Computer Science,ISR,PRIN 0-0 SFTWR CONST,Undergraduate,14.51,applied,Software engineers today are less likely to de...,4.27,4.24,4.38,4.04,4.36,4.16,4.4,4.13,4.13


In [24]:
# randomly select 10 courses.
np.random.seed(43)
prefiltered_courses = df_session_info.course.drop_duplicates().sample(n=10)
print(f"Prefiltered courses:{prefiltered_courses}")

# select 5 sessions per course 
df_prefiltered_sessions = pd.DataFrame()
for course in prefiltered_courses.values:
    if df_session_info[df_session_info.course==course].shape[0]>5:
        df_prefiltered_sessions.append(df_session_info[df_session_info.course==course].sample(n=5))
df_prefiltered_sessions

Prefiltered courses:541     79388A
127     05410B
597     12411B
753     36200S
155     21127J
676     09105D
239     21127L
851     76270S
619     09105A
1120    17346A
Name: course, dtype: object


In [25]:
df_prefiltered_sessions