# Data Import

In [None]:
!free -h

               total        used        free      shared  buff/cache   available
Mem:            12Gi       2.8Gi       6.7Gi       2.0Mi       3.2Gi       9.6Gi
Swap:             0B          0B          0B


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from google.colab import auth
auth.authenticate_user()

In [None]:
import pandas as pd
import numpy as np
from statistics import mean
import math

In [None]:
user_data = pd.read_json('/content/drive/MyDrive/MOOCCube/entities/user.json', lines=True)
user_data

Unnamed: 0,id,name,course_order,enroll_time
0,U_7001215,李喜锋,"[C_course-v1:TsinghuaX+00740043_2x_2015_T2+sp,...","[2017-05-01 11:07:53, 2017-05-17 10:07:17, 201..."
1,U_10402446,五元香,"[C_course-v1:TsinghuaX+00510888X+2019_T1, C_co...","[2019-06-14 08:50:04, 2019-01-04 20:36:07]"
2,U_10359065,魏珊,"[C_course-v1:TsinghuaX+00612642X+sp, C_course-...","[2019-01-18 21:19:56, 2019-01-14 21:54:54]"
3,U_7423998,郭海滨,"[C_course-v1:TsinghuaX+30240184_2X+sp, C_cours...","[2017-08-16 10:38:11, 2018-07-01 18:24:24, 201..."
4,U_545306,李其艳,"[C_course-v1:TsinghuaX+20430064_2X+sp, C_cours...","[2018-09-05 15:40:40, 2019-02-28 10:08:49, 201..."
...,...,...,...,...
199194,U_9447602,羊舌半凡,"[C_course-v1:TsinghuaX+00612642X+sp, C_course-...","[2018-07-08 15:02:53, 2018-08-24 10:04:58, 201..."
199195,U_7517918,焦彭越,"[C_course-v1:TsinghuaX+20250064+sp, C_course-v...","[2018-05-06 16:37:18, 2017-08-28 18:10:53]"
199196,U_8665537,锺才俊,"[C_course-v1:TsinghuaX+30240243X+sp, C_course-...","[2018-03-12 11:02:22, 2018-05-03 10:29:23]"
199197,U_10621245,忻盼曼,"[C_course-v1:TsinghuaX+30700313X+2019_T1, C_co...","[2019-03-19 13:28:00, 2019-04-23 20:28:01, 201..."


Import main data which come from user entities. The historical sequence of course for each user will be the main input or data that will be used for modelling.

# Data Understanding and Preprocessing - Model Evaluation

In [None]:
user_data.shape

(199199, 4)

In [None]:
user_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199199 entries, 0 to 199198
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            199199 non-null  object
 1   name          199199 non-null  object
 2   course_order  199199 non-null  object
 3   enroll_time   199199 non-null  object
dtypes: object(4)
memory usage: 6.1+ MB


## Missing data

In [None]:
user_data.isnull().sum()

id              0
name            0
course_order    0
enroll_time     0
dtype: int64

In [None]:
user_data.isnull().sum().any()

False

In [None]:
user_data.isnull().sum()

id              0
name            0
course_order    0
enroll_time     0
dtype: int64

In [None]:
user_data.isnull().sum().any()

False

No missing data

## Check the number of courses taken for each student

In [None]:
num_course = []

for i in range(len(user_data)):
  num_course.append(len(user_data['course_order'][i]))

user_data['number_of_course'] = num_course

In [None]:
user_data

Unnamed: 0,id,name,course_order,enroll_time,number_of_course
0,U_7001215,李喜锋,"[C_course-v1:TsinghuaX+00740043_2x_2015_T2+sp,...","[2017-05-01 11:07:53, 2017-05-17 10:07:17, 201...",5
1,U_10402446,五元香,"[C_course-v1:TsinghuaX+00510888X+2019_T1, C_co...","[2019-06-14 08:50:04, 2019-01-04 20:36:07]",2
2,U_10359065,魏珊,"[C_course-v1:TsinghuaX+00612642X+sp, C_course-...","[2019-01-18 21:19:56, 2019-01-14 21:54:54]",2
3,U_7423998,郭海滨,"[C_course-v1:TsinghuaX+30240184_2X+sp, C_cours...","[2017-08-16 10:38:11, 2018-07-01 18:24:24, 201...",7
4,U_545306,李其艳,"[C_course-v1:TsinghuaX+20430064_2X+sp, C_cours...","[2018-09-05 15:40:40, 2019-02-28 10:08:49, 201...",10
...,...,...,...,...,...
199194,U_9447602,羊舌半凡,"[C_course-v1:TsinghuaX+00612642X+sp, C_course-...","[2018-07-08 15:02:53, 2018-08-24 10:04:58, 201...",5
199195,U_7517918,焦彭越,"[C_course-v1:TsinghuaX+20250064+sp, C_course-v...","[2018-05-06 16:37:18, 2017-08-28 18:10:53]",2
199196,U_8665537,锺才俊,"[C_course-v1:TsinghuaX+30240243X+sp, C_course-...","[2018-03-12 11:02:22, 2018-05-03 10:29:23]",2
199197,U_10621245,忻盼曼,"[C_course-v1:TsinghuaX+30700313X+2019_T1, C_co...","[2019-03-19 13:28:00, 2019-04-23 20:28:01, 201...",5


## Filter Course Sequence

In [None]:
user_data_range = user_data[(user_data["number_of_course"] >= 5) & (user_data["number_of_course"] <= 10) ]

Use appropriate minum and maximum number of courses for simplicity. I choose to have courses sequence within 5 to 10 courses for each student

In [None]:
user_data_range

Unnamed: 0,id,name,course_order,enroll_time,number_of_course
0,U_7001215,李喜锋,"[C_course-v1:TsinghuaX+00740043_2x_2015_T2+sp,...","[2017-05-01 11:07:53, 2017-05-17 10:07:17, 201...",5
3,U_7423998,郭海滨,"[C_course-v1:TsinghuaX+30240184_2X+sp, C_cours...","[2017-08-16 10:38:11, 2018-07-01 18:24:24, 201...",7
4,U_545306,李其艳,"[C_course-v1:TsinghuaX+20430064_2X+sp, C_cours...","[2018-09-05 15:40:40, 2019-02-28 10:08:49, 201...",10
7,U_7594103,巴鹏,"[C_course-v1:TsinghuaX+30240184+sp, C_course-v...","[2018-07-15 20:45:57, 2018-01-18 20:59:34, 201...",6
13,U_3234246,称语山,"[C_course-v1:TsinghuaX+00740123_X+sp, C_course...","[2017-02-27 22:15:55, 2018-12-08 20:23:01, 201...",5
...,...,...,...,...,...
199185,U_6380402,梁安康,"[C_course-v1:XJTU+C00204+sp, C_course-v1:Tsing...","[2018-09-19 16:51:11, 2017-09-20 11:11:01, 201...",6
199193,U_9447603,薛烨烨,"[C_course-v1:TsinghuaX+64100033X+SP, C_course-...","[2018-07-19 18:25:08, 2019-03-10 21:03:39, 201...",8
199194,U_9447602,羊舌半凡,"[C_course-v1:TsinghuaX+00612642X+sp, C_course-...","[2018-07-08 15:02:53, 2018-08-24 10:04:58, 201...",5
199197,U_10621245,忻盼曼,"[C_course-v1:TsinghuaX+30700313X+2019_T1, C_co...","[2019-03-19 13:28:00, 2019-04-23 20:28:01, 201...",5


## Data sample

In [None]:
user_data_sample = user_data_range.sample(frac=0.3, random_state = 41)

In [None]:
user_data_sample

Unnamed: 0,id,name,course_order,enroll_time,number_of_course
6865,U_758099,漆玉,"[C_course-v1:TsinghuaX+00740043X_2015_T2+sp, C...","[2017-10-29 22:19:03, 2016-01-08 13:20:08, 201...",9
156830,U_9265115,太叔紫翠,"[C_course-v1:TsinghuaX+10430484X_2015_2+sp, C_...","[2018-06-14 18:08:34, 2018-09-09 20:12:24, 201...",6
57340,U_6915226,麦凝荷,"[C_course-v1:TsinghuaX+00690212X+sp, C_course-...","[2017-08-21 19:59:48, 2019-05-26 19:59:59, 201...",5
126720,U_2548979,赛寻双,"[C_course-v1:TsinghuaX+10421075X_2015_2+sp, C_...","[2018-05-02 10:12:06, 2017-07-04 08:21:05, 201...",8
93893,U_7150581,那妙竹,"[C_course-v1:TsinghuaX+40050444X+sp, C_course-...","[2017-08-13 19:21:01, 2018-03-16 09:45:49, 201...",5
...,...,...,...,...,...
58752,U_6635930,门元恺,"[C_course-v1:TsinghuaX+Pr20170406-SC_p1+sp, C_...","[2017-08-14 10:53:21, 2017-08-04 07:36:53, 201...",8
73942,U_3133278,声谷,"[C_course-v1:TsinghuaX+30700313X+sp, C_course-...","[2019-08-12 15:56:34, 2019-04-20 20:24:58, 201...",6
118769,U_1976347,似初南,"[C_course-v1:CUNY+20171211001+sp, C_course-v1:...","[2019-05-19 10:19:37, 2018-11-16 17:07:59, 201...",6
99802,U_5325210,陈承,"[C_course-v1:TsinghuaX+70250023X_2015_2+sp, C_...","[2016-11-10 15:26:17, 2017-03-23 22:54:06, 201...",9


Data is further filtered by sampling, only take 30% from 30K rows.

In [None]:
def v_counts(dataframe):
    for i in dataframe :
        print(dataframe[i].value_counts())
        print("_____________________________________________________________________________")

v_counts(user_data_sample)

U_758099      1
U_9005717     1
U_9589535     1
U_8386334     1
U_101472      1
             ..
U_10443694    1
U_9889935     1
U_160094      1
U_459021      1
U_8028961     1
Name: id, Length: 8961, dtype: int64
_____________________________________________________________________________
漆玉     1
丁永桂    1
空锐立    1
厚天空    1
融鸿宝    1
      ..
庚欣嘉    1
仙信厚    1
陈令山    1
英凝雁    1
禽瀚漠    1
Name: name, Length: 8961, dtype: int64
_____________________________________________________________________________
[C_course-v1:MITx+6_00_1x+sp, C_course-v1:TsinghuaX+00740123_X+sp, C_course-v1:TsinghuaX+30240243X+sp, C_course-v1:TsinghuaX+30240184+sp, C_course-v1:TsinghuaX+30240184_2X+sp]                                                                                                                                                                                                              2
[C_course-v1:TsinghuaX+20250103X+sp, C_course-v1:TsinghuaX+20250064X+sp, C_course-v1:TsinghuaX+20250064+sp, C_

## Course Sequence Gathering and Transfomration

In [None]:
course_order_list = user_data_sample["course_order"].tolist()
course_order_list

[['C_course-v1:TsinghuaX+00740043X_2015_T2+sp',
  'C_course-v1:TsinghuaX+10421084X_2015_2+sp',
  'C_course-v1:TsinghuaX+00690092X+sp',
  'C_course-v1:TsinghuaX+30240243X+sp',
  'C_course-v1:TsinghuaX+10430484X_2015_2+sp',
  'C_course-v1:TsinghuaX+20430064_2X+sp',
  'C_course-v1:TsinghuaX+10430494X_2015_2+sp',
  'C_course-v1:TsinghuaX+20430064X+sp',
  'C_course-v1:TsinghuaX+00740043_2x_2015_T2+sp'],
 ['C_course-v1:TsinghuaX+10430484X_2015_2+sp',
  'C_course-v1:TsinghuaX+20330334X_2015_2+sp',
  'C_course-v1:TsinghuaX+10430494X_2015_2+sp',
  'C_course-v1:TsinghuaX+20220053X_2015_T2+sp',
  'C_course-v1:TsinghuaX+00740043X_2015_T2+sp',
  'C_course-v1:CQU+MATH20041X+2019_T1'],
 ['C_course-v1:TsinghuaX+00690212X+sp',
  'C_course-v1:TsinghuaX+00670122X+2019_T1',
  'C_course-v1:Tsinghua+20150001+sp',
  'C_course-v1:UQx+Think101x+sp',
  'C_course-v1:TsinghuaX+00670122X+sp'],
 ['C_course-v1:TsinghuaX+10421075X_2015_2+sp',
  'C_course-v1:TsinghuaX+20220332X+sp',
  'C_course-v1:TsinghuaX+80240372X+

obtain the course sequence for each student and transform to list for further processing.

## Zero Padding

In [None]:
zero_pad_course_order = []

for i in course_order_list:
  zero_pad_course_order.append([0]*(10-len(i))+i)

In [None]:
zero_pad_course_order

[[0,
  'C_course-v1:TsinghuaX+00740043X_2015_T2+sp',
  'C_course-v1:TsinghuaX+10421084X_2015_2+sp',
  'C_course-v1:TsinghuaX+00690092X+sp',
  'C_course-v1:TsinghuaX+30240243X+sp',
  'C_course-v1:TsinghuaX+10430484X_2015_2+sp',
  'C_course-v1:TsinghuaX+20430064_2X+sp',
  'C_course-v1:TsinghuaX+10430494X_2015_2+sp',
  'C_course-v1:TsinghuaX+20430064X+sp',
  'C_course-v1:TsinghuaX+00740043_2x_2015_T2+sp'],
 [0,
  0,
  0,
  0,
  'C_course-v1:TsinghuaX+10430484X_2015_2+sp',
  'C_course-v1:TsinghuaX+20330334X_2015_2+sp',
  'C_course-v1:TsinghuaX+10430494X_2015_2+sp',
  'C_course-v1:TsinghuaX+20220053X_2015_T2+sp',
  'C_course-v1:TsinghuaX+00740043X_2015_T2+sp',
  'C_course-v1:CQU+MATH20041X+2019_T1'],
 [0,
  0,
  0,
  0,
  0,
  'C_course-v1:TsinghuaX+00690212X+sp',
  'C_course-v1:TsinghuaX+00670122X+2019_T1',
  'C_course-v1:Tsinghua+20150001+sp',
  'C_course-v1:UQx+Think101x+sp',
  'C_course-v1:TsinghuaX+00670122X+sp'],
 [0,
  0,
  'C_course-v1:TsinghuaX+10421075X_2015_2+sp',
  'C_course-v1:

Zero left pad to ensure all the data have similar length

## Check Sequence Length of the Data

In [None]:
list_check = []
for i in zero_pad_course_order:
  list_check.append(len(i))

In [None]:
array_check = np.array(list_check)
print(np.unique(array_check))

[10]


All data contained 10 sequence lengths.

## Course ID Gathering

### Extract all course ID in the dataset

In [None]:
course_all_data = pd.read_json('/content/drive/MyDrive/MOOCCube/entities/course.json', lines=True)
course_all_data

Unnamed: 0,id,name,prerequisites,about,core_id,video_order,display_name,chapter
0,C_course-v1:McGillX+ATOC185x+2015_T1,自然灾害（自主模式）,无,<p>地球上没有一处地方不发生自然灾害。当我们以科学的眼光看待这些自然灾害的原因和本质时，我...,C_course-v1:McGillX+ATOC185x+2015_T1,"[V_f6f710068b994452885b90e11b6ee5c5, V_7339568...","[Video: Overview 1, Video: Overview 2, Video: ...","[01.02.01.02, 01.02.03.02, 01.02.05.02, 01.02...."
1,C_course-v1:TsinghuaX+THESIS2015X+2015_T1,2015年清华大学研究生学位论文答辩（二）,无先修要求,<p>学位论文答辩环节是研究生培养的重要环节，为了充分发挥该环节的育人作用，搭建学术交流的平...,C_course-v1:TsinghuaX+THESIS2015X+2015_T1,"[V_de0371575a9f4b5391c89ad16d68b5c2, V_d632034...","[答辩陈述, 答辩陈述, 问答及答辩结果, 答辩陈述, 问答及答辩结果, 答辩陈述, 问答及...","[01.01.03.01, 01.02.03.01, 01.02.04.01, 01.03...."
2,C_course-v1:TsinghuaX+THESIS2014_1X_tv+_2014_,2014年清华大学研究生学位论文答辩（一）,无先修要求,<p>学位论文答辩环节是研究生培养的重要环节，为了充分发挥该环节的育人作用，搭建学术交流的平...,C_course-v1:TsinghuaX+THESIS2014_1X_tv+_2014_,"[V_d530be9cc0584317a16706684577a6dd, V_f329a62...","[论文答辩实况, 问答及答辩结果, 导师评价, 同学眼中的王鑫, 个人学术感言, 吴宇恩答辩...","[01.01.03.01, 01.01.04.01, 01.01.05.01, 01.01...."
3,C_course-v1:TsinghuaX+THESIS2015X_tv+2015_T1,2015年清华大学研究生学位论文答辩（一）,无先修要求,<p>学位论文答辩环节是研究生培养的重要环节，为了充分发挥该环节的育人作用，搭建学术交流的平...,C_course-v1:TsinghuaX+THESIS2015X+2015_T1,"[V_de0371575a9f4b5391c89ad16d68b5c2, V_78a8b41...","[答辩陈述, 问答及答辩结果, 答辩陈述, 问答及答辩结果, 答辩陈述, 问答及答辩结果, ...","[01.01.03.01, 01.01.04.01, 01.02.03.01, 01.02...."
4,C_course-v1:TsinghuaX+00690242+sp,文物精品与文化中国（自主模式）,无,<p>中国考古学是以往100年中发展最为迅速的领域之一，大批珍贵文物的出土，不断刷新人们对文...,C_course-v1:TsinghuaX+00690242+sp,"[V_d7dbd0fe8f504e7a91d863cd5a19b185, V_4492eca...","[文献所见原始渡河工具, 《禹贡》所见的水路交通, 绰墩山出土的渡河浮木, 舟船考古发掘, ...","[01.01.01.01, 01.01.02.01, 01.02.01.01, 01.03...."
...,...,...,...,...,...,...,...,...
701,C_course-v1:TW+2018052501X+2019_T3,STEM课程设计与案例分析（2019秋）,无,<p>\r\n\t《STEM课程设计与案例分析》课程，它是：<br />\r\n1.国内领先...,C_course-v1:TW+2018052501X+2019_T3,"[V_8f28c813854c444f98950e5404301f0b, V_f23a8ad...","[何为 STEM 教育, STEM 教育的起源与发展, 世界各国的 STEM 教育政策, 为...","[01.01.01.02, 01.02.01.01, 01.03.01.01, 01.04...."
702,C_course-v1:TsinghuaX+60700052X+2019_T2,数据科学导论（2019夏）,本课程适合各类学科背景学生学习,本课程作为数据科学的先导课和认知类课程，致力于以形象生动的教学模式为学生普及数据挖掘、大数据...,C_course-v1:TsinghuaX+60700052X+2019_T1,"[V_2aa0936927744c3b9f1dcc8b5bc30fba, V_2def333...","[Video, Video, Video, Video, Video, Video, Vid...","[01.01.01.01, 01.02.01.01, 01.03.01.01, 01.04...."
703,C_course-v1:TsinghuaX+80515182X+2019_sp,麦肯锡“全球领导力”自主模式,Basic knowledge of business administration,<p>\r\n\t脱胎于清华经管学院与麦肯锡公司联合开设的线下金牌课程，麦肯锡全球领导力在线...,C_course-v1:TsinghuaX+80515182X+2019_sp,"[V_ab97fe9d9a744958b886aa74d44146f8, V_8973856...","[Video, Video, Video, Video, Video, Video, Vid...","[01.01.01.01, 01.02.01.01, 01.03.01.01, 02.01...."
704,C_course-v1:NEU+2019012201X+2019_T2,物理化学（上）（2019夏）,高等数学,物理化学（上）是一门运用物理学手段和技术研究化学问题的学科，被誉为化学学科的“大脑”。本课程...,C_course-v1:NEU+2019012201X+2019_T2,"[V_6595d9e0894848348287b8abaa30dce6, V_4dc4a13...","[Video, Video, Video, Video, Video, Video, Vid...","[01.01.01.01, 02.01.01.01, 02.02.01.01, 02.03...."


In [None]:
all_course = [i for i in course_all_data['id']]

In [None]:
all_course

['C_course-v1:McGillX+ATOC185x+2015_T1',
 'C_course-v1:TsinghuaX+THESIS2015X+2015_T1',
 'C_course-v1:TsinghuaX+THESIS2014_1X_tv+_2014_',
 'C_course-v1:TsinghuaX+THESIS2015X_tv+2015_T1',
 'C_course-v1:TsinghuaX+00690242+sp',
 'C_course-v1:HNU+HNU001+sp',
 'C_course-v1:TsinghuaX+00720091X+sp',
 'C_course-v1:TsinghuaX+00612642X+sp',
 'C_course-v1:TsinghuaX+00691153X+sp',
 'C_course-v1:UQx+Think101x+sp',
 'C_course-v1:TsinghuaX+JWWCD001+2015_T2',
 'C_course-v1:TsinghuaX+70660542X+2015_T2',
 'C_course-v1:TsinghuaX+70120073X+sp',
 'C_course-v1:PekingX+Peking001x+_',
 'C_course-v1:UST+UST001+sp',
 'C_course-v1:SDUx+00931800X+sp',
 'C_course-v1:TsinghuaX+00690212X+sp',
 'C_course-v1:TsinghuaX+00680082X+sp',
 'C_course-v1:TsinghuaX+30260112X+sp',
 'C_course-v1:TsinghuaX+00310222X+sp',
 'C_course-v1:BIT+PHY1701702+sp',
 'C_course-v1:BIT+PHY1701701+sp',
 'C_course-v1:TsinghuaX+80515522X+sp',
 'C_course-v1:TsinghuaX+34000888X+sp',
 'C_course-v1:TsinghuaX+00690092X+sp',
 'C_course-v1:TsinghuaX+0069

Gather all courses in the dataset.

### Extract all course ID in the sample data

In [None]:
course_sample = [j for sub in course_order_list for j in sub]

In [None]:
course_sample

['C_course-v1:TsinghuaX+00740043X_2015_T2+sp',
 'C_course-v1:TsinghuaX+10421084X_2015_2+sp',
 'C_course-v1:TsinghuaX+00690092X+sp',
 'C_course-v1:TsinghuaX+30240243X+sp',
 'C_course-v1:TsinghuaX+10430484X_2015_2+sp',
 'C_course-v1:TsinghuaX+20430064_2X+sp',
 'C_course-v1:TsinghuaX+10430494X_2015_2+sp',
 'C_course-v1:TsinghuaX+20430064X+sp',
 'C_course-v1:TsinghuaX+00740043_2x_2015_T2+sp',
 'C_course-v1:TsinghuaX+10430484X_2015_2+sp',
 'C_course-v1:TsinghuaX+20330334X_2015_2+sp',
 'C_course-v1:TsinghuaX+10430494X_2015_2+sp',
 'C_course-v1:TsinghuaX+20220053X_2015_T2+sp',
 'C_course-v1:TsinghuaX+00740043X_2015_T2+sp',
 'C_course-v1:CQU+MATH20041X+2019_T1',
 'C_course-v1:TsinghuaX+00690212X+sp',
 'C_course-v1:TsinghuaX+00670122X+2019_T1',
 'C_course-v1:Tsinghua+20150001+sp',
 'C_course-v1:UQx+Think101x+sp',
 'C_course-v1:TsinghuaX+00670122X+sp',
 'C_course-v1:TsinghuaX+10421075X_2015_2+sp',
 'C_course-v1:TsinghuaX+20220332X+sp',
 'C_course-v1:TsinghuaX+80240372X+2019_T1',
 'C_course-v1:Ts

In [None]:
course_sample = np.unique(course_sample)

In [None]:
course_sample

array(['C_course-v1:ACCA+FA1_X+2019_T1', 'C_course-v1:ACCA+FA1_X+sp',
       'C_course-v1:ACCA+FA1_X_en+2019_T1',
       'C_course-v1:ACCA+FA2_X+2019_T1', 'C_course-v1:ACCA+FA2_X+sp',
       'C_course-v1:ACCA+FA2_X_en+2019_T1',
       'C_course-v1:ACCA+MA1_X+2019_T1', 'C_course-v1:ACCA+MA1_X+sp',
       'C_course-v1:ACCA+MA1_X_en+2019_T1',
       'C_course-v1:ACCA+MA2_X+2019_T1', 'C_course-v1:ACCA+MA2_X+sp',
       'C_course-v1:ACCA+MA2_X_en+2019_T1',
       'C_course-v1:AdelaideX+Wine101x+sp',
       'C_course-v1:AdelaideX+humbio101+sp',
       'C_course-v1:BFU+15002360+2019_T1',
       'C_course-v1:BFU+15012510+2019_T1',
       'C_course-v1:BFU+15023710+2019_T1',
       'C_course-v1:BFU+2018122709+2018_T2',
       'C_course-v1:BFU+2018122709+2019_T1',
       'C_course-v1:BFU+2018122710+2019_T1',
       'C_course-v1:BIFT+1301990078+2019_T1',
       'C_course-v1:BIFT+2018122901X+2018_T2',
       'C_course-v1:BIFT+2018122902X+2018_T2',
       'C_course-v1:BIT+100070018+2019_T1',
       

Gather all course in the data sample

In [None]:
len(course_sample)

660

There are 660 courses in the data sample.


## Course and Teacher Relation

In [None]:
teacher_course_relation = pd.read_csv('/content/drive/MyDrive/MOOCCube/relations/teacher-course.json', sep='\t', header=None)
teacher_course_relation

Unnamed: 0,0,1
0,T_方维奇,C_course-v1:SPI+20170828001x+sp
1,T_方维奇,C_course-v1:SXPI+20170828001x+2019_T1
2,T_范茂魁,C_course-v1:PSFFC+2018102405X+2018_T2
3,T_连小珉,C_course-v1:TsinghuaX+70150104_2X+2019_T1
4,T_连小珉,C_course-v1:TsinghuaX+70150104X+2019_T1
...,...,...
2344,T_徐君莉,C_course-v1:NEU+2018122401X+2019_T1
2345,T_薛庆,C_course-v1:BIT+100070018+2019_T2
2346,T_史静琤,C_course-v1:CSU+20180919X+2019_T1
2347,T_刘小冰,C_course-v1:TsinghuaX+80515522X+sp


In [None]:
teacher_course_relation.set_axis(['teacher', 'course'], axis='columns', inplace=True)

  teacher_course_relation.set_axis(['teacher', 'course'], axis='columns', inplace=True)


Set the columns name.

In [None]:
teacher_course_relation

Unnamed: 0,teacher,course
0,T_方维奇,C_course-v1:SPI+20170828001x+sp
1,T_方维奇,C_course-v1:SXPI+20170828001x+2019_T1
2,T_范茂魁,C_course-v1:PSFFC+2018102405X+2018_T2
3,T_连小珉,C_course-v1:TsinghuaX+70150104_2X+2019_T1
4,T_连小珉,C_course-v1:TsinghuaX+70150104X+2019_T1
...,...,...
2344,T_徐君莉,C_course-v1:NEU+2018122401X+2019_T1
2345,T_薛庆,C_course-v1:BIT+100070018+2019_T2
2346,T_史静琤,C_course-v1:CSU+20180919X+2019_T1
2347,T_刘小冰,C_course-v1:TsinghuaX+80515522X+sp


In [None]:
teacher_course_relation = teacher_course_relation.iloc[:,[1, 0]]

Change the position of each column.

In [None]:
teacher_course_relation = teacher_course_relation[teacher_course_relation['course'].isin(course_sample)]

Filter the dataframe where it contains courses only from the sample data.

In [None]:
teacher_course_relation

Unnamed: 0,course,teacher
0,C_course-v1:SPI+20170828001x+sp,T_方维奇
1,C_course-v1:SXPI+20170828001x+2019_T1,T_方维奇
2,C_course-v1:PSFFC+2018102405X+2018_T2,T_范茂魁
3,C_course-v1:TsinghuaX+70150104_2X+2019_T1,T_连小珉
4,C_course-v1:TsinghuaX+70150104X+2019_T1,T_连小珉
...,...,...
2344,C_course-v1:NEU+2018122401X+2019_T1,T_徐君莉
2345,C_course-v1:BIT+100070018+2019_T2,T_薛庆
2346,C_course-v1:CSU+20180919X+2019_T1,T_史静琤
2347,C_course-v1:TsinghuaX+80515522X+sp,T_刘小冰


In [None]:
len(np.unique(teacher_course_relation['course']))

651

A dataframe of course with their respective teacher is created. The dataframe consists only of courses from the sample data.

## Course and School Relation

In [None]:
school_course_relation = pd.read_csv('/content/drive/MyDrive/MOOCCube/relations/school-course.json', sep='\t', header=None)
school_course_relation

Unnamed: 0,0,1
0,S_BNU,C_course-v1:BNU+CSL21148501+2018_T2
1,S_BNU,C_course-v1:BNU+GE310141091+2019_T1
2,S_BNU,C_course-v1:BNU+2017112001X+2019_T1
3,S_BNU,C_course-v1:BNU+2018122603X+2018_T2
4,S_BNU,C_course-v1:BNU+CSL21126882+2019_T1
...,...,...
700,S_PSFFC,C_course-v1:PSFFC+2018102405X+2018_T2
701,S_PSFFC,C_course-v1:PSFFC+2018102403X+2018_T2
702,S_PSFFC,C_course-v1:PSFFC+2018102406X+2018_T2
703,S_PSFFC,C_course-v1:PSFFC+2018102404X+2018_T2


In [None]:
school_course_relation.set_axis(['school', 'course'], axis='columns', inplace=True)

  school_course_relation.set_axis(['school', 'course'], axis='columns', inplace=True)


Set the columns name.

In [None]:
school_course_relation

Unnamed: 0,school,course
0,S_BNU,C_course-v1:BNU+CSL21148501+2018_T2
1,S_BNU,C_course-v1:BNU+GE310141091+2019_T1
2,S_BNU,C_course-v1:BNU+2017112001X+2019_T1
3,S_BNU,C_course-v1:BNU+2018122603X+2018_T2
4,S_BNU,C_course-v1:BNU+CSL21126882+2019_T1
...,...,...
700,S_PSFFC,C_course-v1:PSFFC+2018102405X+2018_T2
701,S_PSFFC,C_course-v1:PSFFC+2018102403X+2018_T2
702,S_PSFFC,C_course-v1:PSFFC+2018102406X+2018_T2
703,S_PSFFC,C_course-v1:PSFFC+2018102404X+2018_T2


In [None]:
school_course_relation = school_course_relation.iloc[:,[1, 0]]

Change the position of school and course feature.

In [None]:
school_course_relation

Unnamed: 0,course,school
0,C_course-v1:BNU+CSL21148501+2018_T2,S_BNU
1,C_course-v1:BNU+GE310141091+2019_T1,S_BNU
2,C_course-v1:BNU+2017112001X+2019_T1,S_BNU
3,C_course-v1:BNU+2018122603X+2018_T2,S_BNU
4,C_course-v1:BNU+CSL21126882+2019_T1,S_BNU
...,...,...
700,C_course-v1:PSFFC+2018102405X+2018_T2,S_PSFFC
701,C_course-v1:PSFFC+2018102403X+2018_T2,S_PSFFC
702,C_course-v1:PSFFC+2018102406X+2018_T2,S_PSFFC
703,C_course-v1:PSFFC+2018102404X+2018_T2,S_PSFFC


In [None]:
school_course_relation = school_course_relation[school_course_relation['course'].isin(course_sample)]

Filter the dataframe where it contains courses only from the sample data.

In [None]:
school_course_relation

Unnamed: 0,course,school
0,C_course-v1:BNU+CSL21148501+2018_T2,S_BNU
1,C_course-v1:BNU+GE310141091+2019_T1,S_BNU
3,C_course-v1:BNU+2018122603X+2018_T2,S_BNU
4,C_course-v1:BNU+CSL21126882+2019_T1,S_BNU
5,C_course-v1:BNU+0610073991+2019_T1,S_BNU
...,...,...
699,C_course-v1:PSFFC+2018102402X+2018_T2,S_PSFFC
700,C_course-v1:PSFFC+2018102405X+2018_T2,S_PSFFC
701,C_course-v1:PSFFC+2018102403X+2018_T2,S_PSFFC
702,C_course-v1:PSFFC+2018102406X+2018_T2,S_PSFFC


A dataframe of course with their respective school is created. The dataframe consists only of courses from the sample data.

## Course and Concept Relation

In [None]:
course_concept_relation = pd.read_csv('/content/drive/MyDrive/MOOCCube/relations/course-concept.json', sep='\t', header=None)
course_concept_relation

Unnamed: 0,0,1
0,C_course-v1:KMUSTX+1803168+2019_T1,K_活性炭_化学
1,C_course-v1:KMUSTX+1803168+2019_T1,K_内切_数学
2,C_course-v1:KMUSTX+1803168+2019_T1,K_缺陷_管理科学技术
3,C_course-v1:KMUSTX+1803168+2019_T1,K_氨基酸_化学
4,C_course-v1:KMUSTX+1803168+2019_T1,K_寡核苷酸_化学
...,...,...
167746,C_course-v1:TsinghuaX+90640012X+sp,K_奖金_管理科学技术
167747,C_course-v1:TsinghuaX+90640012X+sp,K_嗅觉_心理学
167748,C_course-v1:TsinghuaX+90640012X+sp,K_使用者_管理科学技术
167749,C_course-v1:TsinghuaX+90640012X+sp,K_股票_管理科学技术


In [None]:
course_concept_relation.set_axis(['course', 'concept'], axis='columns', inplace=True)

  course_concept_relation.set_axis(['course', 'concept'], axis='columns', inplace=True)


Set the columns name

In [None]:
course_concept_relation

Unnamed: 0,course,concept
0,C_course-v1:KMUSTX+1803168+2019_T1,K_活性炭_化学
1,C_course-v1:KMUSTX+1803168+2019_T1,K_内切_数学
2,C_course-v1:KMUSTX+1803168+2019_T1,K_缺陷_管理科学技术
3,C_course-v1:KMUSTX+1803168+2019_T1,K_氨基酸_化学
4,C_course-v1:KMUSTX+1803168+2019_T1,K_寡核苷酸_化学
...,...,...
167746,C_course-v1:TsinghuaX+90640012X+sp,K_奖金_管理科学技术
167747,C_course-v1:TsinghuaX+90640012X+sp,K_嗅觉_心理学
167748,C_course-v1:TsinghuaX+90640012X+sp,K_使用者_管理科学技术
167749,C_course-v1:TsinghuaX+90640012X+sp,K_股票_管理科学技术


In [None]:
course_concept_relation = course_concept_relation[course_concept_relation['course'].isin(course_sample)]

Filter the dataframe where it contains courses only from the sample data.

In [None]:
course_concept_relation

Unnamed: 0,course,concept
0,C_course-v1:KMUSTX+1803168+2019_T1,K_活性炭_化学
1,C_course-v1:KMUSTX+1803168+2019_T1,K_内切_数学
2,C_course-v1:KMUSTX+1803168+2019_T1,K_缺陷_管理科学技术
3,C_course-v1:KMUSTX+1803168+2019_T1,K_氨基酸_化学
4,C_course-v1:KMUSTX+1803168+2019_T1,K_寡核苷酸_化学
...,...,...
167746,C_course-v1:TsinghuaX+90640012X+sp,K_奖金_管理科学技术
167747,C_course-v1:TsinghuaX+90640012X+sp,K_嗅觉_心理学
167748,C_course-v1:TsinghuaX+90640012X+sp,K_使用者_管理科学技术
167749,C_course-v1:TsinghuaX+90640012X+sp,K_股票_管理科学技术


In [None]:
len(np.unique(course_concept_relation['concept']))

24957

In [None]:
len(np.unique(course_concept_relation['course']))

646

A dataframe of course with their respective concept is created.

## Concept and Prerequisite Relation

In [None]:
concept_prerequisite_relation = pd.read_csv('/content/drive/MyDrive/MOOCCube/relations/prerequisite-dependency.json', sep='\t', header=None)
concept_prerequisite_relation

Unnamed: 0,0,1
0,K_计算机科学_计算机科学技术,K_服务器_计算机科学技术
1,K_服务数据单元_计算机科学技术,K_缓存_计算机科学技术
2,K_代数_数学,K_对偶定理_数学
3,K_乘法_数学,K_函数_数学
4,K_加法_数学,K_关联矩阵_数学
...,...,...
1022,K_参数_数学,K_微分方程_数学
1023,K_程序设计_计算机科学技术,K_计算效率_数学
1024,K_算术_数学,K_余数_数学
1025,K_计算机技术_计算机科学技术,K_地址解析_计算机科学技术


In [None]:
concept_prerequisite_relation.set_axis(['prerequisite concept', 'concept'], axis='columns', inplace=True)

  concept_prerequisite_relation.set_axis(['prerequisite concept', 'concept'], axis='columns', inplace=True)


Set the columns name.

In [None]:
concept_prerequisite_relation

Unnamed: 0,prerequisite concept,concept
0,K_计算机科学_计算机科学技术,K_服务器_计算机科学技术
1,K_服务数据单元_计算机科学技术,K_缓存_计算机科学技术
2,K_代数_数学,K_对偶定理_数学
3,K_乘法_数学,K_函数_数学
4,K_加法_数学,K_关联矩阵_数学
...,...,...
1022,K_参数_数学,K_微分方程_数学
1023,K_程序设计_计算机科学技术,K_计算效率_数学
1024,K_算术_数学,K_余数_数学
1025,K_计算机技术_计算机科学技术,K_地址解析_计算机科学技术


In [None]:
concept_prerequisite_relation = concept_prerequisite_relation[concept_prerequisite_relation['concept'].isin(course_concept_relation['concept'])]

Filter the dataframe where it contains courses only from the sample data.

In [None]:
concept_prerequisite_relation

Unnamed: 0,prerequisite concept,concept
0,K_计算机科学_计算机科学技术,K_服务器_计算机科学技术
1,K_服务数据单元_计算机科学技术,K_缓存_计算机科学技术
2,K_代数_数学,K_对偶定理_数学
3,K_乘法_数学,K_函数_数学
4,K_加法_数学,K_关联矩阵_数学
...,...,...
1022,K_参数_数学,K_微分方程_数学
1023,K_程序设计_计算机科学技术,K_计算效率_数学
1024,K_算术_数学,K_余数_数学
1025,K_计算机技术_计算机科学技术,K_地址解析_计算机科学技术


In [None]:
len(np.unique(concept_prerequisite_relation['concept']))

355

In [None]:
len(np.unique(concept_prerequisite_relation['prerequisite concept']))

245

A dataframe of course with their respective school is created. The dataframe consists only of courses from the sample data.

### Concept Gathering

In [None]:
concept = []

for i in concept_prerequisite_relation['concept']:
  concept.append(i)

for i in concept_prerequisite_relation['prerequisite concept']:
  concept.append(i)

Gather all concept from "concept" and "prerequisite concept"

In [None]:
len(concept)

2026

In [None]:
concept = np.unique(concept)
len(concept)

416

In [None]:
course_concept_relation_concept_filter = course_concept_relation[course_concept_relation['concept'].isin(concept)]

Filter the dataframe where it contains courses only from the sample data.

In [None]:
course_concept_relation_concept_filter

Unnamed: 0,course,concept
6,C_course-v1:KMUSTX+1803168+2019_T1,K_容量_数学
19,C_course-v1:KMUSTX+1803168+2019_T1,K_必要条件_数学
25,C_course-v1:KMUSTX+1803168+2019_T1,K_数量级_数学
28,C_course-v1:KMUSTX+1803168+2019_T1,K_配对_数学
129,C_course-v1:KMUSTX+1803168+2019_T1,K_计算方法_数学
...,...,...
166628,C_course-v1:JNUX+2018120406X+2018_T2,K_量化_数学
166639,C_course-v1:JNUX+2018120406X+2018_T2,K_负整数_数学
166647,C_course-v1:JNUX+2018120406X+2018_T2,K_有效性_数学
166662,C_course-v1:JNUX+2018120406X+2018_T2,K_软件_计算机科学技术


In [None]:
len(np.unique(course_concept_relation_concept_filter['concept']))

414

In [None]:
len(np.unique(course_concept_relation_concept_filter['course']))

421

In [None]:
course_with_prerequisite_concept = pd.merge(course_concept_relation_concept_filter, concept_prerequisite_relation,  how = "left", on = 'concept')

Merge two dataframes of "course and concept relation" and "concept and prerequisite concept relation".

In [None]:
course_with_prerequisite_concept

Unnamed: 0,course,concept,prerequisite concept
0,C_course-v1:KMUSTX+1803168+2019_T1,K_容量_数学,
1,C_course-v1:KMUSTX+1803168+2019_T1,K_必要条件_数学,
2,C_course-v1:KMUSTX+1803168+2019_T1,K_数量级_数学,
3,C_course-v1:KMUSTX+1803168+2019_T1,K_配对_数学,K_数学_数学
4,C_course-v1:KMUSTX+1803168+2019_T1,K_计算方法_数学,K_运算_数学
...,...,...,...
27787,C_course-v1:JNUX+2018120406X+2018_T2,K_负整数_数学,K_数学_数学
27788,C_course-v1:JNUX+2018120406X+2018_T2,K_负整数_数学,K_正数_数学
27789,C_course-v1:JNUX+2018120406X+2018_T2,K_有效性_数学,K_反例_数学
27790,C_course-v1:JNUX+2018120406X+2018_T2,K_软件_计算机科学技术,K_输入输出_计算机科学技术


In [None]:
course_with_prerequisite_concept.fillna("0", inplace=True)

Change data with "NaN" value to "0".

In [None]:
len(np.unique(course_with_prerequisite_concept['course']))

421

In [None]:
len(np.unique(course_with_prerequisite_concept['concept']))

414

In [None]:
course_with_prerequisite_concept

Unnamed: 0,course,concept,prerequisite concept
0,C_course-v1:KMUSTX+1803168+2019_T1,K_容量_数学,0
1,C_course-v1:KMUSTX+1803168+2019_T1,K_必要条件_数学,0
2,C_course-v1:KMUSTX+1803168+2019_T1,K_数量级_数学,0
3,C_course-v1:KMUSTX+1803168+2019_T1,K_配对_数学,K_数学_数学
4,C_course-v1:KMUSTX+1803168+2019_T1,K_计算方法_数学,K_运算_数学
...,...,...,...
27787,C_course-v1:JNUX+2018120406X+2018_T2,K_负整数_数学,K_数学_数学
27788,C_course-v1:JNUX+2018120406X+2018_T2,K_负整数_数学,K_正数_数学
27789,C_course-v1:JNUX+2018120406X+2018_T2,K_有效性_数学,K_反例_数学
27790,C_course-v1:JNUX+2018120406X+2018_T2,K_软件_计算机科学技术,K_输入输出_计算机科学技术


In [None]:
len(np.unique(course_with_prerequisite_concept['prerequisite concept']))

246

In [None]:
course_prerequisite = np.unique(course_with_prerequisite_concept['course'])

Remove duplicated data.

In [None]:
course_with_prerequisite_concept.fillna(0, inplace=True)

Change data with "NaN" value to "0".

## Course Sequence Dataframe Construction

In [None]:
course_columns = ['course_1', 'course_2', 'course_3', 'course_4', 'course_5', 'course_6', 'course_7', 'course_8', 'course_9', 'course_10']

Named the columns

In [None]:
data_course = pd.DataFrame(zero_pad_course_order, columns = course_columns, dtype = str)

In [None]:
data_course

Unnamed: 0,course_1,course_2,course_3,course_4,course_5,course_6,course_7,course_8,course_9,course_10
0,0,C_course-v1:TsinghuaX+00740043X_2015_T2+sp,C_course-v1:TsinghuaX+10421084X_2015_2+sp,C_course-v1:TsinghuaX+00690092X+sp,C_course-v1:TsinghuaX+30240243X+sp,C_course-v1:TsinghuaX+10430484X_2015_2+sp,C_course-v1:TsinghuaX+20430064_2X+sp,C_course-v1:TsinghuaX+10430494X_2015_2+sp,C_course-v1:TsinghuaX+20430064X+sp,C_course-v1:TsinghuaX+00740043_2x_2015_T2+sp
1,0,0,0,0,C_course-v1:TsinghuaX+10430484X_2015_2+sp,C_course-v1:TsinghuaX+20330334X_2015_2+sp,C_course-v1:TsinghuaX+10430494X_2015_2+sp,C_course-v1:TsinghuaX+20220053X_2015_T2+sp,C_course-v1:TsinghuaX+00740043X_2015_T2+sp,C_course-v1:CQU+MATH20041X+2019_T1
2,0,0,0,0,0,C_course-v1:TsinghuaX+00690212X+sp,C_course-v1:TsinghuaX+00670122X+2019_T1,C_course-v1:Tsinghua+20150001+sp,C_course-v1:UQx+Think101x+sp,C_course-v1:TsinghuaX+00670122X+sp
3,0,0,C_course-v1:TsinghuaX+10421075X_2015_2+sp,C_course-v1:TsinghuaX+20220332X+sp,C_course-v1:TsinghuaX+80240372X+2019_T1,C_course-v1:TsinghuaX+10421094X_2015_2+sp,C_course-v1:RiceX+AdvBIO_1x+sp,C_course-v1:TsinghuaX+10430484X_2015_2+sp,C_course-v1:TsinghuaX+10430494X_2015_2+sp,C_course-v1:TsinghuaX+10421084X_2015_2+sp
4,0,0,0,0,0,C_course-v1:TsinghuaX+40050444X+sp,C_course-v1:XIYOU+20180208+sp,C_course-v1:TsinghuaX+30240184+sp,C_course-v1:UQx+BIOIMG101x+sp,C_course-v1:SCUT+145033+sp
...,...,...,...,...,...,...,...,...,...,...
8956,0,0,C_course-v1:TsinghuaX+Pr20170406-SC_p1+sp,C_course-v1:TsinghuaX+20250103X+sp,C_course-v1:TsinghuaX+81020142X+sp,C_course-v1:TsinghuaX+30260112X+sp,C_course-v1:NJU+C1026+2016_T2,C_course-v1:TsinghuaX+00691153X+sp,C_course-v1:TsinghuaX+60240202X+sp,C_course-v1:TsinghuaX+80000901X_2+sp
8957,0,0,0,0,C_course-v1:TsinghuaX+30700313X+sp,C_course-v1:TsinghuaX+AP000016X+2019_T1,C_course-v1:MITx+15_390_2x+sp,C_course-v1:MITx+6_00_1x+sp,C_course-v1:TsinghuaX+00740043X_2015_T2+sp,C_course-v1:UQx+Think101x+sp
8958,0,0,0,0,C_course-v1:CUNY+20171211001+sp,C_course-v1:WageningenX+NUTR101x+sp,C_course-v1:TsinghuaX+2018032801X+2018_T1,C_course-v1:MITx+6_00_1x+sp,C_course-v1:RiceX+RELI157x+sp,C_course-v1:TsinghuaX+70000662+2019_T1
8959,0,C_course-v1:TsinghuaX+70250023X_2015_2+sp,C_course-v1:WUT+1022817X+sp,C_course-v1:MITx+15_390_2x+sp,C_course-v1:TsinghuaX+70660542X+2015_T2,C_course-v1:TsinghuaX+00690342X+sp,C_course-v1:TsinghuaX+60240202X+sp,C_course-v1:FudanX+SOSC120007+sp,C_course-v1:MITx+6_00_1x+sp,C_course-v1:TsinghuaX+THESIS2015X+2015_T1


The course data consists of 10 columns from the first course until the final course order for each student (number or row represent number of stundent).

## School Sequence Dataframe Construction

In [None]:
data_school = data_course.copy()

In [None]:
dict_lookup_course_school = dict(zip(school_course_relation['course'], school_course_relation['school']))
dict_lookup_course_school

{'C_course-v1:BNU+CSL21148501+2018_T2': 'S_BNU',
 'C_course-v1:BNU+GE310141091+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2018122603X+2018_T2': 'S_BNU',
 'C_course-v1:BNU+CSL21126882+2019_T1': 'S_BNU',
 'C_course-v1:BNU+0610073991+2019_T1': 'S_BNU',
 'C_course-v1:BNU+1010070372+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2017053101X+2019_T1': 'S_BNU',
 'C_course-v1:BNU+GE410081071-01+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2018122602X+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2018091301X+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2018091302X+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2018091303X+2019_T1': 'S_BNU',
 'C_course-v1:BNU+PHI2107404101+2019_T1': 'S_BNU',
 'C_course-v1:BNU+0610073981+2019_T1': 'S_BNU',
 'C_course-v1:BNU+0210021441+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2017071001X+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2018091305X+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2018011801X+2019_T1': 'S_BNU',
 'C_course-v1:BNU+ENV13018+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2017001+2018_T2': 'S_BNU',
 'C_course-v1:BNU+GOV21089

In [None]:
dict_lookup_course_school['0'] = '0'

In [None]:
len(dict_lookup_course_school)

660

In [None]:
dict_lookup_course_school['C_course-v1:TsinghuaX+AP000003X+2019_T1'] = 'S_TsinghuaX'

Some of the courses did not have a school name. So add a school name for the course without the school name using the majority of school names in the data.

The dictionary lookup is constructed.

In [None]:
for i in course_columns:
  data_school[i] = [dict_lookup_course_school[item] for item in data_school[i]]

Encode the course name with the school name.

In [None]:
school_columns = ['school_1', 'school_2', 'school_3', 'school_4', 'school_5', 'school_6', 'school_7', 'school_8', 'school_9', 'school_10']

In [None]:
data_school.columns = school_columns

Named the columns

In [None]:
data_school

Unnamed: 0,school_1,school_2,school_3,school_4,school_5,school_6,school_7,school_8,school_9,school_10
0,0,S_TsinghuaX,S_TsinghuaX,S_TsinghuaX,S_TsinghuaX,S_TsinghuaX,S_TsinghuaX,S_TsinghuaX,S_TsinghuaX,S_TsinghuaX
1,0,0,0,0,S_TsinghuaX,S_TsinghuaX,S_TsinghuaX,S_TsinghuaX,S_TsinghuaX,S_cqu
2,0,0,0,0,0,S_TsinghuaX,S_TsinghuaX,S_TSINGHUA,S_UQx,S_TsinghuaX
3,0,0,S_TsinghuaX,S_TsinghuaX,S_TsinghuaX,S_TsinghuaX,S_RiceX,S_TsinghuaX,S_TsinghuaX,S_TsinghuaX
4,0,0,0,0,0,S_TsinghuaX,S_XIYOU,S_TsinghuaX,S_UQx,S_SCUT
...,...,...,...,...,...,...,...,...,...,...
8956,0,0,S_TsinghuaX,S_TsinghuaX,S_TsinghuaX,S_TsinghuaX,S_NJU,S_TsinghuaX,S_TsinghuaX,S_TsinghuaX
8957,0,0,0,0,S_TsinghuaX,S_TsinghuaX,S_MITx,S_MITx,S_TsinghuaX,S_UQx
8958,0,0,0,0,S_CUNY,S_WageningenX,S_TsinghuaX,S_MITx,S_RiceX,S_TsinghuaX
8959,0,S_TsinghuaX,S_WUT,S_MITx,S_TsinghuaX,S_TsinghuaX,S_TsinghuaX,S_FUDANx,S_MITx,S_TsinghuaX


The school data consists of 10 columns from the first school until the final school order for each student (number or row represent number of stundent).

## Teacher Sequence Dataframe Construction

In [None]:
data_teacher = data_course.copy()

In [None]:
dict_lookup_course_teacher = dict(zip(teacher_course_relation['course'], teacher_course_relation['teacher']))
dict_lookup_course_teacher

{'C_course-v1:SPI+20170828001x+sp': 'T_姚常青',
 'C_course-v1:SXPI+20170828001x+2019_T1': 'T_姚常青',
 'C_course-v1:PSFFC+2018102405X+2018_T2': 'T_赵石楠',
 'C_course-v1:TsinghuaX+70150104_2X+2019_T1': 'T_连小珉',
 'C_course-v1:TsinghuaX+70150104X+2019_T1': 'T_连小珉',
 'C_course-v1:TJU+2010241X+sp': 'T_张德顺',
 'C_course-v1:TsinghuaX+20250103X+sp': 'T_任艳频',
 'C_course-v1:KMUSTX+8219011+2019_T1': 'T_王裕森',
 'C_course-v1:BIFT+2018122901X+2018_T2': 'T_丁雅琼',
 'C_course-v1:PSFFC+2018102403X+2018_T2': 'T_陶昆',
 'C_course-v1:BUCM+2018122604X+2018_T2': 'T_周芬',
 'C_course-v1:BUCM+2018122604X+2019_T1': 'T_周芬',
 'C_course-v1:TsinghuaX+20180919X+2019_T1': 'T_靳卫萍',
 'C_course-v1:SCUT+145033+sp': 'T_杨俊荣',
 'C_course-v1:BNU+2018091303X+2019_T1': 'T_孙璞',
 'C_course-v1:BNU+2018091304X+2019_T1': 'T_杨兆春',
 'C_course-v1:HIT+13SC20301820+2019_T1': 'T_孙洁',
 'C_course-v1:TsinghuaX+40050455_2X+sp': 'T_黄霞',
 'C_course-v1:nxu+2018122711+2018_T2': 'T_段玉泉',
 'C_course-v1:TsinghuaX+60610231+2016_T2_SP': 'T_王  前',
 'C_course-v1:Tsin

In [None]:
dict_lookup_course_teacher['0'] = '0'

In [None]:
len(dict_lookup_course_teacher)

652

In [None]:
list(set(list(teacher_course_relation['course'])).symmetric_difference(set(course_sample)))

['C_course-v1:HBNU+2019051509X+2019_T1',
 'C_course-v1:TsinghuaX+AP000003X+2019_T1',
 'C_course-v1:TsinghuaX+70167012X+sp',
 'C_course-v1:SYJU+030020406+2019_T1',
 'C_course-v1:Tsinghua+20150001+sp',
 'C_course-v1:WellesleyX+HIST229x+sp',
 'C_course-v1:JSUX+2017011101X+sp',
 'C_course-v1:Tsinghua+Thesis2017X+2017_T1',
 'C_course-v1:Tsinghua+20181011X+2018_T2']

Check the difference between the courses in the "course and teacher relation" dataframe and course sample. All the course above represent a course without teacher name.

In [None]:
dict_lookup_course_teacher['C_course-v1:Tsinghua+Thesis2017X+2017_T1'] = 'T_研究生院'
dict_lookup_course_teacher['C_course-v1:TsinghuaX+AP000003X+2019_T1'] = 'T_李强'
dict_lookup_course_teacher['C_course-v1:Tsinghua+20150001+sp'] = 'T_程新兵'
dict_lookup_course_teacher['C_course-v1:Tsinghua+20181011X+2018_T2'] = 'T_谢德军'
dict_lookup_course_teacher['C_course-v1:SYJU+030020406+2019_T1'] = 'T_姜桂荣'
dict_lookup_course_teacher['C_course-v1:TsinghuaX+70167012X+sp'] = 'T_朱颖心'
dict_lookup_course_teacher['C_course-v1:WellesleyX+HIST229x+sp'] = 'T_Adam Van Arsdale'
dict_lookup_course_teacher['C_course-v1:JSUX+2017011101X+sp'] = 'T_李芳宇'
dict_lookup_course_teacher['C_course-v1:HBNU+2019051509X+2019_T1'] = 'T_于丹'

Add a teacher name for the course without the teacher name using a similar teacher from the same school.

The dictionary lookup is constructed.

In [None]:
for i in course_columns:
  data_teacher[i] = [dict_lookup_course_teacher[item] for item in data_teacher[i]]

Encode the course name with the school name.

In [None]:
teacher_columns = ['teacher_1', 'teacher_2', 'teacher_3', 'teacher_4', 'teacher_5', 'teacher_6', 'teacher_7', 'teacher_8', 'teacher_9', 'teacher_10']

Named the columns

In [None]:
data_teacher.columns = teacher_columns

In [None]:
data_teacher

Unnamed: 0,teacher_1,teacher_2,teacher_3,teacher_4,teacher_5,teacher_6,teacher_7,teacher_8,teacher_9,teacher_10
0,0,T_郑莉,T_扈志明,T_陈为蓬,T_向勇,T_安宇,T_吕嵘,T_安宇,T_吕嵘,T_郑莉
1,0,0,0,0,T_安宇,T_高云峰,T_安宇,T_邢广军,T_郑莉,T_胥斌
2,0,0,0,0,0,T_彭林,T_清花道,T_程新兵,T_Matthew Thompson,T_清花道
3,0,0,T_扈志明,T_宫崎泉,T_袁博,T_马辉,T_Reid Whitaker,T_安宇,T_安宇,T_扈志明
4,0,0,0,0,0,T_吴烨,T_王曙燕,T_邓俊辉,T_Karine Mardon,T_杨俊荣
...,...,...,...,...,...,...,...,...,...,...
8956,0,0,T_曾鸣,T_任艳频,T_唐仙,T_吴华强,T_骆斌,T_孙晶,T_谢德军,T_王贵祥
8957,0,0,0,0,T_彭凯平,T_戴俊彪,T_Bill Aulet,T_John Guttag,T_郑莉,T_Matthew Thompson
8958,0,0,0,0,T_纪平,T_Sander Kersten,T_小程序慕课讲师,T_John Guttag,T_Bernard Freeman,T_龙瀛
8959,0,T_清华大学自动化系控制课组,T_李琳,T_Bill Aulet,T_王振民,T_王晓朝,T_谢德军,T_田素华,T_John Guttag,T_研究生院


The school data consists of 10 columns from the first school until the final school order for each student (number or row represent number of stundent).

## Course Concept & Course Prerequisite Concept Sequence Dataframe Construction

In [None]:
data_concept = data_course.copy()
data_prerequisite_concept = data_course.copy()

In [None]:
dict_lookup_course_concept = {}

for i in course_prerequisite:
  dict_lookup_course_concept[i] = list(course_with_prerequisite_concept[course_with_prerequisite_concept['course'] == i]['concept'])

Create dictionary lookup for course and concept.

In [None]:
course_with_prerequisite_concept[course_with_prerequisite_concept['course'] == 'C_course-v1:xuetangX+MOOC102+2019_T1']['concept']

23443      K_计算机_计算机科学技术
23444      K_服务器_计算机科学技术
23445      K_服务器_计算机科学技术
23446      K_服务器_计算机科学技术
23447      K_服务器_计算机科学技术
23448      K_服务器_计算机科学技术
23449      K_结构图_计算机科学技术
23450       K_方面_计算机科学技术
23451       K_方面_计算机科学技术
23452       K_孩子_计算机科学技术
23453       K_孩子_计算机科学技术
23454       K_孩子_计算机科学技术
23455      K_大数据_计算机科学技术
23456      K_大数据_计算机科学技术
23457       K_更新_计算机科学技术
23458       K_更新_计算机科学技术
23459    K_计算机科学_计算机科学技术
23460       K_数组_计算机科学技术
23461       K_兄弟_计算机科学技术
23462       K_兄弟_计算机科学技术
23463     K_操作系统_计算机科学技术
23464       K_退出_计算机科学技术
23465       K_森林_计算机科学技术
23466       K_森林_计算机科学技术
23467       K_软件_计算机科学技术
23468      K_互联网_计算机科学技术
23469      K_互联网_计算机科学技术
23470      K_互联网_计算机科学技术
23471      K_互联网_计算机科学技术
Name: concept, dtype: object

In [None]:
dict_lookup_course_concept['C_course-v1:xuetangX+MOOC102+2019_T1']

['K_计算机_计算机科学技术',
 'K_服务器_计算机科学技术',
 'K_服务器_计算机科学技术',
 'K_服务器_计算机科学技术',
 'K_服务器_计算机科学技术',
 'K_服务器_计算机科学技术',
 'K_结构图_计算机科学技术',
 'K_方面_计算机科学技术',
 'K_方面_计算机科学技术',
 'K_孩子_计算机科学技术',
 'K_孩子_计算机科学技术',
 'K_孩子_计算机科学技术',
 'K_大数据_计算机科学技术',
 'K_大数据_计算机科学技术',
 'K_更新_计算机科学技术',
 'K_更新_计算机科学技术',
 'K_计算机科学_计算机科学技术',
 'K_数组_计算机科学技术',
 'K_兄弟_计算机科学技术',
 'K_兄弟_计算机科学技术',
 'K_操作系统_计算机科学技术',
 'K_退出_计算机科学技术',
 'K_森林_计算机科学技术',
 'K_森林_计算机科学技术',
 'K_软件_计算机科学技术',
 'K_互联网_计算机科学技术',
 'K_互联网_计算机科学技术',
 'K_互联网_计算机科学技术',
 'K_互联网_计算机科学技术']

check whether the dictionary contains the correct data

In [None]:
dict_lookup_course_prerequisite = {}

for i in course_prerequisite:
  dict_lookup_course_prerequisite[i] = list(course_with_prerequisite_concept[course_with_prerequisite_concept['course'] == i]['prerequisite concept'])

Create dictionary lookup for course and prerequisite concept.

In [None]:
course_prerequisite_diff = list(set(list(course_with_prerequisite_concept['course'])).symmetric_difference(set(course_sample)))

In [None]:
course_prerequisite_diff

['C_course-v1:BNU+2018122601X+2019_T1',
 'C_course-v1:RiceX+Phys102x+sp',
 'C_course-v1:HUBU+2017022703X+sp',
 'C_course-v1:TsinghuaX+00680142X+2019_T1',
 'C_course-v1:LZU+20171113+SP',
 'C_course-v1:TsinghuaX+00990021X+2019_T1',
 'C_course-v1:JNU+077901mc08+2018_T3',
 'C_course-v1:SichuanU+106588020+sp',
 'C_course-v1:qhnu+20181212x+2019_T1',
 'C_course-v1:SCUEC+0171229001+sp',
 'C_course-v1:TsinghuaX+30260112X+sp',
 'C_course-v1:TsinghuaX+00612642X+2019_T1',
 'C_course-v1:TsinghuaX+AP000008X+2019_T1',
 'C_course-v1:NU+2017081001X+sp',
 'C_course-v1:BSU+2018122403X+2019_T1',
 'C_course-v1:TsinghuaX+10610193X+2019_T1',
 'C_course-v1:BNU+0610073991+2019_T1',
 'C_course-v1:CUC+119511+2019_T1',
 'C_course-v1:TsinghuaX+30640014X+2019_T1',
 'C_course-v1:TsinghuaX+34000312X+2019_T1',
 'C_course-v1:YAU+20181108X+2019_T1',
 'C_course-v1:TsinghuaX+20180131001+2019_T1',
 'C_course-v1:TsinghuaX+00510663X+2019_T1',
 'C_course-v1:TsinghuaX+10800032X+sp',
 'C_course-v1:gztrc+2018122601+2019_T1',
 'C

Check the difference between the courses in the "course and prerequisite concept relation" dataframe and course sample. All the course above represent a course without prerequisite concepts.

In [None]:
dict_lookup_course_prerequisite

{'C_course-v1:ACCA+FA1_X+2019_T1': ['K_运算_数学',
  'K_代数_数学',
  'K_一元运算_数学',
  'K_拟合_数学',
  'K_数理逻辑_数学'],
 'C_course-v1:ACCA+FA1_X+sp': ['K_运算_数学',
  'K_代数_数学',
  'K_一元运算_数学',
  'K_拟合_数学',
  'K_数理逻辑_数学'],
 'C_course-v1:ACCA+FA1_X_en+2019_T1': ['K_运算_数学',
  'K_代数_数学',
  'K_一元运算_数学',
  'K_拟合_数学',
  'K_数理逻辑_数学'],
 'C_course-v1:ACCA+FA2_X+2019_T1': ['K_多项式_数学',
  'K_相等_数学',
  'K_运算_数学',
  'K_运算_数学',
  'K_多项式_数学',
  'K_运算_数学',
  'K_代数_数学',
  'K_一元运算_数学',
  'K_拟合_数学',
  'K_等于_数学',
  'K_代数_数学',
  'K_计算_数学',
  'K_计算_数学',
  'K_运算_数学',
  'K_约束_数学',
  'K_运算_数学',
  'K_代数的_数学',
  '0'],
 'C_course-v1:ACCA+FA2_X+sp': ['K_多项式_数学',
  'K_相等_数学',
  'K_运算_数学',
  'K_运算_数学',
  'K_多项式_数学',
  'K_运算_数学',
  'K_代数_数学',
  'K_一元运算_数学',
  'K_拟合_数学',
  'K_等于_数学',
  'K_代数_数学',
  'K_计算_数学',
  'K_计算_数学',
  'K_运算_数学',
  'K_约束_数学',
  'K_运算_数学',
  'K_代数的_数学',
  '0'],
 'C_course-v1:ACCA+FA2_X_en+2019_T1': ['K_多项式_数学',
  'K_相等_数学',
  'K_运算_数学',
  'K_运算_数学',
  'K_多项式_数学',
  'K_等于_数学',
  'K_代数_数学',
  'K_计算_数学',
  'K_计算_数学',
  '

In [None]:
for i,j in dict_lookup_course_prerequisite.items():
  dict_lookup_course_prerequisite[i] = list(filter(lambda num: num != '0', j))

In [None]:
for i in course_prerequisite_diff:
  dict_lookup_course_concept[i] = ['0']
  dict_lookup_course_prerequisite[i] = ['0']

For course without concept name and prerequisite concept name, add the value zero to the concept and prerequisite concept.

In [None]:
for i in course_prerequisite_diff:
  print(dict_lookup_course_concept[i])

['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0']
['0'

In [None]:
dict_lookup_course_concept['0'] = ['0']
dict_lookup_course_prerequisite['0'] = ['0']

In [None]:
for i in course_columns:
  data_concept[i] = [dict_lookup_course_concept[item] for item in data_concept[i]]
  data_prerequisite_concept[i] = [dict_lookup_course_prerequisite[item] for item in data_prerequisite_concept[i]]

Encode the course name with the concept name and prerequisite concept name.

In [None]:
concept_columns = ['concept_1', 'concept_2', 'concept_3', 'concept_4', 'concept_5', 'concept_6', 'concept_7', 'concept_8', 'concept_9', 'concept_10']
prerequisite_columns = ['prerequisite_1', 'prerequisite_2', 'prerequisite_3', 'prerequisite_4', 'prerequisite_5', 'prerequisite_6', 'prerequisite_7', 'prerequisite_8', 'prerequisite_9', 'prerequisite_10']

In [None]:
data_concept.columns = concept_columns
data_prerequisite_concept.columns = prerequisite_columns

Named the columns

In [None]:
data_concept

Unnamed: 0,concept_1,concept_2,concept_3,concept_4,concept_5,concept_6,concept_7,concept_8,concept_9,concept_10
0,[0],"[K_任务_计算机科学技术, K_除法_数学, K_除法_数学, K_除法_数学, K_除法...","[K_指数_数学, K_指数_数学, K_指数_数学, K_闭区间_数学, K_闭区间_数学...","[K_除法_数学, K_除法_数学, K_除法_数学, K_除法_数学, K_除法_数学, ...","[K_信号处理_计算机科学技术, K_实例_计算机科学技术, K_实例_计算机科学技术, K...","[K_计算效率_数学, K_计算效率_数学, K_计算效率_数学, K_计算效率_数学, K...","[K_代数_数学, K_正整数_数学, K_正整数_数学, K_正整数_数学, K_正整数_...","[K_加法_数学, K_加法_数学, K_加法_数学, K_加法_数学, K_加法_数学, ...","[K_指数_数学, K_指数_数学, K_指数_数学, K_除法_数学, K_除法_数学, ...","[K_队头_计算机科学技术, K_队头_计算机科学技术, K_队头_计算机科学技术, K_队..."
1,[0],[0],[0],[0],"[K_计算效率_数学, K_计算效率_数学, K_计算效率_数学, K_计算效率_数学, K...","[K_指数_数学, K_指数_数学, K_指数_数学, K_线段_数学, K_公式_数学, ...","[K_加法_数学, K_加法_数学, K_加法_数学, K_加法_数学, K_加法_数学, ...","[K_公式_数学, K_公式_数学, K_公式_数学, K_公式_数学, K_列表_数学, ...","[K_任务_计算机科学技术, K_除法_数学, K_除法_数学, K_除法_数学, K_除法...","[K_数值计算_数学, K_数值计算_数学, K_数值计算_数学, K_数值计算_数学, K..."
2,[0],[0],[0],[0],[0],"[K_病态_数学, K_病态_数学, K_病态_数学, K_排列_数学, K_排列_数学, ...","[K_比特_数学, K_比特_数学, K_等于_数学, K_容量_数学, K_直线_数学, ...","[K_指数_数学, K_指数_数学, K_指数_数学, K_符号_数学, K_语句_数学, ...",[0],"[K_比特_数学, K_比特_数学, K_等于_数学, K_容量_数学, K_直线_数学, ..."
3,[0],[0],"[K_指数_数学, K_指数_数学, K_指数_数学, K_除法_数学, K_除法_数学, ...","[K_孩子_计算机科学技术, K_孩子_计算机科学技术, K_孩子_计算机科学技术, K_方...","[K_概率_数学, K_概率_数学, K_迭代循环_数学, K_迭代循环_数学, K_迭代循...","[K_结合律_数学, K_结合律_数学, K_数值计算_数学, K_数值计算_数学, K_数...",[0],"[K_计算效率_数学, K_计算效率_数学, K_计算效率_数学, K_计算效率_数学, K...","[K_加法_数学, K_加法_数学, K_加法_数学, K_加法_数学, K_加法_数学, ...","[K_指数_数学, K_指数_数学, K_指数_数学, K_闭区间_数学, K_闭区间_数学..."
4,[0],[0],[0],[0],[0],"[K_指数_数学, K_指数_数学, K_指数_数学, K_概率分布_数学, K_概率分布_...","[K_指数_数学, K_指数_数学, K_指数_数学, K_结合律_数学, K_结合律_数学...","[K_指数_数学, K_指数_数学, K_指数_数学, K_任务_计算机科学技术, K_循环...","[K_直线_数学, K_对角线_数学, K_直和_数学, K_加法_数学, K_加法_数学,...","[K_实例_计算机科学技术, K_实例_计算机科学技术, K_实例_计算机科学技术, K_自..."
...,...,...,...,...,...,...,...,...,...,...
8956,[0],[0],"[K_控制系统_计算机科学技术, K_信号处理_计算机科学技术, K_比特_数学, K_比特...","[K_指数_数学, K_指数_数学, K_指数_数学, K_译码_数学, K_译码_数学, ...","[K_信号处理_计算机科学技术, K_控制系统_计算机科学技术, K_数据通道_计算机科学技...",[0],"[K_任务_计算机科学技术, K_容量_数学, K_计算机_计算机科学技术, K_服务器_计...","[K_直和_数学, K_符号_数学, K_排列_数学, K_排列_数学, K_排列_数学, ...","[K_任务_计算机科学技术, K_有效解_数学, K_跟踪_计算机科学技术, K_计算效率_...","[K_必要条件_数学, K_对角线_数学, K_全图_数学]"
8957,[0],[0],[0],[0],[0],"[K_聚合_数学, K_小于_数学, K_分支_数学, K_状态_数学, K_纵坐标_数学,...","[K_实例_计算机科学技术, K_实例_计算机科学技术, K_实例_计算机科学技术, K_方...","[K_指数_数学, K_指数_数学, K_指数_数学, K_除法_数学, K_除法_数学, ...","[K_任务_计算机科学技术, K_除法_数学, K_除法_数学, K_除法_数学, K_除法...",[0]
8958,[0],[0],[0],[0],"[K_比特_数学, K_比特_数学, K_传输层_计算机科学技术, K_传输层_计算机科学技...","[K_指数_数学, K_指数_数学, K_指数_数学, K_容量_数学, K_实根_数学, ...","[K_实例_计算机科学技术, K_实例_计算机科学技术, K_实例_计算机科学技术, K_上...","[K_指数_数学, K_指数_数学, K_指数_数学, K_除法_数学, K_除法_数学, ...",[0],"[K_跟踪_计算机科学技术, K_服务器_计算机科学技术, K_服务器_计算机科学技术, K..."
8959,[0],"[K_指数_数学, K_指数_数学, K_指数_数学, K_除法_数学, K_除法_数学, ...","[K_计算机_计算机科学技术, K_树算法_数学, K_树算法_数学, K_乘法_数学, K...","[K_实例_计算机科学技术, K_实例_计算机科学技术, K_实例_计算机科学技术, K_方...","[K_存储空间_数学, K_划分_数学]","[K_线段_数学, K_公式_数学, K_公式_数学, K_公式_数学, K_公式_数学, ...","[K_任务_计算机科学技术, K_有效解_数学, K_跟踪_计算机科学技术, K_计算效率_...","[K_有效解_数学, K_纵坐标_数学, K_多项式_数学, K_多项式_数学, K_多项式...","[K_指数_数学, K_指数_数学, K_指数_数学, K_除法_数学, K_除法_数学, ...","[K_计算效率_数学, K_计算效率_数学, K_计算效率_数学, K_计算效率_数学, K..."


The concept data consists of 10 columns from the first concepts until the final concepts order for each student (number or row represents a number of student).

In [None]:
data_prerequisite_concept

Unnamed: 0,prerequisite_1,prerequisite_2,prerequisite_3,prerequisite_4,prerequisite_5,prerequisite_6,prerequisite_7,prerequisite_8,prerequisite_9,prerequisite_10
0,[0],"[K_指令_计算机科学技术, K_算术_数学, K_整数_数学, K_加法_数学, K_计算...","[K_除法_数学, K_计算_数学, K_立方_数学, K_实轴_数学, K_区间_数学, ...","[K_算术_数学, K_整数_数学, K_加法_数学, K_计算_数学, K_算术_数学, ...","[K_程序设计_计算机科学技术, K_计算机_计算机科学技术, K_计算机_计算机科学技术,...","[K_常数_数学, K_算法_数学, K_算法_数学, K_程序设计_计算机科学技术, K_...","[K_状态方程_数学, K_整数_数学, K_数学_数学, K_整数_数学, K_正数_数学...","[K_等于_数学, K_算术_数学, K_等于_数学, K_基本运算_数学, K_数学_数学...","[K_除法_数学, K_计算_数学, K_立方_数学, K_算术_数学, K_整数_数学, ...","[K_队列_计算机科学技术, K_队列_计算机科学技术, K_队列_计算机科学技术, K_输..."
1,[0],[0],[0],[0],"[K_常数_数学, K_算法_数学, K_算法_数学, K_程序设计_计算机科学技术, K_...","[K_除法_数学, K_计算_数学, K_立方_数学, K_直线_数学, K_等式_数学, ...","[K_等于_数学, K_算术_数学, K_等于_数学, K_基本运算_数学, K_数学_数学...","[K_等式_数学, K_运算_数学, K_逻辑运算_数学, K_拟合_数学, K_数据结构_...","[K_指令_计算机科学技术, K_算术_数学, K_整数_数学, K_加法_数学, K_计算...","[K_加法_数学, K_减法_数学, K_下界_数学, K_乘法_数学, K_运算_数学, ..."
2,[0],[0],[0],[0],[0],"[K_向量_数学, K_代数_数学, K_运算_数学, K_代数_数学, K_数学_数学, ...","[K_正数_数学, K_二进制_数学, K_多项式_数学, K_数学_数学, K_正数_数学...","[K_除法_数学, K_计算_数学, K_立方_数学, K_等于_数学, K_代数的_数学,...",[0],"[K_正数_数学, K_二进制_数学, K_多项式_数学, K_数学_数学, K_正数_数学..."
3,[0],[0],"[K_除法_数学, K_计算_数学, K_立方_数学, K_算术_数学, K_整数_数学, ...","[K_叶节点_计算机科学技术, K_叶节点_计算机科学技术, K_连通图_计算机科学技术, ...","[K_代数的_数学, K_运算_数学, K_算法_数学, K_通项_数学, K_加法_数学,...","[K_逻辑运算_数学, K_基本运算_数学, K_加法_数学, K_减法_数学, K_下界_...",[0],"[K_常数_数学, K_算法_数学, K_算法_数学, K_程序设计_计算机科学技术, K_...","[K_等于_数学, K_算术_数学, K_等于_数学, K_基本运算_数学, K_数学_数学...","[K_除法_数学, K_计算_数学, K_立方_数学, K_实轴_数学, K_区间_数学, ..."
4,[0],[0],[0],[0],[0],"[K_除法_数学, K_计算_数学, K_立方_数学, K_概率_数学, K_运算_数学, ...","[K_除法_数学, K_计算_数学, K_立方_数学, K_逻辑运算_数学, K_基本运算_...","[K_除法_数学, K_计算_数学, K_立方_数学, K_指令_计算机科学技术, K_计算...","[K_直线_数学, K_代数_数学, K_等于_数学, K_算术_数学, K_等于_数学, ...","[K_计算机_计算机科学技术, K_计算机_计算机科学技术, K_计算机科学_计算机科学技术..."
...,...,...,...,...,...,...,...,...,...,...
8956,[0],[0],"[K_程序设计_计算机科学技术, K_正数_数学, K_二进制_数学, K_计算科学_计算机...","[K_除法_数学, K_计算_数学, K_立方_数学, K_二进制_数学, K_二进制_数学...","[K_程序设计_计算机科学技术, K_正数_数学, K_二进制_数学, K_计算科学_计算机...",[0],"[K_指令_计算机科学技术, K_计算科学_计算机科学技术, K_计算机科学_计算机科学技术...","[K_代数_数学, K_代数_数学, K_数学_数学, K_枚举_数学, K_决策_数学, ...","[K_指令_计算机科学技术, K_乘法_数学, K_计算机科学_计算机科学技术, K_常数_...","[K_直线_数学, K_连通图_数学]"
8957,[0],[0],[0],[0],[0],"[K_数学_数学, K_传递函数_数学, K_决策_数学, K_代数的_数学, K_拟合_数...","[K_计算机_计算机科学技术, K_计算机_计算机科学技术, K_计算机科学_计算机科学技术...","[K_除法_数学, K_计算_数学, K_立方_数学, K_算术_数学, K_整数_数学, ...","[K_指令_计算机科学技术, K_算术_数学, K_整数_数学, K_加法_数学, K_计算...",[0]
8958,[0],[0],[0],[0],"[K_正数_数学, K_二进制_数学, K_信息交换_计算机科学技术, K_以太网_计算机科...","[K_除法_数学, K_计算_数学, K_立方_数学, K_整数_数学, K_代数_数学, ...","[K_计算机_计算机科学技术, K_计算机_计算机科学技术, K_计算机科学_计算机科学技术...","[K_除法_数学, K_计算_数学, K_立方_数学, K_算术_数学, K_整数_数学, ...",[0],"[K_计算机科学_计算机科学技术, K_计算机科学_计算机科学技术, K_网络体系结构_计算..."
8959,[0],"[K_除法_数学, K_计算_数学, K_立方_数学, K_算术_数学, K_整数_数学, ...","[K_计算科学_计算机科学技术, K_算法_数学, K_算法_数学, K_加法_数学, K_...","[K_计算机_计算机科学技术, K_计算机_计算机科学技术, K_计算机科学_计算机科学技术...",[K_图论_数学],"[K_直线_数学, K_等式_数学, K_运算_数学, K_逻辑运算_数学, K_拟合_数学...","[K_指令_计算机科学技术, K_乘法_数学, K_计算机科学_计算机科学技术, K_常数_...","[K_乘法_数学, K_整数_数学, K_数学_数学, K_常数_数学, K_运算_数学, ...","[K_除法_数学, K_计算_数学, K_立方_数学, K_算术_数学, K_整数_数学, ...","[K_常数_数学, K_算法_数学, K_算法_数学, K_程序设计_计算机科学技术, K_..."


The prerequisite concept data consists of 10 columns from the first prerequisite concepts until the final prerequisite concepts order for each student (number or row represents a number of student).

## Label Encoding

### Course Label Dictionary

In [None]:
course_name = [i for i in np.unique(course_sample)]
course_number = [i for i in range(1, len(np.unique((course_sample)))+1)]
course_encode = {course_name[i]: course_number[i] for i in range(len(course_name))}

course_encode["0"] = 0
course_encode

{'C_course-v1:ACCA+FA1_X+2019_T1': 1,
 'C_course-v1:ACCA+FA1_X+sp': 2,
 'C_course-v1:ACCA+FA1_X_en+2019_T1': 3,
 'C_course-v1:ACCA+FA2_X+2019_T1': 4,
 'C_course-v1:ACCA+FA2_X+sp': 5,
 'C_course-v1:ACCA+FA2_X_en+2019_T1': 6,
 'C_course-v1:ACCA+MA1_X+2019_T1': 7,
 'C_course-v1:ACCA+MA1_X+sp': 8,
 'C_course-v1:ACCA+MA1_X_en+2019_T1': 9,
 'C_course-v1:ACCA+MA2_X+2019_T1': 10,
 'C_course-v1:ACCA+MA2_X+sp': 11,
 'C_course-v1:ACCA+MA2_X_en+2019_T1': 12,
 'C_course-v1:AdelaideX+Wine101x+sp': 13,
 'C_course-v1:AdelaideX+humbio101+sp': 14,
 'C_course-v1:BFU+15002360+2019_T1': 15,
 'C_course-v1:BFU+15012510+2019_T1': 16,
 'C_course-v1:BFU+15023710+2019_T1': 17,
 'C_course-v1:BFU+2018122709+2018_T2': 18,
 'C_course-v1:BFU+2018122709+2019_T1': 19,
 'C_course-v1:BFU+2018122710+2019_T1': 20,
 'C_course-v1:BIFT+1301990078+2019_T1': 21,
 'C_course-v1:BIFT+2018122901X+2018_T2': 22,
 'C_course-v1:BIFT+2018122902X+2018_T2': 23,
 'C_course-v1:BIT+100070018+2019_T1': 24,
 'C_course-v1:BIT+100070018+2019_T2'

### School Label Dictionary

In [None]:
school_name = [i for i in np.unique(list(school_course_relation['school']))]
school_number = [i for i in range(1, len(np.unique(list(school_course_relation['school'])))+1)]
school_encode = {school_name[i]: school_number[i] for i in range(len(school_name))}

school_encode["0"] = 0
school_encode

{'S_ACCA': 1,
 'S_AdelaideX': 2,
 'S_BFU': 3,
 'S_BIFT': 4,
 'S_BIT': 5,
 'S_BJTU': 6,
 'S_BJUT': 7,
 'S_BNU': 8,
 'S_BSU': 9,
 'S_BTBU': 10,
 'S_BUCM': 11,
 'S_BerkeleyX': 12,
 'S_BurgundyX': 13,
 'S_CCI': 14,
 'S_CHD': 15,
 'S_CIE': 16,
 'S_CMU': 17,
 'S_CSU': 18,
 'S_CUC': 19,
 'S_CUNY': 20,
 'S_CUP': 21,
 'S_CUPB': 22,
 'S_CWU': 23,
 'S_CityU': 24,
 'S_DUT': 25,
 'S_DYU': 26,
 'S_DelftX': 27,
 'S_EST': 28,
 'S_FAFU': 29,
 'S_FJNU': 30,
 'S_FJTCM': 31,
 'S_FUDANx': 32,
 'S_FZXY': 33,
 'S_GDUT': 34,
 'S_GIT': 35,
 'S_GXUST': 36,
 'S_GZHU': 37,
 'S_GZLIS': 38,
 'S_GZUCM': 39,
 'S_HAUT': 40,
 'S_HBNU': 41,
 'S_HBPU': 42,
 'S_HEBNU': 43,
 'S_HEBUT': 44,
 'S_HIT': 45,
 'S_HLJUX': 46,
 'S_HNU': 47,
 'S_HQU': 48,
 'S_HRBEU': 49,
 'S_HUBU': 50,
 'S_HUST': 51,
 'S_ICx': 52,
 'S_IMUN': 53,
 'S_JLU': 54,
 'S_JNU': 55,
 'S_JNUX': 56,
 'S_JSUX': 57,
 'S_JXUST': 58,
 'S_JinanX': 59,
 'S_KMUSTX': 60,
 'S_LUIBE': 61,
 'S_LZU': 62,
 'S_LinuxFoundationX': 63,
 'S_MIL': 64,
 'S_MIT_SLT_Lab': 65,
 'S_M

### Teacher Label Dictionary

In [None]:
teacher_unique = np.unique(list(dict_lookup_course_teacher.values()))
teacher_name = [i for i in teacher_unique if i != "0"]
teacher_number = [i for i in range(1, len(teacher_name)+1)]
teacher_encode = {teacher_name[i]: teacher_number[i] for i in range(len(teacher_name))}

teacher_encode["0"] = 0
teacher_encode

{'T_Adam Van Arsdale': 1,
 'T_Alma Moon Novotny': 2,
 'T_Armando Fox': 3,
 'T_Arno Smets': 4,
 'T_Barry Walsh': 5,
 'T_Bernard Freeman': 6,
 'T_Bill Aulet': 7,
 'T_Dr. Ariel Fenster': 8,
 'T_Gerald Sussman': 9,
 'T_Howard Takiff': 10,
 'T_J. Kim Vandiver': 11,
 'T_James Donald': 12,
 'T_Jason H. Hafner': 13,
 'T_Jayne Howson': 14,
 'T_Jeffrey D. Sachs': 15,
 'T_John Guttag': 16,
 'T_John Stix': 17,
 'T_Josie Adams': 18,
 'T_Karine Mardon': 19,
 'T_Kelvy Bird': 20,
 'T_Kevin Hsu': 21,
 'T_Kirsten Ostherr': 22,
 'T_Laura Zuheros': 23,
 'T_Lawrence Susskind': 24,
 'T_Maggie Sokolik': 25,
 'T_Marielle ADRIAN': 26,
 'T_Mario Ricci': 27,
 'T_Mark Horswill': 28,
 'T_Matthew Thompson': 29,
 'T_Paul Grbin': 30,
 'T_Pete Mumby': 31,
 'T_Ravi Ramamoorthi': 32,
 'T_Reid Whitaker': 33,
 'T_Sander Kersten': 34,
 'T_Tara Askham': 35,
 'T_Valli Rajagopal': 36,
 'T_丁雅琼': 37,
 'T_乔林': 38,
 'T_于丹': 39,
 'T_于汐': 40,
 'T_于泽': 41,
 'T_于玉贞': 42,
 'T_井福荣': 43,
 'T_任园春': 44,
 'T_任梦一': 45,
 'T_任艳频': 46,
 'T_伍晓明

### Labelling The Data

In [None]:
def encode_decode_data(data_name, column_name, dict_name):
  for i in column_name:
      data_name[i] = [dict_name[item] for item in data_name[i]]

  return data_name

In [None]:
data_course = encode_decode_data(data_course, course_columns, course_encode)
data_teacher = encode_decode_data(data_teacher, teacher_columns, teacher_encode)
data_school = encode_decode_data(data_school, school_columns, school_encode)

In [None]:
data_course

Unnamed: 0,course_1,course_2,course_3,course_4,course_5,course_6,course_7,course_8,course_9,course_10
0,0,348,366,331,437,370,420,371,419,351
1,0,0,0,0,370,415,371,405,348,78
2,0,0,0,0,0,332,326,309,586,327
3,0,0,364,408,523,368,259,370,371,366
4,0,0,0,0,0,463,604,434,582,275
...,...,...,...,...,...,...,...,...,...,...
8956,0,0,556,413,535,441,241,344,484,517
8957,0,0,0,0,449,548,209,215,348,586
8958,0,0,0,0,95,599,399,215,270,496
8959,0,504,595,209,507,340,484,119,215,558


In [None]:
data_teacher

Unnamed: 0,teacher_1,teacher_2,teacher_3,teacher_4,teacher_5,teacher_6,teacher_7,teacher_8,teacher_9,teacher_10
0,0,402,190,419,97,139,98,139,98,402
1,0,0,0,0,139,456,139,395,402,352
2,0,0,0,0,0,175,275,330,29,275
3,0,0,190,143,365,453,33,139,139,190
4,0,0,0,0,0,109,299,392,19,245
...,...,...,...,...,...,...,...,...,...,...
8956,0,0,198,46,117,102,455,129,373,310
8957,0,0,0,0,172,188,7,16,402,29
8958,0,0,0,0,337,34,144,16,6,476
8959,0,274,232,7,296,298,373,323,16,328


In [None]:
data_school

Unnamed: 0,school_1,school_2,school_3,school_4,school_5,school_6,school_7,school_8,school_9,school_10
0,0,103,103,103,103,103,103,103,103,103
1,0,0,0,0,103,103,103,103,103,130
2,0,0,0,0,0,103,103,101,107,103
3,0,0,103,103,103,103,86,103,103,103
4,0,0,0,0,0,103,115,103,107,88
...,...,...,...,...,...,...,...,...,...,...
8956,0,0,103,103,103,103,76,103,103,103
8957,0,0,0,0,103,103,66,66,103,107
8958,0,0,0,0,20,112,103,66,86,103
8959,0,103,111,66,103,103,103,32,66,103


Encode the data using label encoding technique using dictionary

### Inverse dictonary for decoding

In [None]:
course_decode = {y: x for x, y in course_encode.items()}
teacher_decode = {y: x for x, y in teacher_encode.items()}
school_decode = {y: x for x, y in school_encode.items()}

### Main data For Modelling:

In [None]:
data_course

Unnamed: 0,course_1,course_2,course_3,course_4,course_5,course_6,course_7,course_8,course_9,course_10
0,0,348,366,331,437,370,420,371,419,351
1,0,0,0,0,370,415,371,405,348,78
2,0,0,0,0,0,332,326,309,586,327
3,0,0,364,408,523,368,259,370,371,366
4,0,0,0,0,0,463,604,434,582,275
...,...,...,...,...,...,...,...,...,...,...
8956,0,0,556,413,535,441,241,344,484,517
8957,0,0,0,0,449,548,209,215,348,586
8958,0,0,0,0,95,599,399,215,270,496
8959,0,504,595,209,507,340,484,119,215,558


## Data Splitting

In [None]:
X = data_course.drop('course_10', axis=1)
Y = data_course['course_10']

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state = 42)

In [None]:
rows_train = list(X_train.index)
rows_test = list(X_test.index)

# Data Understanding and Processing - Modelling

## Data Gathering

In [None]:
user_int_data = pd.read_json('/content/drive/MyDrive/MOOCCube/additional_information/implicit_feedback.txt', lines=True)
user_int_data

Unnamed: 0,id,activity
0,U_8126464,[{'course_id': 'C_course-v1:TsinghuaX+00740123...
1,U_8650752,[{'course_id': 'C_course-v1:TsinghuaX+00740043...
2,U_131074,[{'course_id': 'C_course-v1:TsinghuaX+30240184...
3,U_262145,[{'course_id': 'C_course-v1:BIT+BIT2016001+sp'...
4,U_1441801,[{'course_id': 'C_course-v1:TsinghuaX+20150001...
...,...,...
4995,U_2635246,[{'course_id': 'C_course-v1:KMUSTX+8209001+sp'...
4996,U_8009200,[{'course_id': 'C_course-v1:TsinghuaX+30240184...
4997,U_8795634,[{'course_id': 'C_course-v1:TsinghuaX+20240103...
4998,U_9188851,[{'course_id': 'C_course-v1:TsinghuaX+30240184...


Extract the user activity data from implicit feedback.

In [None]:
user_int_data['id']

0        U_8126464
1        U_8650752
2         U_131074
3         U_262145
4        U_1441801
           ...    
4995     U_2635246
4996     U_8009200
4997     U_8795634
4998     U_9188851
4999    U_10630648
Name: id, Length: 5000, dtype: object

## Getting the User Interaction

In [None]:
user_item_int_list = []

for i in range(user_int_data.shape[0]):
  for j in range(len(user_int_data['activity'][i])):
    user_item_int_list.append([user_int_data['id'][i],user_int_data['activity'][i][j]['course_id'],user_int_data['activity'][i][j]['watching_count']])

In [None]:
user_item_int_list

[['U_8126464', 'C_course-v1:TsinghuaX+00740123_X+sp', 3],
 ['U_8126464', 'C_course-v1:TsinghuaX+00740123X+2019_T1', 3],
 ['U_8126464', 'C_course-v1:TsinghuaX+00740123_X+sp', 5],
 ['U_8126464', 'C_course-v1:TsinghuaX+00740123X+2019_T1', 5],
 ['U_8126464', 'C_course-v1:TsinghuaX+00740123_X+sp', 5],
 ['U_8126464', 'C_course-v1:TsinghuaX+00740123X+2019_T1', 5],
 ['U_8126464', 'C_course-v1:TsinghuaX+00740123_X+sp', 10],
 ['U_8126464', 'C_course-v1:TsinghuaX+00740123X+2019_T1', 10],
 ['U_8126464', 'C_course-v1:TsinghuaX+00740123_X+sp', 7],
 ['U_8126464', 'C_course-v1:TsinghuaX+00740123X+2019_T1', 7],
 ['U_8126464', 'C_course-v1:TsinghuaX+00740123_X+sp', 3],
 ['U_8126464', 'C_course-v1:TsinghuaX+00740123X+2019_T1', 3],
 ['U_8126464', 'C_course-v1:TsinghuaX+00740123_X+sp', 1],
 ['U_8126464', 'C_course-v1:TsinghuaX+00740123X+2019_T1', 1],
 ['U_8126464', 'C_course-v1:TsinghuaX+00740123_X+sp', 4],
 ['U_8126464', 'C_course-v1:TsinghuaX+00740123X+2019_T1', 4],
 ['U_8126464', 'C_course-v1:TsinghuaX+

In [None]:
user_item_data = pd.DataFrame(user_item_int_list, columns =['user_id', 'course_id', 'interaction'])
user_item_data

Unnamed: 0,user_id,course_id,interaction
0,U_8126464,C_course-v1:TsinghuaX+00740123_X+sp,3
1,U_8126464,C_course-v1:TsinghuaX+00740123X+2019_T1,3
2,U_8126464,C_course-v1:TsinghuaX+00740123_X+sp,5
3,U_8126464,C_course-v1:TsinghuaX+00740123X+2019_T1,5
4,U_8126464,C_course-v1:TsinghuaX+00740123_X+sp,5
...,...,...,...
482220,U_10630648,C_course-v1:TsinghuaX+00740043_1x+2019_T1,1
482221,U_10630648,C_course-v1:TsinghuaX+00740043X_2015_T2+sp,1
482222,U_10630648,C_course-v1:TsinghuaX+00740043_1x+2019_T1,1
482223,U_10630648,C_course-v1:TsinghuaX+00740043X_2015_T2+sp,1


In [None]:
user_item_data = user_item_data.drop_duplicates(subset=["user_id", "course_id"], keep='first')

drop duplicated data

## Synchronize User in Interaction Data with Sample Data

In [None]:
id_list = list(set(list(user_data_sample['id'])) & set(list(user_item_data['user_id'])))

In [None]:
len(id_list)

868

In [None]:
cf_user = user_data_sample.loc[user_data_sample['id'].isin(id_list)]

In [None]:
cf_user

Unnamed: 0,id,name,course_order,enroll_time,number_of_course
165100,U_9188851,钦鸿风,"[C_course-v1:TsinghuaX+00740043_1x+2019_T1, C_...","[2019-03-16 17:54:59, 2018-06-18 00:34:58, 201...",5
141543,U_9315706,邸波,"[C_course-v1:TsinghuaX+10430484X_2015_2+sp, C_...","[2018-11-18 09:12:40, 2018-10-16 16:01:25, 201...",5
71014,U_8526833,计浩波,"[C_course-v1:TsinghuaX+2018032801X+2018_T1, C_...","[2018-06-19 00:13:39, 2018-02-25 22:26:55, 201...",5
134277,U_5514454,萧孤,"[C_course-v1:TsinghuaX+30240243X+sp, C_course-...","[2016-11-11 22:18:16, 2016-10-29 17:31:59, 201...",9
30099,U_9570663,捷凌萱,"[C_course-v1:TsinghuaX+00740123_X+sp, C_course...","[2018-09-16 15:38:29, 2018-10-25 12:31:59, 201...",5
...,...,...,...,...,...
168770,U_4589254,成靖儿,"[C_course-v1:TsinghuaX+80000901X_2+sp, C_cours...","[2018-08-01 14:24:50, 2017-08-04 18:21:03, 201...",9
26056,U_7473936,覃成礼,"[C_course-v1:TsinghuaX+00740043X_2015_T2+sp, C...","[2017-08-26 10:07:48, 2017-11-27 20:11:56, 201...",7
183749,U_10231978,笃永望,"[C_course-v1:TsinghuaX+00740113_1X+sp, C_cours...","[2019-02-25 10:47:26, 2018-12-22 19:46:08, 201...",5
179224,U_8400965,友寻真,"[C_course-v1:TsinghuaX+10610183_2X+sp, C_cours...","[2018-01-18 00:08:28, 2019-04-08 08:34:29, 201...",6


In [None]:
cf_user_ind = cf_user.index

In [None]:
cf_user_ind

Int64Index([165100, 141543,  71014, 134277,  30099, 186394,  86765,  45615,
            166966,  70096,
            ...
             14178,   8791, 198676, 140263,  35186, 168770,  26056, 183749,
            179224, 118769],
           dtype='int64', length=868)

In [None]:
rm_final_item_cf = []

for i in range(cf_user.shape[0]):
  rm_final_item_cf.append([cf_user.iloc[i][0],cf_user.iloc[i][2][-1]])

In [None]:
user_item_index = []
for i in range(len(rm_final_item_cf)):
  index_rm = user_item_data[(user_item_data['user_id'] == rm_final_item_cf[i][0]) & (user_item_data['course_id'] == rm_final_item_cf[i][1])].index
  user_item_index.append(index_rm)

In [None]:
new_index = [ele for ele in user_item_index if ele != "Int64Index([], dtype='int64')"]

  new_index = [ele for ele in user_item_index if ele != "Int64Index([], dtype='int64')"]


In [None]:
new_index_tr = []
for i in range(len(new_index)):
   new_index_tr.append(new_index[i][0])

In [None]:
new_index_tr

[482025,
 338177,
 332567,
 41377,
 358038,
 131401,
 342648,
 31484,
 405356,
 366807,
 53433,
 27108,
 186953,
 309304,
 159960,
 7562,
 112895,
 6628,
 351208,
 417264,
 205001,
 186983,
 19923,
 116131,
 148867,
 88269,
 193968,
 339752,
 403525,
 204932,
 2849,
 333411,
 394934,
 209835,
 184704,
 366300,
 198716,
 183627,
 258093,
 135728,
 229454,
 119385,
 462762,
 344205,
 387649,
 443232,
 166316,
 257551,
 74158,
 79012,
 264828,
 427622,
 444321,
 115434,
 380420,
 17100,
 343478,
 131125,
 112704,
 246613,
 50662,
 375212,
 158668,
 75481,
 66824,
 107420,
 477491,
 96377,
 121642,
 449611,
 299057,
 463627,
 71612,
 436036,
 141862,
 58709,
 351434,
 222041,
 357159,
 214767,
 305709,
 352589,
 197202,
 449871,
 343449,
 463783,
 177456,
 473994,
 43121,
 233329,
 438114,
 273098,
 338620,
 379484,
 468803,
 465663,
 302308,
 4560,
 215005,
 149249,
 200143,
 99591,
 166963,
 36953,
 334221,
 88333,
 267386,
 96406,
 246493,
 374489,
 14880,
 42811,
 299035,
 368431,
 351

In [None]:
user_item_data.drop(index = new_index_tr, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_data.drop(index = new_index_tr, inplace = True)


In [None]:
user_item_data

Unnamed: 0,user_id,course_id,interaction
0,U_8126464,C_course-v1:TsinghuaX+00740123_X+sp,3
1,U_8126464,C_course-v1:TsinghuaX+00740123X+2019_T1,3
18,U_8126464,C_course-v1:UST+UST001+sp,1
19,U_8126464,C_course-v1:TsinghuaX+00740043X_2015_T2+sp,2
20,U_8126464,C_course-v1:TsinghuaX+30240184+sp,5
...,...,...,...
482026,U_9188851,C_course-v1:TsinghuaX+00740043_1x+2019_T1,1
482035,U_10630648,C_course-v1:TsinghuaX+00740043X_2015_T2+sp,1
482036,U_10630648,C_course-v1:TsinghuaX+00740043_1x+2019_T1,1
482135,U_10630648,C_course-v1:TsinghuaX+00740043_2x+2019_T1,2


## Add More Courses Into Dictionaries

In [None]:
course_info_data = pd.read_json('/content/drive/MyDrive/MOOCCube/entities/course.json', lines=True)
course_info_data

Unnamed: 0,id,name,prerequisites,about,core_id,video_order,display_name,chapter
0,C_course-v1:McGillX+ATOC185x+2015_T1,自然灾害（自主模式）,无,<p>地球上没有一处地方不发生自然灾害。当我们以科学的眼光看待这些自然灾害的原因和本质时，我...,C_course-v1:McGillX+ATOC185x+2015_T1,"[V_f6f710068b994452885b90e11b6ee5c5, V_7339568...","[Video: Overview 1, Video: Overview 2, Video: ...","[01.02.01.02, 01.02.03.02, 01.02.05.02, 01.02...."
1,C_course-v1:TsinghuaX+THESIS2015X+2015_T1,2015年清华大学研究生学位论文答辩（二）,无先修要求,<p>学位论文答辩环节是研究生培养的重要环节，为了充分发挥该环节的育人作用，搭建学术交流的平...,C_course-v1:TsinghuaX+THESIS2015X+2015_T1,"[V_de0371575a9f4b5391c89ad16d68b5c2, V_d632034...","[答辩陈述, 答辩陈述, 问答及答辩结果, 答辩陈述, 问答及答辩结果, 答辩陈述, 问答及...","[01.01.03.01, 01.02.03.01, 01.02.04.01, 01.03...."
2,C_course-v1:TsinghuaX+THESIS2014_1X_tv+_2014_,2014年清华大学研究生学位论文答辩（一）,无先修要求,<p>学位论文答辩环节是研究生培养的重要环节，为了充分发挥该环节的育人作用，搭建学术交流的平...,C_course-v1:TsinghuaX+THESIS2014_1X_tv+_2014_,"[V_d530be9cc0584317a16706684577a6dd, V_f329a62...","[论文答辩实况, 问答及答辩结果, 导师评价, 同学眼中的王鑫, 个人学术感言, 吴宇恩答辩...","[01.01.03.01, 01.01.04.01, 01.01.05.01, 01.01...."
3,C_course-v1:TsinghuaX+THESIS2015X_tv+2015_T1,2015年清华大学研究生学位论文答辩（一）,无先修要求,<p>学位论文答辩环节是研究生培养的重要环节，为了充分发挥该环节的育人作用，搭建学术交流的平...,C_course-v1:TsinghuaX+THESIS2015X+2015_T1,"[V_de0371575a9f4b5391c89ad16d68b5c2, V_78a8b41...","[答辩陈述, 问答及答辩结果, 答辩陈述, 问答及答辩结果, 答辩陈述, 问答及答辩结果, ...","[01.01.03.01, 01.01.04.01, 01.02.03.01, 01.02...."
4,C_course-v1:TsinghuaX+00690242+sp,文物精品与文化中国（自主模式）,无,<p>中国考古学是以往100年中发展最为迅速的领域之一，大批珍贵文物的出土，不断刷新人们对文...,C_course-v1:TsinghuaX+00690242+sp,"[V_d7dbd0fe8f504e7a91d863cd5a19b185, V_4492eca...","[文献所见原始渡河工具, 《禹贡》所见的水路交通, 绰墩山出土的渡河浮木, 舟船考古发掘, ...","[01.01.01.01, 01.01.02.01, 01.02.01.01, 01.03...."
...,...,...,...,...,...,...,...,...
701,C_course-v1:TW+2018052501X+2019_T3,STEM课程设计与案例分析（2019秋）,无,<p>\r\n\t《STEM课程设计与案例分析》课程，它是：<br />\r\n1.国内领先...,C_course-v1:TW+2018052501X+2019_T3,"[V_8f28c813854c444f98950e5404301f0b, V_f23a8ad...","[何为 STEM 教育, STEM 教育的起源与发展, 世界各国的 STEM 教育政策, 为...","[01.01.01.02, 01.02.01.01, 01.03.01.01, 01.04...."
702,C_course-v1:TsinghuaX+60700052X+2019_T2,数据科学导论（2019夏）,本课程适合各类学科背景学生学习,本课程作为数据科学的先导课和认知类课程，致力于以形象生动的教学模式为学生普及数据挖掘、大数据...,C_course-v1:TsinghuaX+60700052X+2019_T1,"[V_2aa0936927744c3b9f1dcc8b5bc30fba, V_2def333...","[Video, Video, Video, Video, Video, Video, Vid...","[01.01.01.01, 01.02.01.01, 01.03.01.01, 01.04...."
703,C_course-v1:TsinghuaX+80515182X+2019_sp,麦肯锡“全球领导力”自主模式,Basic knowledge of business administration,<p>\r\n\t脱胎于清华经管学院与麦肯锡公司联合开设的线下金牌课程，麦肯锡全球领导力在线...,C_course-v1:TsinghuaX+80515182X+2019_sp,"[V_ab97fe9d9a744958b886aa74d44146f8, V_8973856...","[Video, Video, Video, Video, Video, Video, Vid...","[01.01.01.01, 01.02.01.01, 01.03.01.01, 02.01...."
704,C_course-v1:NEU+2019012201X+2019_T2,物理化学（上）（2019夏）,高等数学,物理化学（上）是一门运用物理学手段和技术研究化学问题的学科，被誉为化学学科的“大脑”。本课程...,C_course-v1:NEU+2019012201X+2019_T2,"[V_6595d9e0894848348287b8abaa30dce6, V_4dc4a13...","[Video, Video, Video, Video, Video, Video, Vid...","[01.01.01.01, 02.01.01.01, 02.02.01.01, 02.03...."


In [None]:
course_list_diff = list(set(list(course_info_data['id'])).symmetric_difference(set(list(course_encode.keys()))))
course_list_diff

['C_course-v1:CIE+CIE2017008+2019_T1',
 'C_course-v1:KMUSTX+1803168+2018_T2',
 'C_course-v1:SDSNAssociation+ECD001+sp',
 'C_course-v1:PSFFC+2018102404X+2018_T2',
 'C_course-v1:HNU+20180424001+2018_T1',
 'C_course-v1:TsinghuaX+80511503X+2019_T1',
 'C_course-v1:HUBU+HU08001X+2019_T1',
 'C_course-v1:XYSFXY+20181024X+2019_T1',
 'C_course-v1:nxu+2018122713+2019_T1',
 'C_course-v1:NEU+2018051501+sp',
 'C_course-v1:SDSNAssociation+TOW001+sp',
 'C_course-v1:ZZU+20180116001+2019_T1',
 'C_course-v1:SDSNAssociation+PB+sp',
 'C_course-v1:BSU+2018122404X+2019_T1',
 'C_course-v1:dlmu+20180906+2019_T1',
 'C_course-v1:HIT+HIT2016001+2019_T1',
 'C_course-v1:FAFU+55071003+2019_T1',
 'C_course-v1:FZXY+20180301001+2019_T1',
 'C_course-v1:SDUx+00931800X+sp',
 'C_course-v1:CIE+CIE2016004+2019_T1',
 'C_course-v1:TsinghuaX+80240372X+sp',
 'C_course-v1:JNU+07009215+2019_T1',
 'C_course-v1:CIE+CIE2016002+2019_T1',
 'C_course-v1:XJTU+2018122507X+2018_T2',
 'C_course-v1:CSMZXY+2018111301X+2019_T2',
 'C_course-v1:

In [None]:
course_encode

{'C_course-v1:ACCA+FA1_X+2019_T1': 1,
 'C_course-v1:ACCA+FA1_X+sp': 2,
 'C_course-v1:ACCA+FA1_X_en+2019_T1': 3,
 'C_course-v1:ACCA+FA2_X+2019_T1': 4,
 'C_course-v1:ACCA+FA2_X+sp': 5,
 'C_course-v1:ACCA+FA2_X_en+2019_T1': 6,
 'C_course-v1:ACCA+MA1_X+2019_T1': 7,
 'C_course-v1:ACCA+MA1_X+sp': 8,
 'C_course-v1:ACCA+MA1_X_en+2019_T1': 9,
 'C_course-v1:ACCA+MA2_X+2019_T1': 10,
 'C_course-v1:ACCA+MA2_X+sp': 11,
 'C_course-v1:ACCA+MA2_X_en+2019_T1': 12,
 'C_course-v1:AdelaideX+Wine101x+sp': 13,
 'C_course-v1:AdelaideX+humbio101+sp': 14,
 'C_course-v1:BFU+15002360+2019_T1': 15,
 'C_course-v1:BFU+15012510+2019_T1': 16,
 'C_course-v1:BFU+15023710+2019_T1': 17,
 'C_course-v1:BFU+2018122709+2018_T2': 18,
 'C_course-v1:BFU+2018122709+2019_T1': 19,
 'C_course-v1:BFU+2018122710+2019_T1': 20,
 'C_course-v1:BIFT+1301990078+2019_T1': 21,
 'C_course-v1:BIFT+2018122901X+2018_T2': 22,
 'C_course-v1:BIFT+2018122902X+2018_T2': 23,
 'C_course-v1:BIT+100070018+2019_T1': 24,
 'C_course-v1:BIT+100070018+2019_T2'

In [None]:
course_encode['C_course-v1:nxu+2018122713+2019_T1'] = 661
course_encode['C_course-v1:SDSNAssociation+ECD001+sp'] = 662
course_encode['C_course-v1:SWPU+3615001035+2019_T1'] = 663
course_encode['C_course-v1:LUIBE+201808064+2019_T1'] = 664
course_encode['C_course-v1:nxu+2018122711+2019_T1'] = 665
course_encode['C_course-v1:CIE+JD_2017+2019_T1'] = 666
course_encode['C_course-v1:dlmu+20180906+2019_T1'] = 667
course_encode['C_course-v1:BNU+2018122405X+2019_T1'] = 668
course_encode['C_course-v1:CIE+CIE2016005+2019_T1'] = 669
course_encode['C_course-v1:HIT+HIT2016001+2019_T1'] = 670
course_encode['C_course-v1:SDSNAssociation+PB+sp'] = 671
course_encode['C_course-v1:CIE+CIE2017008+2019_T1'] = 672
course_encode['C_course-v1:FZXY+20180301001+2019_T1'] = 673
course_encode['C_course-v1:CIE+CIE2017005+2019_T1'] = 674
course_encode['C_course-v1:TsinghuaX+80511503X+2019_T1'] = 675
course_encode['C_course-v1:CIE+CIE2017001+2019_T1'] = 676
course_encode['C_course-v1:SDSNAssociation+SC001+sp'] = 677
course_encode['C_course-v1:TJUFE+2018122506X+2019_T1'] = 678
course_encode['C_course-v1:TsinghuaX+01510192X+2019_T1'] = 679
course_encode['C_course-v1:XYSFXY+20181024X+2019_T1'] = 680
course_encode['C_course-v1:HNU+20180424001+2018_T1'] = 681
course_encode['C_course-v1:CSMZXY+2018111301X+2019_T2'] = 682
course_encode['C_course-v1:TsinghuaX+80240372X+sp'] = 683
course_encode['C_course-v1:JNU+11020009+2019_T1'] = 684
course_encode['C_course-v1:BSU+2018122405X+2019_T1'] = 685
course_encode['C_course-v1:SEU+00690803_2+2019_T1'] = 686
course_encode['C_course-v1:PSFFC+2018102404X+2018_T2'] = 687
course_encode['C_course-v1:NEU+2018051501+sp'] = 688
course_encode['C_course-v1:CSU+2019043001X+2019_T1'] = 689
course_encode['C_course-v1:SDSNAssociation+TOW001+sp'] = 690
course_encode['C_course-v1:CIE+CIE2016004+2019_T1'] = 691
course_encode['C_course-v1:SXPI+20171101002+sp'] = 692
course_encode['C_course-v1:KMUSTX+1803168+2018_T2'] = 693
course_encode['C_course-v1:SDUx+00931800X+sp'] = 694
course_encode['C_course-v1:BSU+2018122404X+2019_T1'] = 695
course_encode['C_course-v1:BNU+CHE21128011+2019_T1'] = 696
course_encode['C_course-v1:ZZU+20180116001+2019_T1'] = 697
course_encode['C_course-v1:SDSNAssociation+CSN+sp'] = 698
course_encode['C_course-v1:TsinghuaX+70340063X+2019_T1'] = 699
course_encode['C_course-v1:HEBUT+2019040202X+2019_T1'] = 700
course_encode['C_course-v1:FAFU+55071003+2019_T1'] = 701
course_encode['C_course-v1:XJTU+2018122507X+2018_T2'] = 702
course_encode['C_course-v1:JNU+07009215+2019_T1'] = 703
course_encode['C_course-v1:CIE+CIE2016002+2019_T1'] = 704
course_encode['C_course-v1:BNU+2017112001X+2019_T1'] = 705
course_encode['C_course-v1:HUBU+HU08001X+2019_T1'] = 706

In [None]:
dict_lookup_course_school

{'C_course-v1:BNU+CSL21148501+2018_T2': 'S_BNU',
 'C_course-v1:BNU+GE310141091+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2018122603X+2018_T2': 'S_BNU',
 'C_course-v1:BNU+CSL21126882+2019_T1': 'S_BNU',
 'C_course-v1:BNU+0610073991+2019_T1': 'S_BNU',
 'C_course-v1:BNU+1010070372+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2017053101X+2019_T1': 'S_BNU',
 'C_course-v1:BNU+GE410081071-01+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2018122602X+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2018091301X+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2018091302X+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2018091303X+2019_T1': 'S_BNU',
 'C_course-v1:BNU+PHI2107404101+2019_T1': 'S_BNU',
 'C_course-v1:BNU+0610073981+2019_T1': 'S_BNU',
 'C_course-v1:BNU+0210021441+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2017071001X+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2018091305X+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2018011801X+2019_T1': 'S_BNU',
 'C_course-v1:BNU+ENV13018+2019_T1': 'S_BNU',
 'C_course-v1:BNU+2017001+2018_T2': 'S_BNU',
 'C_course-v1:BNU+GOV21089

In [None]:
course_list_diff = list(set(list(course_info_data['id'])).symmetric_difference(set(list(dict_lookup_course_school.keys()))))
course_list_diff

['C_course-v1:CIE+CIE2017008+2019_T1',
 'C_course-v1:KMUSTX+1803168+2018_T2',
 'C_course-v1:SDSNAssociation+ECD001+sp',
 'C_course-v1:PSFFC+2018102404X+2018_T2',
 'C_course-v1:HNU+20180424001+2018_T1',
 'C_course-v1:TsinghuaX+80511503X+2019_T1',
 'C_course-v1:HUBU+HU08001X+2019_T1',
 'C_course-v1:XYSFXY+20181024X+2019_T1',
 'C_course-v1:nxu+2018122713+2019_T1',
 'C_course-v1:NEU+2018051501+sp',
 'C_course-v1:SDSNAssociation+TOW001+sp',
 'C_course-v1:ZZU+20180116001+2019_T1',
 'C_course-v1:SDSNAssociation+PB+sp',
 'C_course-v1:BSU+2018122404X+2019_T1',
 'C_course-v1:dlmu+20180906+2019_T1',
 'C_course-v1:HIT+HIT2016001+2019_T1',
 'C_course-v1:FAFU+55071003+2019_T1',
 'C_course-v1:FZXY+20180301001+2019_T1',
 'C_course-v1:SDUx+00931800X+sp',
 'C_course-v1:CIE+CIE2016004+2019_T1',
 'C_course-v1:TsinghuaX+80240372X+sp',
 'C_course-v1:JNU+07009215+2019_T1',
 'C_course-v1:CIE+CIE2016002+2019_T1',
 'C_course-v1:XJTU+2018122507X+2018_T2',
 'C_course-v1:CSMZXY+2018111301X+2019_T2',
 'C_course-v1:

In [None]:
dict_lookup_course_teacher

{'C_course-v1:SPI+20170828001x+sp': 'T_姚常青',
 'C_course-v1:SXPI+20170828001x+2019_T1': 'T_姚常青',
 'C_course-v1:PSFFC+2018102405X+2018_T2': 'T_赵石楠',
 'C_course-v1:TsinghuaX+70150104_2X+2019_T1': 'T_连小珉',
 'C_course-v1:TsinghuaX+70150104X+2019_T1': 'T_连小珉',
 'C_course-v1:TJU+2010241X+sp': 'T_张德顺',
 'C_course-v1:TsinghuaX+20250103X+sp': 'T_任艳频',
 'C_course-v1:KMUSTX+8219011+2019_T1': 'T_王裕森',
 'C_course-v1:BIFT+2018122901X+2018_T2': 'T_丁雅琼',
 'C_course-v1:PSFFC+2018102403X+2018_T2': 'T_陶昆',
 'C_course-v1:BUCM+2018122604X+2018_T2': 'T_周芬',
 'C_course-v1:BUCM+2018122604X+2019_T1': 'T_周芬',
 'C_course-v1:TsinghuaX+20180919X+2019_T1': 'T_靳卫萍',
 'C_course-v1:SCUT+145033+sp': 'T_杨俊荣',
 'C_course-v1:BNU+2018091303X+2019_T1': 'T_孙璞',
 'C_course-v1:BNU+2018091304X+2019_T1': 'T_杨兆春',
 'C_course-v1:HIT+13SC20301820+2019_T1': 'T_孙洁',
 'C_course-v1:TsinghuaX+40050455_2X+sp': 'T_黄霞',
 'C_course-v1:nxu+2018122711+2018_T2': 'T_段玉泉',
 'C_course-v1:TsinghuaX+60610231+2016_T2_SP': 'T_王  前',
 'C_course-v1:Tsin

In [None]:
course_list_diff = list(set(list(course_info_data['id'])).symmetric_difference(set(list(dict_lookup_course_teacher.keys()))))
course_list_diff

['C_course-v1:CIE+CIE2017008+2019_T1',
 'C_course-v1:KMUSTX+1803168+2018_T2',
 'C_course-v1:SDSNAssociation+ECD001+sp',
 'C_course-v1:PSFFC+2018102404X+2018_T2',
 'C_course-v1:HNU+20180424001+2018_T1',
 'C_course-v1:TsinghuaX+80511503X+2019_T1',
 'C_course-v1:HUBU+HU08001X+2019_T1',
 'C_course-v1:XYSFXY+20181024X+2019_T1',
 'C_course-v1:nxu+2018122713+2019_T1',
 'C_course-v1:NEU+2018051501+sp',
 'C_course-v1:SDSNAssociation+TOW001+sp',
 'C_course-v1:ZZU+20180116001+2019_T1',
 'C_course-v1:SDSNAssociation+PB+sp',
 'C_course-v1:BSU+2018122404X+2019_T1',
 'C_course-v1:dlmu+20180906+2019_T1',
 'C_course-v1:HIT+HIT2016001+2019_T1',
 'C_course-v1:FAFU+55071003+2019_T1',
 'C_course-v1:FZXY+20180301001+2019_T1',
 'C_course-v1:SDUx+00931800X+sp',
 'C_course-v1:CIE+CIE2016004+2019_T1',
 'C_course-v1:TsinghuaX+80240372X+sp',
 'C_course-v1:JNU+07009215+2019_T1',
 'C_course-v1:CIE+CIE2016002+2019_T1',
 'C_course-v1:XJTU+2018122507X+2018_T2',
 'C_course-v1:CSMZXY+2018111301X+2019_T2',
 'C_course-v1:

## Label Encoding

In [None]:
user_item_data['course_id'] = [course_encode[item] for item in user_item_data['course_id']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_data['course_id'] = [course_encode[item] for item in user_item_data['course_id']]


In [None]:
user_item_data

Unnamed: 0,user_id,course_id,interaction
0,U_8126464,355,3
1,U_8126464,354,3
18,U_8126464,587,1
19,U_8126464,348,2
20,U_8126464,434,5
...,...,...,...
482026,U_9188851,349,1
482035,U_10630648,348,1
482036,U_10630648,349,1
482135,U_10630648,350,2


Encode course id using dictionary.

## User Course Matrix

In [None]:
user_item_data_matrix = user_item_data.pivot(index='user_id', columns='course_id', values='interaction')
user_item_data_matrix = user_item_data_matrix.fillna(0)

In [None]:
user_item_data_matrix

course_id,1,2,3,4,5,8,11,13,14,15,...,685,687,690,691,695,697,698,699,705,706
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U_10093038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U_10093394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U_10093784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U_10093837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U_10093893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U_9974853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U_9975025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U_9975096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U_9975105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from scipy.sparse import csr_matrix
mat_items_users=csr_matrix(user_item_data_matrix.values)
mat_items_users

<5000x602 sparse matrix of type '<class 'numpy.float64'>'
	with 27526 stored elements in Compressed Sparse Row format>

Using CSR matrix to reduce the sparsity

# Modelling: CF

In [None]:
from sklearn.neighbors import NearestNeighbors
model_knn= NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(mat_items_users)

In [None]:
user_item_data

Unnamed: 0,user_id,course_id,interaction
0,U_8126464,355,3
1,U_8126464,354,3
18,U_8126464,587,1
19,U_8126464,348,2
20,U_8126464,434,5
...,...,...,...
482026,U_9188851,349,1
482035,U_10630648,348,1
482036,U_10630648,349,1
482135,U_10630648,350,2


In [None]:
def Recommender(course_name):
    model_knn.fit(mat_items_users)
    course_index = course_name
    distances, indices = model_knn.kneighbors(mat_items_users[course_index], n_neighbors=6)
    recc_course_indices = sorted(list(zip(indices.squeeze().tolist(),distances.squeeze().tolist())),key=lambda x: x[1])[:0:-1]
    recommend_frame = []
    for val in recc_course_indices:
        recommend_frame.append(user_item_data['course_id'].iloc[val[0]])

    return recommend_frame

In [None]:
Y_pred = []

for i in X_test['course_9']:
  Y_pred.append(list(Recommender(i)))

In [None]:
Y_pred = pd.DataFrame(Y_pred)

In [None]:
Y_pred

Unnamed: 0,0,1,2,3,4
0,434,355,503,406,348
1,445,155,582,449,586
2,309,434,406,503,483
3,558,301,115,408,321
4,309,222,437,515,408
...,...,...,...,...,...
1788,484,449,348,449,415
1789,437,449,342,635,26
1790,209,172,625,408,535
1791,227,364,158,434,290


the top 5 predicted courses for all users in the test data.

# Evaluation - Actualness

The actualness evaluation represents how well the model predicts the final course sequence compared to the actual sequence. This evaluation is biased when the data quality is low. For example, people may take different subjects which causes the course sequence to become very random reducing the model's ability to predict the final course sequence.

In [None]:
import math
from statistics import mean

def apk(actual, predicted, k=5):

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):

        if p == actual:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score/k

CF_map_5_actual = []



for i in range(len(Y_test)):
  CF_map_5_actual.append(apk(Y_test.iloc[i],list(Y_pred.iloc[i]), k=5))

map_CF_actual = mean(CF_map_5_actual)
map_CF_actual

0.004948875255623722

In [None]:
def hr(actual, predicted, k=5):

    num_hits = 0.0

    for i,p in enumerate(predicted):

        if p == actual:
            num_hits = 1.0
            pass

    return num_hits

CF_hit_5_actual = []

for i in range(len(Y_test)):
  CF_hit_5_actual.append(hr(Y_test.iloc[i],list(Y_pred.iloc[i]), k=5))

hr_CF_actual = mean(CF_hit_5_actual)
hr_CF_actual

0.05298382598996096

In [None]:
def rr(actual, predicted, k=5):

    num_hits = 0.0
    score = 0.0

    for i,p in enumerate(predicted):

        if p == actual:
            num_hits = 1.0
            score = num_hits/(i+1)
            pass

    return score

CF_rr_5_actual = []

for i in range(len(Y_test)):
  CF_rr_5_actual.append(rr(Y_test.iloc[i],list(Y_pred.iloc[i]), k=5))

rr_CF_actual = mean(CF_rr_5_actual)
rr_CF_actual

0.023238520171035507

In [None]:
def ndcg(actual, predicted, k=5):


    dcg = []
    idcg = []
    gain = 0.0

    for i in range(1,k+1):
      idcg.append(1/(math.log2(i+1)))

    for i,p in enumerate(predicted):

        if p == actual:
            gain = 1.0
            dcg.append(gain/(math.log2((i+1)+1)))

        else:
            dcg.append(0)

    return (sum(dcg))/(sum(idcg))

CF_ndcg_5_actual = []

for i in range(len(Y_test)):
  CF_ndcg_5_actual.append(ndcg(Y_test.iloc[i],list(Y_pred.iloc[i]), k=5))

ndcg_CF_actual = mean(CF_ndcg_5_actual)
ndcg_CF_actual

0.010745129220730633

In [None]:
evaluation_actual_list = [[map_CF_actual, hr_CF_actual, rr_CF_actual, ndcg_CF_actual]]

In [None]:
evaluation_actual = pd.DataFrame(evaluation_actual_list, columns = ['Mean Average Precision','Hit Ratio', 'Reciprocal Rank', "Normalized Discounted Cumulative Gain"])

In [None]:
evaluation_actual

Unnamed: 0,Mean Average Precision,Hit Ratio,Reciprocal Rank,Normalized Discounted Cumulative Gain
0,0.004949,0.052984,0.023239,0.010745
