In [3]:
import random
import pandas as pd
import tqdm
import numpy as np

data = pd.read_csv(
    'data/assist09/raw/skill_builder_data_corrected.csv',
    usecols=['order_id', 'user_id', 'problem_id', 'skill_id', 'correct']
).dropna(subset=['skill_id'])

In [4]:
# 建立练习映射,编号从1~n
raw_problem=data.problem_id.unique().tolist()
raw_problem.sort()
num_problem=len(raw_problem)
problems={p:i+1 for i,p in enumerate(raw_problem)}
print("number of problems: %d" % num_problem)
np.save('data/assist09/map/eMap.npy',problems)

number of problems: 17751


In [5]:
#将problem_id替换成eMap的value
problems=np.load('data/assist09/map/eMap.npy',allow_pickle=True).item()
data=data.replace({"problem_id":problems})

In [6]:
# 建立技能映射，编号从0~n-1
raw_question = data.skill_id.unique().tolist()
num_skill = len(raw_question)

skills = { p: i for i, p in enumerate(raw_question) }
print("number of skills: %d" % num_skill)
np.save('data/assist09/map/cMap.npy',skills)

number of skills: 123


In [7]:
#将skill_id替换成cMap的value
skills=np.load('data/assist09/map/cMap.npy',allow_pickle=True).item()
data=data.replace({"skill_id":skills})

In [8]:
from sklearn.preprocessing import normalize
# 建立练习-技能-邻接矩阵
adj_problem_skill=np.zeros((num_problem+1,num_skill))
single_problem_skill_pair=data.drop_duplicates(subset=['problem_id','skill_id'])[['problem_id','skill_id']].sort_values(by=['problem_id'])
print(single_problem_skill_pair)
for i,row in single_problem_skill_pair.iterrows():
    # adj_problem_skill[problems[row['problem_id']]][skills[row['skill_id']]]=1
    adj_problem_skill[int(row['problem_id'])][int(row['skill_id'])]=1
#保存e2c邻接矩阵
np.save('data/assist09/adj/e2cAdj.npy',adj_problem_skill)
#保存e2c-归一化的邻接矩阵
norm_e2c=normalize(adj_problem_skill,norm='l1',axis=1)
np.save('data/assist09/adj/e2cAdjNorm.npy',norm_e2c)

        problem_id  skill_id
146242           1      39.0
191473           2      49.0
112391           3      31.0
191472           4      49.0
112390           4      31.0
...            ...       ...
253931       17747      81.0
253932       17748      81.0
253942       17749      81.0
253933       17750      81.0
72043        17751      20.0

[21246 rows x 2 columns]


In [9]:
# 将每个学生的答题序列分好块
def parse_all_seq(students):
    all_sequences = []
    for student_id in tqdm.tqdm(students, 'parse student sequence:\t'):
        student_sequence = parse_student_seq(data[data.user_id == student_id])
        all_sequences.extend([student_sequence])
    return all_sequences


def parse_student_seq(student):
    seq = student.sort_values('order_id')
    return seq['problem_id'].values,seq['correct'].values

data=data.drop_duplicates(subset=['order_id'])
print(data)
sequences = parse_all_seq(sorted(data.user_id.unique()))
print('总的学生人数：',len(sequences))

        order_id  user_id  problem_id  correct  skill_id
0       33022537    64525        1069        1       0.0
1       33022709    64525        1080        1       0.0
2       35450204    70363        1089        0       0.0
3       35450295    70363        1040        1       0.0
4       35450311    70363        1126        0       0.0
...          ...      ...         ...      ...       ...
337996  33150408    85730       15444        0     122.0
337997  33150487    85730       15458        1     122.0
337998  33150779    85730       15448        1     122.0
337999  33151098    85730       15459        0     122.0
338000  31950415    87896       15415        0     122.0

[283105 rows x 5 columns]


parse student sequence:	: 100%|██████████| 4163/4163 [00:03<00:00, 1236.81it/s]

总的学生人数： 4163





In [10]:
print(type(sequences[0]))
print(type(sequences))
print(sequences[0])

<class 'tuple'>
<class 'list'>
(array([12668, 12692, 12685, 12704, 12705, 12700, 12708,  2993,  3182,
        2977,  3173,  3168, 12032, 12242, 12231, 11732, 12213, 11712,
       11715]), array([0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0]))


In [11]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=10,shuffle=True)
allFoldSeq=[]

for train_index,test_index in kf.split(sequences):
    allFoldSeq.append((np.array(sequences)[train_index], np.array(sequences)[test_index]))

  allFoldSeq.append((np.array(sequences)[train_index], np.array(sequences)[test_index]))


In [12]:
np.save('data/assist09/raw/allFoldSeq.npy',allFoldSeq)

  arr = np.asanyarray(arr)


In [13]:
adj_problem_skill=np.load('data/assist09/adj/e2cAdj.npy')
def splitToMaxStep(sequences,maxstep):
    e_data=[]
    a_data=[]
    for e_features,a in tqdm.tqdm(sequences, 'splitting into MaxStep: '):
        
        length=e_features.shape[0]
        slices = length//maxstep + (1 if length%maxstep > 0 else 0)
        for i in range(slices):
            e_temp = np.zeros(shape=[maxstep,1])
            a_temp = np.zeros(shape=[maxstep,1])
            if length>0:
                if length>=maxstep:
                    l=maxstep
                else:
                    l=length
                for j in range(l):
                    e_temp[j]=e_features[i*maxstep+j]
                    a_temp[j]=a[i*maxstep+j]
                length = length - maxstep
            e_data.append(e_temp)
            a_data.append(a_temp)
    
    return np.concatenate((np.array(e_data).astype(float),np.array(a_data).astype(float)),axis=2)

In [14]:
import _pickle as pickle
allFoldSeq=np.load('data/assist09/raw/allFoldSeq.npy',allow_pickle=True)
MAX_STEP = 128
i=1
for train_sequences,test_sequences in allFoldSeq:
    train_data=splitToMaxStep(train_sequences,MAX_STEP)
    test_data=splitToMaxStep(test_sequences,MAX_STEP)
    
    trainFile=open('data/assist09/raw/train/train_data'+'_'+str(i)+'.txt','wb')
    testFile=open('data/assist09/raw/test/test_data'+'_'+str(i)+'.txt','wb')
    pickle.dump(train_data,trainFile)
    pickle.dump(test_data,testFile)
    trainFile.close()
    testFile.close()
    i=i+1


splitting into MaxStep: 100%|██████████| 3746/3746 [00:00<00:00, 16641.97it/s]
splitting into MaxStep: 100%|██████████| 417/417 [00:00<00:00, 18644.53it/s]
splitting into MaxStep: 100%|██████████| 3746/3746 [00:00<00:00, 17314.17it/s]
splitting into MaxStep: 100%|██████████| 417/417 [00:00<00:00, 14834.31it/s]
splitting into MaxStep: 100%|██████████| 3746/3746 [00:00<00:00, 16719.02it/s]
splitting into MaxStep: 100%|██████████| 417/417 [00:00<00:00, 16959.09it/s]
splitting into MaxStep: 100%|██████████| 3747/3747 [00:00<00:00, 16911.92it/s]
splitting into MaxStep: 100%|██████████| 416/416 [00:00<00:00, 15344.03it/s]
splitting into MaxStep: 100%|██████████| 3747/3747 [00:00<00:00, 16944.92it/s]
splitting into MaxStep: 100%|██████████| 416/416 [00:00<00:00, 18391.42it/s]
splitting into MaxStep: 100%|██████████| 3747/3747 [00:00<00:00, 16790.05it/s]
splitting into MaxStep: 100%|██████████| 416/416 [00:00<00:00, 14598.53it/s]
splitting into MaxStep: 100%|██████████| 3747/3747 [00:00<00:00,