In [1]:
import json
import numpy as np
import pandas as pd
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)

from tqdm import tqdm_notebook,tqdm
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
import gc

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LinearRegression

import seaborn as sns
import matplotlib.pyplot as plt

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


# 数据预处理

In [2]:
# Question metadata, keyed by raw question id (see its use in flatten_dataset).
with open(r'./input/questions.json','r') as questions_file:
    questions_info = json.load(questions_file)

# Raw id -> integer index maps for questions and concepts.
with open("input/keyid2idx.json",'r') as mapping_file:
    keyid2idx = json.load(mapping_file)

# Inverse maps (integer index -> raw id), used to decode the sequence columns.
keyidx2id = {
    field: {idx: raw_id for raw_id, idx in keyid2idx[field].items()}
    for field in ("questions", "concepts")
}

# One row per sequence; the sequence columns are comma-separated strings.
df = pd.read_csv('input/train_valid_sequences.csv')
df_test = pd.read_csv('input/pykt_test.csv')

In [3]:
# Restore the concept / question fields to their original (raw) ids.
def reverse_raw(df):
    """Decode the index-encoded sequence columns back to raw ids.

    Adds a `questions_raw` column holding the decoded `questions` sequence and
    overwrites `concepts` with its decoded form. Both are comma-separated
    strings; the padding token '-1' is passed through unchanged.

    Args:
        df: frame with comma-separated `questions` and `concepts` columns.

    Returns:
        The same frame object, modified in place.
    """
    def decode(sequence, field):
        # Tokens are looked up in the module-level `keyidx2id` inverse map.
        return ','.join(
            '-1' if token == '-1' else keyidx2id[field][int(token)]
            for token in sequence.split(',')
        )

    df['questions_raw'] = df['questions'].apply(lambda seq: decode(seq, 'questions'))
    df['concepts'] = df['concepts'].apply(lambda seq: decode(seq, 'concepts'))
    return df
# Decode both frames. reverse_raw overwrites `concepts` in place, so re-running
# this cell without reloading the CSVs would try to decode already-decoded ids.
df = reverse_raw(df)
df_test = reverse_raw(df_test)

## 数据encoding

In [6]:

# 获取concept的encoding
concept_encoding_dict = {}  # {'id':encoding_id}
concept_freq_dict = {}
question_freq_dict = {}
for _, row in tqdm_notebook(df.iterrows()):
    for ii, (question, concept, response, is_repeat, timestamp) in enumerate(zip(row['questions'].split(","),
                                               row['concepts'].split(","),
                                               row['responses'].split(","),
                                               row['is_repeat'].split(","),row['timestamps'].split(","))):
        if response == "-1":#remove the padding
            break
        if is_repeat != '1':
            if question not in question_freq_dict:
                question_freq_dict[question] = 0
            question_freq_dict[question] += 1
        if concept not in concept_freq_dict:
            concept_freq_dict[concept] = 0
        concept_freq_dict[concept] += 1
for i, concept in enumerate(sorted([(v, freq) for v, freq in concept_freq_dict.items()], key=lambda x:x[1], reverse=True)):
    concept_encoding_dict[concept[0]] = i


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for _, row in tqdm_notebook(df.iterrows()):


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [7]:
# 获取KC的encoding
kc_encoding_dict = {}
kc_freq_dict = {}

for question, values in questions_info.items():
    kcs = list(set([kc for kcs in values['concept_routes'] for kc in kcs.split('----')]))
    for kc in kcs:
        if kc not in kc_freq_dict:
            kc_freq_dict[kc] = 0
        if question not in question_freq_dict:
            question_freq_dict[question] = 1
        kc_freq_dict[kc] += question_freq_dict[question]
    
for i, kc in enumerate(sorted([(v, freq) for v, freq in kc_freq_dict.items()], key=lambda x:x[1], reverse=True)):
    kc_encoding_dict[kc[0]] = i

## 数据划分为每个uid, question一行

In [8]:
def flatten_dataset(df, is_test=False, hot_threshold=50):
    """Flatten per-user sequence rows into one record per (uid, question).

    Every input row carries comma-separated, position-aligned sequences in the
    columns `questions`, `questions_raw`, `concepts`, `responses`, `is_repeat`
    and `timestamps`. A step whose `is_repeat` flag is not '1' starts a new
    question record; a step flagged '1' only adds one more concept to the
    record currently being collected. Consecutive rows with the same `uid`
    continue the same record stream.

    Relies on the module-level lookups `questions_info`, `concept_freq_dict`
    and `kc_encoding_dict`.

    Args:
        df: frame with the columns listed above plus `uid`.
        is_test: when False, a response of "-1" is treated as padding and the
            rest of that row is skipped; when True, such steps are kept.
        hot_threshold: a concept counts as "hot" when its value in
            `concept_freq_dict` is strictly greater than this.

    Returns:
        A DataFrame with one row per collected question, sorted by
        (uid, timestamp, question). Empty if `df` yields no question.

    Raises:
        KeyError: if a concept is missing from `concept_freq_dict` or
            `kc_encoding_dict`, or a route KC is missing from `kc_encoding_dict`.
    """
    # Function-scope import: `tqdm.tqdm_notebook` is deprecated in favour of
    # `tqdm.notebook.tqdm`.
    from tqdm.notebook import tqdm as notebook_tqdm

    n_concept_slots = 6  # columns concept_1 .. concept_6
    n_kc_slots = 8       # columns kc_1 .. kc_8
    no_this_questions = []  # (uid, question) pairs absent from questions_info

    def save_uid_question(interaction_list, last_one):
        """Compute the derived features on `last_one` and append a copy of it."""
        # --- concept features ---
        raw_concepts = last_one['concepts']
        last_one['concept_cnt'] = len(raw_concepts)
        last_one['concept_hot_cnt'] = sum(
            1 for c in raw_concepts if concept_freq_dict[c] > hot_threshold
        )
        last_one['concepts_raw'] = raw_concepts
        # Most frequent concept first (ties broken by concept id, descending),
        # then encoded through the KC encoding.
        by_freq = sorted(((concept_freq_dict[c], c) for c in raw_concepts), reverse=True)
        encoded = [kc_encoding_dict[c] for _, c in by_freq]
        for slot in range(n_concept_slots):
            last_one[f'concept_{slot + 1}'] = encoded[slot] if slot < len(encoded) else -1
        last_one['concepts'] = str(encoded).replace(' ','')

        # --- question info features ---
        last_one['content_cnt'] = len(last_one['content'])
        last_one['kc_group_cnt'] = len(last_one['kc'])
        kc_groups = [[kc_encoding_dict[kc] for kc in route.split('----')] for route in last_one['kc']]
        kc_heads = [group[0] for group in kc_groups]  # first KC of every route
        last_one['kc_cnt'] = sum(len(group) for group in kc_groups)
        for slot in range(n_kc_slots):
            last_one[f'kc_{slot + 1}'] = kc_heads[slot] if slot < len(kc_heads) else -1
        last_one['analysis_cnt'] = len(last_one['analysis'])
        # List-valued fields are stored as compact strings.
        last_one['content'] = str(last_one['content']).replace(' ','')
        last_one['kc'] = str(kc_groups).replace(' ','')
        last_one['analysis'] = str(last_one['analysis']).replace(' ','')

        interaction_list.append(last_one.copy())

    interaction_list = []
    last_one = None  # record being collected; None until the first row is seen
    for _, row in notebook_tqdm(df.iterrows(), total=len(df)):
        uid = row['uid']
        if last_one is None or uid != last_one['uid']:
            # New user: flush the previous user's pending record, if it ever
            # received a question.
            if last_one is not None and 'concepts' in last_one:
                save_uid_question(interaction_list, last_one)
            last_one = {'uid': uid}
        steps = zip(
            row['questions'].split(","),
            row['questions_raw'].split(","),
            row['concepts'].split(","),
            row['responses'].split(","),
            row['is_repeat'].split(","),
            row['timestamps'].split(","),
        )
        for question, question_raw, concept, response, is_repeat, timestamp in steps:
            if not is_test and response == "-1":  # remove the padding
                break
            if is_repeat != '1':
                # A new question starts: flush the pending one, then begin collecting.
                if 'concepts' in last_one:
                    save_uid_question(interaction_list, last_one)
                last_one['question'] = int(question)
                last_one['concepts'] = [concept]
                last_one['response'] = int(response)
                last_one['timestamp'] = int(timestamp)
                try:
                    question_info = questions_info[question_raw]
                except KeyError:
                    # Unknown question: fall back to empty metadata and report it.
                    question_info = {'content':[],'concept_routes':[],'analysis':[],'type':-1}
                    no_this_questions.append((uid, question))
                last_one['content'] = question_info['content']
                last_one['kc'] = question_info['concept_routes']
                last_one['analysis'] = question_info['analysis']
                last_one['type'] = int(question_info['type'])
            elif 'concepts' in last_one:
                last_one['concepts'].append(concept)
            else:
                # Repeat step before any question was started for this user.
                print(uid, concept)
    if last_one is not None and 'concepts' in last_one:
        save_uid_question(interaction_list, last_one)

    # Chronological order within each user.
    interaction_list = sorted(interaction_list, key=lambda x: (x['uid'],x['timestamp'],x['question']))
    df_interaction = pd.DataFrame(interaction_list)
    print('no_this_questions:',no_this_questions)
    print(f"# interaction is {len(interaction_list)}")
    return df_interaction
# Flatten the training sequences and persist them. `df` is rebound to the
# flattened frame and then deleted, so this cell cannot be re-run without
# re-executing the loading / decoding cells first.
df = flatten_dataset(df)
df.to_feather('./input/df_train.feather')
del df
gc.collect()
# Same for the test sequences; is_test=True keeps steps whose response is "-1".
df_test = flatten_dataset(df_test, is_test=True)
df_test.to_feather('./input/df_test.feather')
gc.collect()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for _, row in tqdm_notebook(df.iterrows()):


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…


no_this_questions: []
# interaction is 4446736


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…


no_this_questions: []
# interaction is 1102810


0

In [9]:
# Display the flattened test frame (the rendering is truncated to its head and tail).
df_test

Unnamed: 0,uid,question,concepts,response,timestamp,content,kc,analysis,type,concept_cnt,concept_hot_cnt,concepts_raw,concept_1,concept_2,concept_3,concept_4,concept_5,concept_6,content_cnt,kc_group_cnt,kc_cnt,kc_1,kc_2,kc_3,kc_4,kc_5,kc_6,kc_7,kc_8,analysis_cnt
0,2,451,[71],1,1594558006000,"[1161,2441,1919,893,2599,56,1781,1198,375,2315...","[[0,3,57,70,71]]","[422,1651,1912,2135,354,2405,207,893,2197,2718...",1,1,1,[1978],71,-1,-1,-1,-1,-1,72,1,5,0,-1,-1,-1,-1,-1,-1,-1,158
1,2,452,[85],0,1594558557000,"[1594,1667,960,2230,1018,1524,2599,2100,1814,1...","[[0,4,26,46,85]]","[2039,1667,2621,1429,1777,1651,1285,1667,2718,...",0,1,1,[2365],85,-1,-1,-1,-1,-1,94,1,5,0,-1,-1,-1,-1,-1,-1,-1,68
2,2,453,[773],1,1594642526000,"[1432,2526,2345,832,832,1940,1104,832,832,2718...","[[0,3,13,171,177,773]]","[832,832,1363,1886,832,832,2718,832,832,1363,1...",0,1,1,[141],773,-1,-1,-1,-1,-1,130,1,6,0,-1,-1,-1,-1,-1,-1,-1,41
3,2,454,[773],1,1594642930000,"[1432,2526,2345,832,832,1940,1104,832,832,2718...","[[0,3,13,171,177,773]]","[375,1123,2011,898,2345,789,832,832,1363,1886,...",0,1,1,[141],773,-1,-1,-1,-1,-1,128,1,6,0,-1,-1,-1,-1,-1,-1,-1,115
4,2,455,[88],1,1594643736000,"[2637,417,1349,1161,2671,1264,1494,39,2718,127...","[[0,3,13,50,81,88]]","[832,832,1091,384,1940,208,1940,208,1232,384,1...",0,1,1,[328],88,-1,-1,-1,-1,-1,51,1,6,0,-1,-1,-1,-1,-1,-1,-1,61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1102805,18065,988,[86],-1,1611818774000,"[2538,916,1779,1968,220,354,2408,372,2718,1765...","[[0,1,19,33,39,86]]","[2127,2683,1105,960,296,2408,372,2718,2685,461...",1,1,1,[1656],86,-1,-1,-1,-1,-1,85,1,6,0,-1,-1,-1,-1,-1,-1,-1,96
1102806,18065,989,[356],-1,1611818774000,"[935,935,273,941,490,2718,2065,824,2348,2441,2...","[[11,14,15,19,356]]","[2344,1343,1907,249,2348,2599,178,2504,2599,14...",1,1,1,[235],356,-1,-1,-1,-1,-1,93,1,5,11,-1,-1,-1,-1,-1,-1,-1,165
1102807,18065,1001,[48],-1,1611818861000,"[2441,275,2345,1082,1017,1592,342,1198,832,832...","[[0,3,35,48]]","[342,1198,832,832,1104,1363,208,1104,1104,1452...",0,1,1,[297],48,-1,-1,-1,-1,-1,91,1,4,0,-1,-1,-1,-1,-1,-1,-1,25
1102808,18065,1003,[48],-1,1611818861000,"[1082,1550,2485,594,364,756,1094,119,1302,2217...","[[0,3,35,48]]","[832,1940,1104,208,1940,1886,1452,1104,980,384...",0,1,1,[297],48,-1,-1,-1,-1,-1,95,1,4,0,-1,-1,-1,-1,-1,-1,-1,15
