In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Column-name -> dtype maps for the CSVs read below. The nullable 'boolean'
# dtype is used for prior_question_had_explanation so missing entries can be
# represented in that column.
train_type = {
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float64',
    'prior_question_had_explanation': 'boolean',
}
question_type = {
    'question_id': 'int16',
    'part': 'int8',
}

# Select columns by name (the keys of the dtype maps) instead of by position:
# the column list and the dtype map can then never drift apart, and a renamed
# or missing column raises immediately instead of silently loading a
# different column.
train_df = pd.read_csv(
    '../data/train.csv',
    usecols=list(train_type),
    dtype=train_type,
)
# Lecture metadata is read in full with inferred dtypes.
lecture_df = pd.read_csv('../data/lectures.csv')
question_df = pd.read_csv(
    '../data/questions.csv',
    usecols=list(question_type),
    dtype=question_type,
)

# lecture

In [3]:
# Replace the space in the 'solving question' label so the generated dummy
# column is named type_of_solving_question, matching the other type_of_* names.
lecture_df['type_of'] = lecture_df['type_of'].replace('solving question', 'solving_question')

# One-hot encode 'part' and 'type_of'. The dtype is pinned because the
# get_dummies default differs between pandas versions (uint8 before 2.0, bool
# from 2.0 on); uint8 keeps the indicators as 0/1 integers everywhere.
# NOTE: this rebinds lecture_df and consumes the 'part'/'type_of' columns, so
# the cell cannot be re-run without re-loading lecture_df first.
lecture_df = pd.get_dummies(lecture_df, columns=['part', 'type_of'], dtype='uint8')
lecture_df.head()

Unnamed: 0,lecture_id,tag,part_1,part_2,part_3,part_4,part_5,part_6,part_7,type_of_concept,type_of_intention,type_of_solving_question,type_of_starter
0,89,159,0,0,0,0,1,0,0,1,0,0,0
1,100,70,1,0,0,0,0,0,0,1,0,0,0
2,185,45,0,0,0,0,0,1,0,1,0,0,0
3,192,79,0,0,0,0,1,0,0,0,0,1,0
4,317,156,0,0,0,0,1,0,0,0,0,1,0


In [4]:
# Keep only the lecture events (content_type_id == 1) and attach the one-hot
# lecture metadata; a left join keeps every lecture event even if its
# content_id has no match in lecture_df.
train_lecture = pd.merge(
    train_df.loc[train_df['content_type_id'] == 1],
    lecture_df,
    how='left',
    left_on='content_id',
    right_on='lecture_id',
)
train_lecture.head()

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,lecture_id,tag,...,part_2,part_3,part_4,part_5,part_6,part_7,type_of_concept,type_of_intention,type_of_solving_question,type_of_starter
0,653762,2746,6808,1,14,-1,,False,6808,129,...,1,0,0,0,0,0,0,1,0,0
1,10183847,5382,16736,1,21,-1,,False,16736,40,...,0,0,0,0,0,0,1,0,0,0
2,1424348597,5382,30207,1,104,-1,,False,30207,43,...,0,0,0,1,0,0,1,0,0,0
3,1425557777,5382,18545,1,121,-1,,False,18545,58,...,0,0,0,1,0,0,1,0,0,0
4,405813029,8623,10540,1,59,-1,,False,10540,99,...,0,0,0,0,0,0,1,0,0,0


In [9]:
# Order the lecture events by timestamp and renumber the rows from 0.
train_lecture = (
    train_lecture
    .sort_values(by='timestamp')
    .reset_index(drop=True)
)
train_lecture.head()

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,lecture_id,tag,...,part_2,part_3,part_4,part_5,part_6,part_7,type_of_concept,type_of_intention,type_of_solving_question,type_of_starter
0,0,2094164160,9896,1,0,-1,,False,9896,167,...,0,0,0,1,0,0,1,0,0,0
1,0,1674321171,2327,1,2,-1,,False,2327,178,...,0,0,0,0,0,0,1,0,0,0
2,0,1587249587,21411,1,1,-1,,False,21411,163,...,1,0,0,0,0,0,0,0,1,0
3,0,1962849778,21852,1,0,-1,,False,21852,17,...,1,0,0,0,0,0,1,0,0,0
4,0,1815165188,6137,1,0,-1,,False,6137,25,...,0,0,0,1,0,0,1,0,0,0


In [5]:
# Feature extraction example: per-user lecture counts plus 0/1 "seen at least
# once" flags for every one-hot lecture attribute.
# Every lecture_df column after the first two (lecture_id, tag) is a
# part_* / type_of_* indicator.
indicator_columns = list(lecture_df.columns[2:])
lecture_counts = train_lecture.groupby('user_id')[indicator_columns].sum()

# <name>_boolean is 1 when the user watched at least one lecture of that kind.
lecture_flags = lecture_counts.gt(0).astype(int).add_suffix('_boolean')

user_lecture_stats_part = pd.concat([lecture_counts, lecture_flags], axis=1)
user_lecture_stats_part.head()

Unnamed: 0_level_0,part_1,part_2,part_3,part_4,part_5,part_6,part_7,type_of_concept,type_of_intention,type_of_solving_question,...,part_2_boolean,part_3_boolean,part_4_boolean,part_5_boolean,part_6_boolean,part_7_boolean,type_of_concept_boolean,type_of_intention_boolean,type_of_solving_question_boolean,type_of_starter_boolean
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2746,0,1,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
5382,1,0,0,0,2,0,0,3,0,0,...,0,0,0,1,0,0,1,0,0,0
8623,2,1,0,0,0,0,0,3,0,0,...,1,0,0,0,0,0,1,0,0,0
12741,0,0,0,3,0,1,2,4,0,2,...,0,0,1,0,1,1,1,0,1,0
13134,1,3,0,0,3,0,0,6,1,0,...,1,0,0,1,0,0,1,1,0,0


# question

In [6]:
# One-hot encode the question 'part'. The dtype is pinned because the
# get_dummies default differs between pandas versions (uint8 before 2.0, bool
# from 2.0 on); uint8 keeps the indicators as 0/1 integers everywhere.
# NOTE: this rebinds question_df and consumes the 'part' column, so the cell
# cannot be re-run without re-loading question_df first.
question_df = pd.get_dummies(question_df, columns=['part'], dtype='uint8')
question_df.head()

Unnamed: 0,question_id,part_1,part_2,part_3,part_4,part_5,part_6,part_7
0,0,1,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0
2,2,1,0,0,0,0,0,0
3,3,1,0,0,0,0,0,0
4,4,1,0,0,0,0,0,0


In [7]:
# Keep only the question events (content_type_id == 0) and attach the one-hot
# question metadata; a left join keeps every question event even if its
# content_id has no match in question_df.
train_question = pd.merge(
    train_df.loc[train_df['content_type_id'] == 0],
    question_df,
    how='left',
    left_on='content_id',
    right_on='question_id',
)
train_question.head()

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,question_id,part_1,part_2,part_3,part_4,part_5,part_6,part_7
0,0,115,5692,0,1,1,,,5692,0,0,0,0,1,0,0
1,56943,115,5716,0,2,1,37000.0,False,5716,0,0,0,0,1,0,0
2,118363,115,128,0,0,1,55000.0,False,128,1,0,0,0,0,0,0
3,131167,115,7860,0,3,1,19000.0,False,7860,1,0,0,0,0,0,0
4,137965,115,7922,0,4,1,11000.0,False,7922,1,0,0,0,0,0,0


In [8]:
# Order the question events by timestamp and renumber the rows from 0.
train_question = (
    train_question
    .sort_values(by='timestamp')
    .reset_index(drop=True)
)
train_question.head()

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,question_id,part_1,part_2,part_3,part_4,part_5,part_6,part_7
0,0,115,5692,0,1,1,,,5692,0,0,0,0,1,0,0
1,0,1805962620,5547,0,0,0,,,5547,0,0,0,0,1,0,0
2,0,2015251289,4024,0,0,1,,,4024,0,0,0,0,1,0,0
3,0,867941388,6659,0,0,1,,,6659,0,0,0,0,1,0,0
4,0,867946278,3977,0,0,1,,,3977,0,0,0,0,1,0,0


In [10]:
# feature extract example
user_answered_accuracy = train_question[['user_id', 'answered_correctly']].groupby('user_id').mean()
user_answered_accuracy.head()

Unnamed: 0_level_0,answered_correctly
user_id,Unnamed: 1_level_1
115,0.695652
124,0.233333
2746,0.578947
5382,0.672
8623,0.642202
