In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch import nn

from catboost import CatBoostClassifier

from sklearn.metrics import roc_curve, auc, precision_score, recall_score, accuracy_score, \
        PrecisionRecallDisplay, average_precision_score, precision_recall_curve
from sklearn.utils import shuffle
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier

import json
from networkx.readwrite import json_graph
import networkx as nx

from tqdm.notebook import tqdm, trange

In [2]:
%%time
date_cols = ['attempts_date_created', 'cl_date_assignment', 'cls_date_created']
df = pd.read_csv('wide_math.csv', parse_dates=date_cols, index_col=0)

CPU times: user 2min 15s, sys: 1.66 s, total: 2min 16s
Wall time: 2min 16s


In [3]:
# df = df.sort_values(by='attempts_date_created').reset_index(drop=True)
# df = df.groupby(by=['cls_student_id', 'problem_id'], as_index=False).first()
# df = df.sort_values(by='attempts_date_created').reset_index(drop=True)
print(df.shape)
df.head(2)

(5645888, 13)


Unnamed: 0,assignment_level,attempts_date_created,cl_date_assignment,cl_id,cls_date_created,cls_student_id,course_id,is_solved,problem_id,subject_slug,team_id,team_level,tp_teacher_id
0,3,2022-03-08 11:02:37+03:00,2022-03-05 13:00:00+03:00,71374307,2022-03-08 10:52:43+03:00,1650006,5096626,1,97304,mathematics,82516,4,520541
1,3,2022-03-08 11:02:49+03:00,2021-09-02 14:23:50+03:00,70879797,2022-03-08 11:02:08+03:00,1494142,5099807,1,227546,mathematics,69290,4,615530


In [4]:
df['execution_time'] = df['attempts_date_created'] - df['cls_date_created']
df['execution_time'] = df['execution_time'].dt.total_seconds()

In [5]:
df = df[['cls_student_id', 'problem_id', 'assignment_level',
       'attempts_date_created', 'is_solved', 'execution_time']]

In [6]:
skill_info = pd.read_excel('markup_math.xlsx', index_col=0)
skills = set(skill_info['skills'])

In [7]:
problems_has_themes = set(skill_info['problem_id'])
df = df[df['problem_id'].apply(lambda x: x in problems_has_themes)]
df = df.reset_index(drop=True)
print(df.shape)
df.head(2)

(4639383, 6)


Unnamed: 0,cls_student_id,problem_id,assignment_level,attempts_date_created,is_solved,execution_time
0,1650006,97304,3,2022-03-08 11:02:37+03:00,1,594.0
1,1494142,227546,3,2022-03-08 11:02:49+03:00,1,41.0


In [8]:
def get_train_test(df):
    df = shuffle(df, random_state=0)
    
    med_date = df['attempts_date_created'].median()
    med_user = df['cls_student_id'].median()
    
    test_index = (df['cls_student_id'] > med_user) & (df['attempts_date_created'] > med_date)
    train_index = ~test_index
    
    df_train = df[train_index]
    df_test = df[test_index]
    
    train_problems = set(df_train['problem_id'])
    train_students = set(df_train['cls_student_id'])
    
    df_test = df_test[df_test['problem_id'].apply(lambda x: x in train_problems)]
    df_test = df_test[df_test['cls_student_id'].apply(lambda x: x in train_students)]
    
    return df_train, df_test

In [9]:
def get_metrics(target, pred, pred_score):
    result = {
        "precision":0,
        "recall": 0,
        "acc": 0,
        "roc_auc": 0, 
        "ap": 0,
        "roc1" : [],
        "roc2" : [],
        "pr1" : [], 
        "pr2" : [],
    }
    
    result['precision'] = precision_score(target, pred)
    result['recall'] = recall_score(target, pred)
    result['acc'] = accuracy_score(target, pred)
    result['roc1'], result['roc2'], _ = roc_curve(target, pred_score)
    result['roc_auc'] = auc(result['roc1'], result['roc2'])
    result['pr1'], result['pr2'], _ = precision_recall_curve(target, pred_score)
    result['ap'] = average_precision_score(target, pred_score)
    
    return result

In [10]:
class ConstModel:
    def fit(self, X, y):
        return self
    
    def predict(self, X):
        return np.ones(X.shape[0])

In [11]:
df_train, df_test = get_train_test(df)

In [12]:
X_train, y_train = df_train.drop(['is_solved'], axis=1), df_train['is_solved']
X_test, y_test = df_test.drop(['is_solved'], axis=1), df_test['is_solved']

In [13]:
all_model_info = dict()

def get_info():
    a = pd.DataFrame.from_dict(all_model_info.values())
    a.index = list(all_model_info.keys())
    return a[['acc', 'roc_auc', 'ap', 'precision', 'recall']]

In [14]:
const_model = ConstModel()
pred = const_model.predict(X_test)
all_model_info['const'] = get_metrics(y_test, pred, pred)
get_info()

Unnamed: 0,acc,roc_auc,ap,precision,recall
const,0.595775,0.5,0.595775,0.595775,1.0


In [15]:
class LinModel(nn.Module):
    def __init__(self, n_students, n_problems, emb_size=128):
        super().__init__()
        
        self.n_students = n_students
        self.n_problems = n_problems
        
        self.stud_embed = nn.Embedding(n_students, emb_size)
        self.problem_embed = nn.Embedding(n_problems, emb_size)
        self.head = nn.Sequential(
            nn.Linear(emb_size * 2, emb_size),
            nn.Tanh(),
            nn.Linear(emb_size, 2),
            nn.Softmax(dim=-1),
        )
        
        self.problem_to_index = None
        self.st_to_index = None
        
    def forward(self, X):
        problems = X[:, 0]
        students = X[:, 1]
        
        problems = self.problem_embed(problems)
        students = self.stud_embed(students)
        
        x = torch.cat((students, problems), dim=-1)    
        return self.head(x)
    
    def convert_X(self, X):
        X = X.copy()
        X['cls_student_id'] = X['cls_student_id'].apply(lambda x: self.st_to_index[x])
        X['problem_id'] = X['problem_id'].apply(lambda x: self.problem_to_index[x])
        
        return X[['problem_id', 'cls_student_id']].to_numpy()
    
    def fit(self, X, y, optimizer, criterion, device, epochs=101):
        students_set = set(X['cls_student_id'])
        problems_set = set(X['problem_id'])
        assert len(students_set) == self.n_students
        assert len(problems_set) == self.n_problems
        
        self.problem_to_index = {pr_id: i for i, pr_id in enumerate(problems_set)}
        self.st_to_index = {st_id: i for i, st_id in enumerate(students_set)}
        
        X = self.convert_X(X)
        X = torch.IntTensor(X).to(device)
        y = torch.LongTensor(y).to(device)
        
        for epoch in trange(epochs):
            y_pred = self(X)
            loss = criterion(y_pred, y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if (epoch + 1) % (epochs / 10) == 0:
                print(f"epoch: {epoch:02}, loss: {loss.item():.3}")
            
    @torch.no_grad()
    def predict_proba(self, X, device):
        X = self.convert_X(X)
        X = torch.IntTensor(X).to(device)
        
        return self(X).cpu().numpy()
                
    def predict(self, X, device):
        proba = self.predict_proba(X, device)
        return np.argmax(proba, axis=-1)
        
        

In [16]:
problems = set(X_train['problem_id'])
students = set(X_train['cls_student_id'])

assert not set(X_test['problem_id']).difference(problems)
assert not set(X_test['cls_student_id']).difference(students)

In [17]:
device = 'cuda'
lin_model = LinModel(len(students), len(problems)).to(device)
optimizer = torch.optim.Adam(lin_model.parameters())
criterion = nn.CrossEntropyLoss()

In [18]:
lin_model.fit(X_train, y_train.to_numpy(), optimizer, criterion, device, epochs=200)

  0%|          | 0/200 [00:00<?, ?it/s]

epoch: 19, loss: 0.663
epoch: 39, loss: 0.636
epoch: 59, loss: 0.611
epoch: 79, loss: 0.59
epoch: 99, loss: 0.574
epoch: 119, loss: 0.562
epoch: 139, loss: 0.553
epoch: 159, loss: 0.547
epoch: 179, loss: 0.543
epoch: 199, loss: 0.54


In [19]:
pred_proba = lin_model.predict_proba(X_test, device)
pred = lin_model.predict(X_test, device)
all_model_info['lin_model'] = get_metrics(y_test, pred, pred_proba[:, 1])
get_info()

Unnamed: 0,acc,roc_auc,ap,precision,recall
const,0.595775,0.5,0.595775,0.595775,1.0
lin_model,0.707778,0.756269,0.801462,0.72479,0.821406


In [20]:
class AlsModel(nn.Module):
    def __init__(self, n_students, n_problems, emb_size=128):
        super().__init__()
        
        self.n_students = n_students
        self.n_problems = n_problems
        
        self.stud_embed = nn.Embedding(n_students, emb_size, max_norm=1/emb_size**0.5)
        self.problem_embed = nn.Embedding(n_problems, emb_size, max_norm=1/emb_size**0.5)
        
        self.problem_to_index = None
        self.st_to_index = None
        
    def forward(self, X):
        problems = X[:, 0]
        students = X[:, 1]
        
        problems = self.problem_embed(problems)
        students = self.stud_embed(students)
          
        return torch.sigmoid((problems * students).sum(dim=-1))
    
    def convert_X(self, X):
        X = X.copy()
        X['cls_student_id'] = X['cls_student_id'].apply(lambda x: self.st_to_index[x])
        X['problem_id'] = X['problem_id'].apply(lambda x: self.problem_to_index[x])
        
        return X[['problem_id', 'cls_student_id']].to_numpy()
    
    def fit(self, X, y, optimizer_stud, optimizer_problem, criterion, device, epochs=101):
        students_set = set(X['cls_student_id'])
        problems_set = set(X['problem_id'])
        assert len(students_set) == self.n_students
        assert len(problems_set) == self.n_problems
        
        self.problem_to_index = {pr_id: i for i, pr_id in enumerate(problems_set)}
        self.st_to_index = {st_id: i for i, st_id in enumerate(students_set)}
        
        X = self.convert_X(X)
        X = torch.IntTensor(X).to(device)
        y = torch.Tensor(y).to(device)
        
        for epoch in trange(epochs):
            y_pred = self(X)
            loss = criterion(y_pred, y)
            
            optimizer_stud.zero_grad()
            loss.backward()
            optimizer_stud.step()
            
            y_pred = self(X)
            loss = criterion(y_pred, y)
            
            optimizer_problem.zero_grad()
            loss.backward()
            optimizer_problem.step()
            
            if (epoch + 1) % (epochs / 10) == 0:
                print(f"epoch: {epoch:02}, loss: {loss.item():.3}")
            
    @torch.no_grad()
    def predict_proba(self, X, device):
        X = self.convert_X(X)
        X = torch.IntTensor(X).to(device)
        
        return self(X).cpu().numpy()
                
    def predict(self, X, device):
        proba = self.predict_proba(X, device)
        return proba > 0.5

In [21]:
device = 'cuda'
als_model = AlsModel(len(students), len(problems)).to(device)
optimizer_stud = torch.optim.Adam(lin_model.stud_embed.parameters())
optimizer_problem = torch.optim.Adam(lin_model.problem_embed.parameters())
criterion = nn.L1Loss()

In [22]:
als_model.fit(X_train, y_train.to_numpy(), optimizer_stud, optimizer_problem, criterion, device, epochs=100)

  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 09, loss: 0.5
epoch: 19, loss: 0.5
epoch: 29, loss: 0.5
epoch: 39, loss: 0.5
epoch: 49, loss: 0.5
epoch: 59, loss: 0.5
epoch: 69, loss: 0.5
epoch: 79, loss: 0.5
epoch: 89, loss: 0.5
epoch: 99, loss: 0.5


In [23]:
pred_proba = als_model.predict_proba(X_test, device)
pred = als_model.predict(X_test, device)
all_model_info['als_model'] = get_metrics(y_test, pred, pred_proba)
get_info()

Unnamed: 0,acc,roc_auc,ap,precision,recall
const,0.595775,0.5,0.595775,0.595775,1.0
lin_model,0.707778,0.756269,0.801462,0.72479,0.821406
als_model,0.499735,0.499208,0.594793,0.595444,0.500069


In [24]:
problem_info = df_train.groupby(by=['problem_id'])[['is_solved']].mean()
problem_info.columns = ['mean_problem_solved']
students_info = df_train.groupby(by=['cls_student_id'])[['is_solved']].mean()
students_info.columns = ['mean_student_solved']
problem_info['execution_time'] = df_train.groupby(by=['problem_id'])['execution_time'].median()
students_info['execution_time'] = df_train.groupby(by=['cls_student_id'])['execution_time'].median()

difficulty_math = pd.read_excel('ctt_simple.xlsx')
difficulty_math = difficulty_math[1:]
difficulty_math['problem_id'] = difficulty_math['problem_id'].astype(int)
difficulty_math = difficulty_math.set_index(['problem_id'])
difficulty_math['difficulty'] = difficulty_math['difficulty'].astype(float)
difficulty_math = difficulty_math.groupby(by='problem_id')['difficulty'].mean()
difficulty_math.head()

problem_info = problem_info.join(difficulty_math)
problem_info['has_diff_info'] = ((problem_info['difficulty'].isna()) | (problem_info['difficulty'] == 0)).astype(int)
problem_info['difficulty'] = problem_info['difficulty'].fillna(0)
problem_info.head(3)

Unnamed: 0_level_0,mean_problem_solved,execution_time,difficulty,has_diff_info
problem_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
31964,0.666667,417.0,0.336907,0
31980,0.695652,71.0,0.079576,0
31988,0.55814,1345.5,0.409384,0


In [25]:
df = df.join(problem_info, on='problem_id', rsuffix='_problem')
df = df.join(students_info, on='cls_student_id', rsuffix='_student')

In [26]:
not_need_cols = ['cls_student_id', 'problem_id', 'attempts_date_created', 'is_solved','execution_time']
df_train, df_test = df.iloc[df_train.index], df.iloc[df_test.index]
df_train = df_train.dropna()
df_test = df_test.dropna()

In [27]:
X_train, y_train = df_train.drop(not_need_cols, axis=1), df_train['is_solved']
X_test, y_test = df_test.drop(not_need_cols, axis=1), df_test['is_solved']

In [28]:
clf = make_pipeline(StandardScaler(), SGDClassifier(max_iter=1000, tol=1e-3, loss='log_loss'))
clf.fit(X_train, y_train)

pred_proba = clf.predict_proba(X_test)[:,1]
pred = clf.predict(X_test)
all_model_info['SGD'] = get_metrics(y_test, pred, pred_proba)
get_info()

Unnamed: 0,acc,roc_auc,ap,precision,recall
const,0.595775,0.5,0.595775,0.595775,1.0
lin_model,0.707778,0.756269,0.801462,0.72479,0.821406
als_model,0.499735,0.499208,0.594793,0.595444,0.500069
SGD,0.707361,0.765357,0.816738,0.713915,0.849045


In [29]:
cat_class = CatBoostClassifier(iterations=100, depth=10,
                           task_type="GPU",
                           devices='0:1', logging_level='Silent')
cat_class.fit(X_train, y_train)
pred_proba = cat_class.predict_proba(X_test)[:,1]
pred = cat_class.predict(X_test)
all_model_info['cat_boost'] = get_metrics(y_test, pred, pred_proba)
get_info()



Unnamed: 0,acc,roc_auc,ap,precision,recall
const,0.595775,0.5,0.595775,0.595775,1.0
lin_model,0.707778,0.756269,0.801462,0.72479,0.821406
als_model,0.499735,0.499208,0.594793,0.595444,0.500069
SGD,0.707361,0.765357,0.816738,0.713915,0.849045
cat_boost,0.70821,0.765531,0.813043,0.715849,0.846079


In [30]:
%%time
forest = RandomForestClassifier(n_estimators=50, max_depth=8, n_jobs=12, random_state=0)
forest.fit(X_train, y_train)
pred_proba = forest.predict_proba(X_test)[:, 1]
pred = forest.predict(X_test)
all_model_info['forest'] = get_metrics(y_test, pred, pred_proba)
get_info()

CPU times: user 6min 46s, sys: 2.25 s, total: 6min 48s
Wall time: 39.7 s


Unnamed: 0,acc,roc_auc,ap,precision,recall
const,0.595775,0.5,0.595775,0.595775,1.0
lin_model,0.707778,0.756269,0.801462,0.72479,0.821406
als_model,0.499735,0.499208,0.594793,0.595444,0.500069
SGD,0.707361,0.765357,0.816738,0.713915,0.849045
cat_boost,0.70821,0.765531,0.813043,0.715849,0.846079
forest,0.707946,0.768641,0.821779,0.71312,0.852906


In [31]:
with open('graph_math') as f:
    graph_data = json.loads(f.read())

G = json_graph.node_link_graph(graph_data)
classes = set(" ".join(a.strip().split()) for a in G.nodes)

In [32]:
len(G.nodes),len(skills)
diff = classes.difference(skills)
for v in diff:
    for in_v, _ in G.in_edges(v):
        for _, out_v in G.out_edges(v):
            if in_v == v:
                print('aaaaaaa')
            if out_v == v:
                print('aaaaaaa')
            if not G.has_edge(in_v, out_v):
                G.add_edge(in_v, out_v)
    G.remove_node(v)

In [33]:
df = df.sort_values(by=['attempts_date_created']).reset_index(drop=True)
df['att_id'] = df.reset_index()['index']

In [34]:
from collections import defaultdict

problem_to_skill = defaultdict(list)
skill_info.apply(lambda x: problem_to_skill[x['problem_id']].append(x['skills']), axis=1)
problem_to_skill = dict(problem_to_skill)

In [35]:
from dataclasses import dataclass, field

@dataclass
class StudInfo:
    skill_attemps: defaultdict[str, set] = field(default_factory=lambda:defaultdict(set))
    skill_solved: defaultdict[str, set] = field(default_factory=lambda:defaultdict(set))

In [36]:
stud_info = defaultdict(StudInfo)

In [37]:
def compute_stud_info(row):     
    problem_id = row['problem_id']
    stud_id = row['cls_student_id']
    unic_id = row['att_id']
    
    solved_cur = set()
    attemp_cur = set()
    solved_rec = set()
    attemp_rec = set()
    
    for skill in problem_to_skill[problem_id]:
        attemp_cur = attemp_cur.union(stud_info[stud_id].skill_attemps[skill])
        solved_cur = solved_cur.union(stud_info[stud_id].skill_solved[skill])
        
        for _, prerec_skill in G.out_edges(skill):
            solved_rec = solved_rec.union(stud_info[stud_id].skill_solved[prerec_skill])
            attemp_rec = attemp_rec.union(stud_info[stud_id].skill_attemps[prerec_skill])
            
            for _, prepre_skill in G.out_edges(prerec_skill):
                solved_rec = solved_rec.union(stud_info[stud_id].skill_solved[prepre_skill])
                attemp_rec = attemp_rec.union(stud_info[stud_id].skill_attemps[prepre_skill])
        
        stud_info[stud_id].skill_attemps[skill].add(unic_id)
        if row['is_solved']:
            stud_info[stud_id].skill_solved[skill].add(unic_id)
            
    solved_cur.discard(unic_id)
    attemp_cur.discard(unic_id)
    solved_rec.discard(unic_id)
    attemp_rec.discard(unic_id)
    
    res = {
        'solved_cur': len(solved_cur),
        'attemp_cur': len(attemp_cur),
        'solved_rec': len(solved_rec),
        'attemp_rec': len(attemp_rec),
    }
    return res

In [38]:
%%time
df['prerec_info'] = df.apply(compute_stud_info, axis=1)

CPU times: user 5min 50s, sys: 3.72 s, total: 5min 53s
Wall time: 5min 53s


In [39]:
df['solved_cur'] =  df['prerec_info'].apply(lambda x: x['solved_cur'] / x['attemp_cur'] if x['attemp_cur'] else 0.5)
df['attemp_cur'] =  df['prerec_info'].apply(lambda x: x['attemp_cur'])
df['solved_rec'] =  df['prerec_info'].apply(lambda x: x['solved_rec'] / x['attemp_rec'] if x['attemp_rec'] else 0.5)
df['attemp_rec'] =  df['prerec_info'].apply(lambda x: x['attemp_rec'])

In [40]:
not_need_cols = ['cls_student_id', 'problem_id', 'attempts_date_created', 
                 'is_solved', 'solved_rec', 'attemp_rec', 'prerec_info', 'att_id', 'execution_time']
df_train, df_test = df.iloc[df_train.index], df.iloc[df_test.index]
df_train = df_train.dropna()
df_test = df_test.dropna()
X_train, y_train = df_train.drop(not_need_cols, axis=1), df_train['is_solved']
X_test, y_test = df_test.drop(not_need_cols, axis=1), df_test['is_solved']

In [41]:
X_train.head(2)

Unnamed: 0,assignment_level,mean_problem_solved,execution_time_problem,difficulty,has_diff_info,mean_student_solved,execution_time_student,solved_cur,attemp_cur
2535889,4,0.380873,69.0,0.662803,0.0,0.623656,152.5,0.5,0
3250975,4,0.512875,562.0,0.4473,0.0,0.671233,1054.5,1.0,1


In [42]:
clf = make_pipeline(StandardScaler(), SGDClassifier(max_iter=1000, tol=1e-3, loss='log_loss'))
clf.fit(X_train, y_train)

pred_proba = clf.predict_proba(X_test)[:,1]
pred = clf.predict(X_test)
all_model_info['SGD_cur_info'] = get_metrics(y_test, pred, pred_proba)
get_info()

Unnamed: 0,acc,roc_auc,ap,precision,recall
const,0.595775,0.5,0.595775,0.595775,1.0
lin_model,0.707778,0.756269,0.801462,0.72479,0.821406
als_model,0.499735,0.499208,0.594793,0.595444,0.500069
SGD,0.707361,0.765357,0.816738,0.713915,0.849045
cat_boost,0.70821,0.765531,0.813043,0.715849,0.846079
forest,0.707946,0.768641,0.821779,0.71312,0.852906
SGD_cur_info,0.74471,0.802405,0.86631,0.767677,0.852765


In [43]:
cat_class = CatBoostClassifier(iterations=1000, depth=10,
                           task_type="GPU",
                           devices='0:1', logging_level='Silent')
cat_class.fit(X_train, y_train)
pred_proba = cat_class.predict_proba(X_test)[:,1]
pred = cat_class.predict(X_test)
all_model_info['cat_boost_cur_info'] = get_metrics(y_test, pred, pred_proba)
get_info()



Unnamed: 0,acc,roc_auc,ap,precision,recall
const,0.595775,0.5,0.595775,0.595775,1.0
lin_model,0.707778,0.756269,0.801462,0.72479,0.821406
als_model,0.499735,0.499208,0.594793,0.595444,0.500069
SGD,0.707361,0.765357,0.816738,0.713915,0.849045
cat_boost,0.70821,0.765531,0.813043,0.715849,0.846079
forest,0.707946,0.768641,0.821779,0.71312,0.852906
SGD_cur_info,0.74471,0.802405,0.86631,0.767677,0.852765
cat_boost_cur_info,0.751559,0.812226,0.874222,0.776401,0.850507


In [44]:
%%time
forest = RandomForestClassifier(n_estimators=50, max_depth=8, n_jobs=12, random_state=0)
forest.fit(X_train, y_train)
pred_proba = forest.predict_proba(X_test)[:, 1]
pred = forest.predict(X_test)
all_model_info['forest_cur_info'] = get_metrics(y_test, pred, pred_proba)
get_info()

CPU times: user 8min 17s, sys: 1.65 s, total: 8min 19s
Wall time: 47.8 s


Unnamed: 0,acc,roc_auc,ap,precision,recall
const,0.595775,0.5,0.595775,0.595775,1.0
lin_model,0.707778,0.756269,0.801462,0.72479,0.821406
als_model,0.499735,0.499208,0.594793,0.595444,0.500069
SGD,0.707361,0.765357,0.816738,0.713915,0.849045
cat_boost,0.70821,0.765531,0.813043,0.715849,0.846079
forest,0.707946,0.768641,0.821779,0.71312,0.852906
SGD_cur_info,0.74471,0.802405,0.86631,0.767677,0.852765
cat_boost_cur_info,0.751559,0.812226,0.874222,0.776401,0.850507
forest_cur_info,0.746711,0.806212,0.869476,0.770218,0.852068


In [45]:
not_need_cols = ['cls_student_id', 'problem_id', 'attempts_date_created', 
                 'is_solved', 'prerec_info', 'att_id', 'execution_time']
df_train, df_test = df.iloc[df_train.index], df.iloc[df_test.index]
df_train = df_train.dropna()
df_test = df_test.dropna()
X_train, y_train = df_train.drop(not_need_cols, axis=1), df_train['is_solved']
X_test, y_test = df_test.drop(not_need_cols, axis=1), df_test['is_solved']

In [46]:
clf = make_pipeline(StandardScaler(), SGDClassifier(max_iter=1000, tol=1e-3, loss='log_loss'))
clf.fit(X_train, y_train)

pred_proba = clf.predict_proba(X_test)[:,1]
pred = clf.predict(X_test)
all_model_info['SGD_rec_info'] = get_metrics(y_test, pred, pred_proba)
get_info()

Unnamed: 0,acc,roc_auc,ap,precision,recall
const,0.595775,0.5,0.595775,0.595775,1.0
lin_model,0.707778,0.756269,0.801462,0.72479,0.821406
als_model,0.499735,0.499208,0.594793,0.595444,0.500069
SGD,0.707361,0.765357,0.816738,0.713915,0.849045
cat_boost,0.70821,0.765531,0.813043,0.715849,0.846079
forest,0.707946,0.768641,0.821779,0.71312,0.852906
SGD_cur_info,0.74471,0.802405,0.86631,0.767677,0.852765
cat_boost_cur_info,0.751559,0.812226,0.874222,0.776401,0.850507
forest_cur_info,0.746711,0.806212,0.869476,0.770218,0.852068
SGD_rec_info,0.74488,0.802567,0.866534,0.7677,0.853103


In [47]:
torch.cuda.empty_cache()

In [48]:
cat_class = CatBoostClassifier(iterations=1000, depth=10,
                           task_type="GPU",
                           devices='0:1', logging_level='Silent')
cat_class.fit(X_train, y_train)
pred_proba = cat_class.predict_proba(X_test)[:,1]
pred = cat_class.predict(X_test)
all_model_info['cat_boost_rec_info'] = get_metrics(y_test, pred, pred_proba)
get_info()

Unnamed: 0,acc,roc_auc,ap,precision,recall
const,0.595775,0.5,0.595775,0.595775,1.0
lin_model,0.707778,0.756269,0.801462,0.72479,0.821406
als_model,0.499735,0.499208,0.594793,0.595444,0.500069
SGD,0.707361,0.765357,0.816738,0.713915,0.849045
cat_boost,0.70821,0.765531,0.813043,0.715849,0.846079
forest,0.707946,0.768641,0.821779,0.71312,0.852906
SGD_cur_info,0.74471,0.802405,0.86631,0.767677,0.852765
cat_boost_cur_info,0.751559,0.812226,0.874222,0.776401,0.850507
forest_cur_info,0.746711,0.806212,0.869476,0.770218,0.852068
SGD_rec_info,0.74488,0.802567,0.866534,0.7677,0.853103


In [49]:
%%time
forest = RandomForestClassifier(n_estimators=50, max_depth=8, n_jobs=12, random_state=0)
forest.fit(X_train, y_train)
pred_proba = forest.predict_proba(X_test)[:, 1]
pred = forest.predict(X_test)
all_model_info['forest_rec_info'] = get_metrics(y_test, pred, pred_proba)
get_info()

CPU times: user 8min 17s, sys: 1.67 s, total: 8min 19s
Wall time: 48.9 s


Unnamed: 0,acc,roc_auc,ap,precision,recall
const,0.595775,0.5,0.595775,0.595775,1.0
lin_model,0.707778,0.756269,0.801462,0.72479,0.821406
als_model,0.499735,0.499208,0.594793,0.595444,0.500069
SGD,0.707361,0.765357,0.816738,0.713915,0.849045
cat_boost,0.70821,0.765531,0.813043,0.715849,0.846079
forest,0.707946,0.768641,0.821779,0.71312,0.852906
SGD_cur_info,0.74471,0.802405,0.86631,0.767677,0.852765
cat_boost_cur_info,0.751559,0.812226,0.874222,0.776401,0.850507
forest_cur_info,0.746711,0.806212,0.869476,0.770218,0.852068
SGD_rec_info,0.74488,0.802567,0.866534,0.7677,0.853103


In [50]:
inf = get_info()

In [51]:
inf.index

Index(['const', 'lin_model', 'als_model', 'SGD', 'cat_boost', 'forest',
       'SGD_cur_info', 'cat_boost_cur_info', 'forest_cur_info', 'SGD_rec_info',
       'cat_boost_rec_info', 'forest_rec_info'],
      dtype='object')

In [52]:
inf

Unnamed: 0,acc,roc_auc,ap,precision,recall
const,0.595775,0.5,0.595775,0.595775,1.0
lin_model,0.707778,0.756269,0.801462,0.72479,0.821406
als_model,0.499735,0.499208,0.594793,0.595444,0.500069
SGD,0.707361,0.765357,0.816738,0.713915,0.849045
cat_boost,0.70821,0.765531,0.813043,0.715849,0.846079
forest,0.707946,0.768641,0.821779,0.71312,0.852906
SGD_cur_info,0.74471,0.802405,0.86631,0.767677,0.852765
cat_boost_cur_info,0.751559,0.812226,0.874222,0.776401,0.850507
forest_cur_info,0.746711,0.806212,0.869476,0.770218,0.852068
SGD_rec_info,0.74488,0.802567,0.866534,0.7677,0.853103


In [53]:
inf.loc[['cat_boost_cur_info', 'SGD_rec_info', 'cat_boost_rec_info', 'forest_rec_info']]

Unnamed: 0,acc,roc_auc,ap,precision,recall
cat_boost_cur_info,0.751559,0.812226,0.874222,0.776401,0.850507
SGD_rec_info,0.74488,0.802567,0.866534,0.7677,0.853103
cat_boost_rec_info,0.751595,0.812416,0.874382,0.776384,0.850621
forest_rec_info,0.746121,0.805168,0.868472,0.769609,0.851981


In [64]:
inf.loc[['cat_boost', 'SGD_cur_info', 'cat_boost_cur_info', 'forest_cur_info']]

Unnamed: 0,acc,roc_auc,ap,precision,recall
cat_boost,0.709326,0.767129,0.814369,0.716937,0.84621
SGD_cur_info,0.744854,0.802405,0.866254,0.769765,0.848792
cat_boost_cur_info,0.752191,0.813039,0.874639,0.77651,0.851699
forest_cur_info,0.746712,0.805855,0.868967,0.770125,0.852262


In [63]:
inf.loc[['lin_model', 'SGD', 'cat_boost', 'forest']]

Unnamed: 0,acc,roc_auc,ap,precision,recall
lin_model,0.706446,0.755585,0.800646,0.723205,0.821808
SGD,0.706654,0.764553,0.815848,0.712428,0.851217
cat_boost,0.709326,0.767129,0.814369,0.716937,0.84621
forest,0.707874,0.768714,0.821736,0.711912,0.856111


In [39]:
inf

Unnamed: 0,acc,roc_auc,ap,precision,recall
const,0.595775,0.5,0.595775,0.595775,1.0
SGD,0.706652,0.765186,0.816386,0.711504,0.853819
cat_boost,0.709326,0.767129,0.814369,0.716937,0.84621
SGD_cur_info,0.744856,0.802422,0.866351,0.770232,0.847843
cat_boost_cur_info,0.752235,0.813028,0.874625,0.776498,0.85182
SGD_rec_info,0.744746,0.802354,0.866195,0.768051,0.852071
cat_boost_rec_info,0.752186,0.813259,0.874819,0.776538,0.851632


In [54]:
stud_id = 1450857

In [55]:
sdf = df[df['cls_student_id'] == stud_id].copy()

In [58]:
def compute_stud_info_for_one_stud(row):     
    problem_id = row['problem_id']
    unic_id = row['att_id']
    
    solved_cur = set()
    attemp_cur = set()
    solved_rec = set()
    attemp_rec = set()
    
    for skill in problem_to_skill[problem_id]:
#         attemp_cur = attemp_cur.union(stud_info.skill_attemps[skill])
#         solved_cur = solved_cur.union(stud_info.skill_solved[skill])
        
#         for _, prerec_skill in G.out_edges(skill):
#             solved_rec = solved_rec.union(stud_info.skill_solved[prerec_skill])
#             attemp_rec = attemp_rec.union(stud_info.skill_attemps[prerec_skill])
            
#             for _, prepre_skill in G.out_edges(prerec_skill):
#                 solved_rec = solved_rec.union(stud_info.skill_solved[prepre_skill])
#                 attemp_rec = attemp_rec.union(stud_info.skill_attemps[prepre_skill])
        
        stud_info.skill_attemps[skill].add(unic_id)
        if row['is_solved']:
            stud_info.skill_solved[skill].add(unic_id)
            
#     solved_cur.discard(unic_id)
#     attemp_cur.discard(unic_id)
#     solved_rec.discard(unic_id)
#     attemp_rec.discard(unic_id)
    
#     res = {
#         'solved_cur': len(solved_cur),
#         'attemp_cur': len(attemp_cur),
#         'solved_rec': len(solved_rec),
#         'attemp_rec': len(attemp_rec),
#     }
#     return res

In [62]:
%%time
stud_info = StudInfo()
_ = sdf.apply(compute_stud_info_for_one_stud, axis=1)

CPU times: user 82.1 ms, sys: 49 µs, total: 82.1 ms
Wall time: 80.8 ms


In [63]:
s_inf = students_info.loc[[stud_id]]

In [64]:
s_inf = s_inf.reset_index()

In [65]:
train_for_student = problem_info.copy()

In [66]:
for col in s_inf.columns:
    pr_col = col
    if col in train_for_student.columns:
        pr_col =  col + "_stud"
    train_for_student[pr_col] = s_inf[col][0]

In [68]:
train_for_student = train_for_student.reset_index()

In [72]:
print(train_for_student.shape)
train_for_student.head(2)

(18360, 8)


Unnamed: 0,problem_id,mean_problem_solved,execution_time,difficulty,has_diff_info,cls_student_id,mean_student_solved,execution_time_stud
0,31964,0.666667,417.0,0.336907,0,1450857,0.353416,397.0
1,31980,0.695652,71.0,0.079576,0,1450857,0.353416,397.0


In [73]:
def compute_stud_info_for_one_stud_add_inf(row):     
    problem_id = row['problem_id']
    
    solved_cur = set()
    attemp_cur = set()
    solved_rec = set()
    attemp_rec = set()
    
    for skill in problem_to_skill[problem_id]:
        attemp_cur = attemp_cur.union(stud_info.skill_attemps[skill])
        solved_cur = solved_cur.union(stud_info.skill_solved[skill])
        
        for _, prerec_skill in G.out_edges(skill):
            solved_rec = solved_rec.union(stud_info.skill_solved[prerec_skill])
            attemp_rec = attemp_rec.union(stud_info.skill_attemps[prerec_skill])
            
            for _, prepre_skill in G.out_edges(prerec_skill):
                solved_rec = solved_rec.union(stud_info.skill_solved[prepre_skill])
                attemp_rec = attemp_rec.union(stud_info.skill_attemps[prepre_skill])            
   
    res = {
        'solved_cur': len(solved_cur),
        'attemp_cur': len(attemp_cur),
        'solved_rec': len(solved_rec),
        'attemp_rec': len(attemp_rec),
    }
    return res

In [75]:
%%time
train_for_student['g_inf'] = train_for_student.apply(compute_stud_info_for_one_stud_add_inf, axis=1)

CPU times: user 1.7 s, sys: 67 µs, total: 1.7 s
Wall time: 1.7 s


In [78]:
train_for_student['solved_cur'] =  train_for_student['g_inf'].apply(lambda x: x['solved_cur'] / x['attemp_cur'] if x['attemp_cur'] else 0.5)
train_for_student['attemp_cur'] =  train_for_student['g_inf'].apply(lambda x: x['attemp_cur'])
train_for_student['solved_rec'] =  train_for_student['g_inf'].apply(lambda x: x['solved_rec'] / x['attemp_rec'] if x['attemp_rec'] else 0.5)
train_for_student['attemp_rec'] =  train_for_student['g_inf'].apply(lambda x: x['attemp_rec'])

In [79]:
train_for_student.head(2)

Unnamed: 0,problem_id,mean_problem_solved,execution_time,difficulty,has_diff_info,cls_student_id,mean_student_solved,execution_time_stud,g_inf,solved_cur,attemp_cur,solved_rec,attemp_rec
0,31964,0.666667,417.0,0.336907,0,1450857,0.353416,397.0,"{'solved_cur': 1, 'attemp_cur': 2, 'solved_rec...",0.5,2,0.419689,193
1,31980,0.695652,71.0,0.079576,0,1450857,0.353416,397.0,"{'solved_cur': 2, 'attemp_cur': 4, 'solved_rec...",0.5,4,0.583333,48


In [80]:
X_train.columns

Index(['assignment_level', 'mean_problem_solved', 'execution_time_problem',
       'difficulty', 'has_diff_info', 'mean_student_solved',
       'execution_time_student', 'solved_cur', 'attemp_cur', 'solved_rec',
       'attemp_rec'],
      dtype='object')

In [85]:
sdf['assignment_level'].iloc[0]

3

In [86]:
train_for_student['assignment_level'] = 3

In [91]:
train_for_student['execution_time_problem'] = train_for_student['execution_time']
train_for_student['execution_time_student'] = train_for_student['execution_time_stud']

In [92]:
set(X_train.columns).difference(train_for_student.columns)

set()

In [93]:
X_student_test = train_for_student[X_train.columns]

In [95]:
pred_proba = cat_class.predict_proba(X_student_test)[:,1]

In [97]:
train_for_student['proba'] = pred_proba

In [137]:
res = train_for_student[(train_for_student['attemp_cur'] == 0) & (train_for_student['attemp_rec'] > 5)]

In [139]:
student_info = res[['problem_id', 'proba']].sort_values(by=['proba'])

In [156]:
def get_bad_skills(stud_info):
    bad_skills = defaultdict(int)
    for c in stud_info.iloc[:30].iloc:
        for skill in problem_to_skill[c['problem_id']]:
            if len(G.out_edges(skill)):
                bad_skills[skill] += 1
    bad_skills = dict(bad_skills)
    global f
    f = bad_skills
    
    return sorted(bad_skills, key=lambda x: bad_skills[x], reverse=True)[:5]

In [157]:
get_bad_skills(student_info)

['воспроизводить циклический алгоритм письменного деления (последовательность действий) n-значных чисел (n>=3) Повторяются выделенные операции 1) Выделяю первое неполное делимое (и называю количество цифр в частном) 2) Делю 3) Умножаю 4) Вычитаю 5) сравниваю остаток с делителем',
 'выражать n-значные числа (где n>=4), заданные в единицах 1 разряда, в единицах других разрядов (например: 2300 = 23 сот.) и наоборот (например, 25 сот.= 2500)',
 'находить общее число единиц какого-либо разряда (например, число 82345 содержит всего 8дес. тыс., 82 ед. тыс., 823 сот., 8234 дес., 82345 ед.)',
 'извлекать и анализировать необходимую информацию из двумерной таблицы без числовых данных (наличие указанных свойств отмечено знаками) и анализировать ее',
 'подбирать объекты по описанию их взаимного расположения в пространстве с точки зрения ребенка - наблюдателя']

In [160]:
student_info.iloc[:30]

Unnamed: 0,problem_id,proba
13546,187722,0.014179
13547,187729,0.014971
3034,88382,0.018661
18134,231447,0.021016
17975,229919,0.021051
2255,81735,0.023566
17451,224581,0.024777
2887,87382,0.026507
16559,215311,0.029043
12631,181279,0.030243


In [158]:
f

{'извлекать и анализировать необходимую информацию из двумерной таблицы без числовых данных (наличие указанных свойств отмечено знаками) и анализировать ее': 2,
 'подбирать объекты по описанию их взаимного расположения в пространстве с точки зрения ребенка - наблюдателя': 2,
 'выражать n-значные числа (где n>=4), заданные в единицах 1 разряда, в единицах других разрядов (например: 2300 = 23 сот.) и наоборот (например, 25 сот.= 2500)': 3,
 'составлять план решения задачи на основе рассуждений от данных к вопросу': 1,
 'решать "классические" задачи на нахождение неизвестных по двум разностям': 1,
 'использовать транспортир для измерения величины угла': 1,
 'извлекать необходимую информацию из текста': 1,
 'использовать сочетательное свойство умножения с опорой на модели': 1,
 'находить общее число единиц какого-либо разряда (например, число 82345 содержит всего 8дес. тыс., 82 ед. тыс., 823 сот., 8234 дес., 82345 ед.)': 3,
 'воспроизводить циклический алгоритм письменного деления (последо

In [106]:
problem_to_skill[187729]

['извлекать и анализировать необходимую информацию из двумерной таблицы без числовых данных (наличие указанных свойств отмечено знаками) и анализировать ее',
 'дополнять объект с учетом нескольких условий',
 'подбирать объекты по описанию их взаимного расположения в пространстве с точки зрения ребенка - наблюдателя']

In [151]:
G.out_edges('решать задачи на упорядочивание множеств /величин')

OutEdgeDataView([('решать задачи на упорядочивание множеств /величин', 'решать логические задачи разных типов')])

In [150]:
problem_to_skill

{78236: ['решать задачи на нахождение числа элементов пересечения, разности, объединения конечных множеств (например, в классе из 25 человек 12 человек увлекаются спортом, 7 человек увлекаются музыкой, причем 3 человека увлекаются музыкой и спортом. Сколько ребят еще не имеют увлечений?)',
  'сравнивать/упорядочивать числа меньшие или равные 10, на основе о знаний о месте чисел в натуральной последовательности (например, 5 меньше 7 , так как его при счете называют раньше)',
  'решать задачи на упорядочивание множеств /величин',
  'решать задачи на нахождение числа элементов пересечения, разности, объединения конечных множеств (например, в классе из 25 человек 12 человек увлекаются спортом, 7 человек увлекаются музыкой, причем 3 человека увлекаются музыкой и спортом. Сколько ребят еще не имеют увлечений?)',
  'сравнивать/упорядочивать числа меньшие или равные 10, на основе о знаний о месте чисел в натуральной последовательности (например, 5 меньше 7 , так как его при счете называют рань

In [104]:
problem_to_skill[187722]

['извлекать и анализировать необходимую информацию из двумерной таблицы без числовых данных (наличие указанных свойств отмечено знаками) и анализировать ее',
 'дополнять объект с учетом нескольких условий',
 'подбирать объекты по описанию их взаимного расположения в пространстве с точки зрения ребенка - наблюдателя']