In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm, trange

import matplotlib.pyplot as plt
import seaborn as sns
import wandb
%matplotlib inline

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE

'cuda'

In [2]:
date_cols = ['attempts_date_created', 'cl_date_assignment', 'cls_date_created']
df = pd.read_csv('wide_math.csv', parse_dates=date_cols, index_col=False)

In [3]:
# df['attempts_date_created'] = df['attempts_date_created'].dt.floor('s')
# df['cls_date_created'] = df['cls_date_created'].dt.floor('s')
# df.drop(['Unnamed: 0.1', 'Unnamed: 0'], axis=1).to_csv('wide_math.csv', index=False)

In [4]:
df.head(2)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,assignment_level,attempts_date_created,cl_date_assignment,cl_id,cls_date_created,cls_student_id,course_id,is_solved,problem_id,subject_slug,team_id,team_level,tp_teacher_id
0,0,0,3,2022-03-08 11:02:37+03:00,2022-03-05 13:00:00+03:00,71374307,2022-03-08 10:52:43+03:00,1650006,5096626,1,97304,mathematics,82516,4,520541
1,1,1,3,2022-03-08 11:02:49+03:00,2021-09-02 14:23:50+03:00,70879797,2022-03-08 11:02:08+03:00,1494142,5099807,1,227546,mathematics,69290,4,615530


- assignment_level - класс, когда он решал
- attempts_date_created - время поптыки
- cl_date_assigment - время выдачи задачи
- cl-id - id урока, который выдан студентам этого класса. урок - набор карточек, который выдал учитель.
- cls_date_created - когда приступил к уроку
- cls_student_id - id ученика
- course_id - id курса (предмет-учитель-assignment_level)
- is_solved - решена ли задача
- problem-id - id задачи
- subject_slug - предмет
- team-id - id класса
- team_level - текущик класс ученика
- tp_teacher_id - id препода

In [5]:
# mark_up_math = pd.read_excel('markup_math.xlsx')
# mark_up_math= mark_up_math[1:]
# mark_up_math.head(4) # очень грязные данные((((

In [6]:
med_date = df['attempts_date_created'].median()
med_user = df['cls_student_id'].median()

In [7]:
post_df = df[df['attempts_date_created'] > med_date]
before_df = df[df['attempts_date_created'] <= med_date]

In [8]:
len(set(post_df['problem_id']).difference(before_df['problem_id'])) / len(set(post_df['problem_id']))

0.357354172481161

Если отсекать по датасет по времени, то есть куча задач, про которые мы не знаем, так как нет информации прошлых лет их решения. Предлагаю делить датасет по времени и пользователям и взять 1/4 от датасета для валидации

In [9]:
test_index = (df['cls_student_id'] > med_user) & (df['attempts_date_created'] > med_date)
train_index = ~test_index

In [10]:
def get_matr(data):
    data = data.sort_values(by=['problem_id', 'cls_student_id', 'attempts_date_created'])
    return data.groupby(by=['problem_id', 'cls_student_id'], as_index=False)['is_solved'].first()

In [11]:
df_train = get_matr(df[train_index])
df_test = get_matr(df[test_index])

print(df_train.shape)
print(df_test.shape)
df_train.head(3)

(3003185, 3)
(1005800, 3)


Unnamed: 0,problem_id,cls_student_id,is_solved
0,20000,1550166,0
1,20000,1550167,1
2,20000,1550169,1


In [12]:
df_train['is_solved'].sum() / df_train.shape[0], df_test['is_solved'].sum() / df_test.shape[0]

(0.7478247260824757, 0.7262736130443428)

Есть разбаланс классов. Ученики чаще решают с первого раза, чем не решают.

In [13]:
train_problems = set(df_train['problem_id'])
test_problems = set(df_test['problem_id'])


len(test_problems.difference(train_problems)) / len(test_problems)

0.1368838357393971

У нас ещё осталось 13% задач в тестовой выборке, про которые мы ничего не знаем
Пока есть предложение на них забить. Тем более это логично забить на задачи, про которые нам ещё ничего не известно.
(в дальнейшем есть идея для них эмбединги получать из графа)

In [14]:
train_students = set(df_train['cls_student_id'])
test_students = set(df_test['cls_student_id'])

len(test_students.difference(train_students)) / len(test_students)

0.0297008547008547

Есть 3 процента пользователей, которые не попали в тестовую выборку. Пока что от них тоже избавимся. У меня есть идеи, как более качественно поделить выборку, но пока напишем базу.

In [15]:
df_test = df_test[df_test['problem_id'].apply(lambda x: x in train_problems)]
df_test = df_test[df_test['cls_student_id'].apply(lambda x: x in train_students)]
df_test.shape

(943531, 3)

In [16]:
problems = list(train_problems)
problem_to_index = {problem_id: i for i, problem_id in enumerate(problems)}
students = list(train_students)
student_to_index = {student_id: i for i, student_id in enumerate(students)}

Теперь напишем стандарные штуки для обучения (сворую из другого курса).

In [17]:
class StudentsProblemsDataset(Dataset):
    def __init__(self, data, student_to_index=student_to_index, problem_to_index=problem_to_index):
        students = data['cls_student_id'].apply(lambda x:student_to_index[x]).values
        problems = data['problem_id'].apply(lambda x:problem_to_index[x]).values
        
        self.students = torch.tensor(students)
        self.problems = torch.tensor(problems)
        self.solved = torch.tensor(data['is_solved'].to_numpy())

    def __len__(self):
        return len(self.students)

    def __getitem__(self, idx):
        return self.students[idx], self.problems[idx], self.solved[idx]

In [18]:
train_dataset = StudentsProblemsDataset(df_train)
test_dataset = StudentsProblemsDataset(df_test)

In [26]:
from IPython.display import clear_output

In [27]:
def get_TP(y_pred, y_true, target=1):
    return ((y_pred == target) * (y_true == target)).sum()

def get_FP(y_pred, y_true, target=1):
    return ((y_pred == target) * (y_true != target)).sum()

def get_FN(y_pred, y_true, target=1):
    return ((y_pred != target) * (y_true != target)).sum()

In [33]:
def get_precision(y_pred, y_true, target=1):
    TP = get_TP(y_pred, y_true)
    FP = get_FP(y_pred, y_true)
    
    return TP / (TP + FP)

def get_recall(y_pred, y_true, target=1):
    TP = get_TP(y_pred, y_true)
    FN = get_FN(y_pred, y_true)
    
    return TP / (TP + FN)

@torch.no_grad()
def evaluate(model, dataloader, criterion, target=1, device=DEVICE):
    TP = 0
    FP = 0
    FN = 0
    loss = 0
    
    for st, pr, y_true in dataloader:
        st = st.to(DEVICE)
        pr = pr.to(DEVICE)
        y_true = y_true.to(DEVICE)
        
        y_pred= model(st, pr)
        
        loss += criterion(y_pred, y_true).item()
        
        y_pred = torch.argmin(y_pred, dim=1)
        
        TP += get_TP(y_pred, y_true, target)
        FP += get_FP(y_pred, y_true, target)
        FN += get_FN(y_pred, y_true, target)
    
    precision = (TP / (TP + FP)).item()
    recall = (TP / (TP + FN)).item()
    return loss, precision, recall

def plot_results(train_loss, test_loss, train_precision, test_precision, train_recall, test_recall):
    clear_output(True)
    
    print(f"Cur test loss:{test_loss[-1]:.3}")
    print(f"Cur test precision:{test_precision[-1]:.3}")
    print(f"Cur test recall:{test_recall[-1]:.3}")
    
    x = list(range(len(train_loss)))
    
    plt.figure(figsize=(10, 6))
    
    plt.subplot(3, 1, 1)
    plt.title('loss')
    plt.plot(x, train_loss, label='train')
    plt.plot(x, test_loss, label='test')
    plt.ylim(bottom=0)
    plt.grid()
    plt.legend()
    
    plt.subplot(3, 1, 2)
    plt.title('precison')
    plt.plot(x, train_precision, label='train')    
    plt.plot(x, test_precision, label='test')
    plt.ylim((0, 1))
    plt.grid()
    plt.legend()
    
    plt.subplot(3, 1, 3)    
    plt.plot(x, train_recall, label='train')
    plt.plot(x, test_recall, label='test')
    plt.ylim((0, 1))
    plt.grid()
    plt.legend()
        
    plt.show()

In [34]:
class KindOfAlsModel(nn.Module):
    def __init__(self, n_students=len(students), n_problems=len(problems), emb_size=16):
        super().__init__()
        
        self.stud_embed = nn.Embedding(n_students, emb_size)
        self.problem_embed = nn.Embedding(n_problems, emb_size)
        self.tanh = nn.Tanh()
        
    def forward(self, students, problems):
        students = self.stud_embed(students)
        problems = self.problem_embed(problems)
        
        solved = self.tanh(torch.mul(students, problems).sum(dim=1)) / 2 + 0.5
        not_solved = 1 - solved
        return torch.stack([1-solved, solved]).transpose(-2, -1)

In [35]:
class MyModel(nn.Module):
    def __init__(self, n_students=len(students), n_problems=len(problems), emb_size=16):
        super().__init__()
        
        self.stud_embed = nn.Embedding(n_students, emb_size)
        self.problem_embed = nn.Embedding(n_problems, emb_size)
        self.tanh = nn.Tanh()
        
    def forward(self, students, problems):
        students = self.stud_embed(students)
        problems = self.problem_embed(problems)
        
        solved = self.tanh(torch.mul(students, problems).sum(dim=1)) / 2 + 0.5
        not_solved = 1 - solved
        return torch.stack([1-solved, solved]).transpose(-2, -1)

In [36]:
model = KindOfAlsModel().to(DEVICE)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

wandb.init(project="ysda-school-simple-als")


batch_size=20000
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=12)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=12)

VBox(children=(Label(value='0.001 MB of 0.004 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.148880…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668314083411438, max=1.0…

In [37]:
eval_every = 5
epochs = 200

wandb.config = {
  "learning_rate": 0.001,
  "epochs": epochs,
  "batch_size": batch_size,
}


for epoch in trange(epochs):
    for st, pr, target in train_dataloader:
        st = st.to(DEVICE)
        pr = pr.to(DEVICE)
        target = target.to(DEVICE)
        
        y_pred= model(st, pr)
        
        loss = criterion(y_pred, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    if (epoch + 1) % eval_every == 0:
        train_loss, train_precision, train_recall = evaluate(model, train_dataloader, criterion, target=1)
        test_loss, test_precision, test_precision = evaluate(model, test_dataloader, criterion, target=1)
        
        wandb.log({"train/train_loss": train_loss})
        wandb.log({"train/train_precision_t1": train_precision})
        wandb.log({"train/train_recall_t1": train_recall})
        wandb.log({"test/test_loss_t1": train_loss})
        wandb.log({"test/test_precision_t1": test_precision})
        wandb.log({"test/test_recall_t1": test_precision})
        
        _, train_precision, train_recall = evaluate(model, train_dataloader, criterion, target=0)
        _, test_precision, test_precision = evaluate(model, test_dataloader, criterion, target=0)
        
        wandb.log({"train/train_precision_t0": train_precision})
        wandb.log({"train/train_recall_t0": train_recall})
        wandb.log({"test/test_precision_t0": test_precision})
        wandb.log({"test/test_recall_t0": test_precision})

  0%|          | 0/200 [00:00<?, ?it/s]

In [38]:
model = KindOfAlsModel().to(DEVICE)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

wandb.init(project="ysda-school-emb-lin")

batch_size=20000
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=12)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=12)

VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.133438…

0,1
test/test_loss_t1,██▇▇▆▆▅▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test/test_precision_t0,▁▁▁▁▁▂▂▃▃▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇████████████████
test/test_precision_t1,█████▇▇▆▆▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test/test_recall_t0,▁▁▁▁▁▂▂▃▃▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇████████████████
test/test_recall_t1,█████▇▇▆▆▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/train_loss,██▇▇▆▆▅▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/train_precision_t0,█▇▆▆▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
train/train_precision_t1,██▇▇▇▇▆▆▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
train/train_recall_t0,▁▁▁▁▁▂▂▃▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████████
train/train_recall_t1,█████▇▇▆▅▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
test/test_loss_t1,67.83061
test/test_precision_t0,0.50297
test/test_precision_t1,0.49703
test/test_recall_t0,0.50297
test/test_recall_t1,0.49703
train/train_loss,67.83061
train/train_precision_t0,0.10837
train/train_precision_t1,0.20819
train/train_recall_t0,0.66141
train/train_recall_t1,0.33859


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669041416510783, max=1.0…

In [39]:
eval_every = 5
epochs = 200

wandb.config = {
  "learning_rate": 0.001,
  "epochs": epochs,
  "batch_size": batch_size,
}


for epoch in trange(epochs):
    for st, pr, target in train_dataloader:
        st = st.to(DEVICE)
        pr = pr.to(DEVICE)
        target = target.to(DEVICE)
        
        y_pred= model(st, pr)
        
        loss = criterion(y_pred, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    if (epoch + 1) % eval_every == 0:
        train_loss, train_precision, train_recall = evaluate(model, train_dataloader, criterion)
        test_loss, test_precision, test_precision = evaluate(model, test_dataloader, criterion)
        
        wandb.log({"train/train_loss": train_loss})
        wandb.log({"train/train_precision_t1": train_precision})
        wandb.log({"train/train_recall_t1": train_recall})
        wandb.log({"test/test_loss_t1": train_loss})
        wandb.log({"test/test_precision_t1": test_precision})
        wandb.log({"test/test_recall_t1": test_precision})
        
        _, train_precision, train_recall = evaluate(model, train_dataloader, criterion, target=0)
        _, test_precision, test_precision = evaluate(model, test_dataloader, criterion, target=0)
        
        wandb.log({"train/train_precision_t0": train_precision})
        wandb.log({"train/train_recall_t0": train_recall})
        wandb.log({"test/test_precision_t0": test_precision})
        wandb.log({"test/test_recall_t0": test_precision})

  0%|          | 0/200 [00:00<?, ?it/s]