In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import spacy
from tqdm import tqdm
import re
from multiprocessing import Pool

from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import log_loss

# PyTorch
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
import torch.nn.functional as F
import torch.nn as nn

# BERT
from transformers import DistilBertTokenizer, DistilBertModel
import transformers
from transformers import get_scheduler

TRAIN_DATA_FILE = '../input/AI4Code/train/'
TEST_DATA_FILE = '../input/AI4Code/test/'
TRAIN_ORDER_CSV = '../input/AI4Code/train_orders.csv'
ANCESTORS_CSV = '../input/AI4Code/train_ancestors.csv'
SUBMISSION_CSV = '../input/AI4Code/sample_submission.csv'
PATH_TO_SAVE = './state_dict_model.pth'

BERT_MODEL = 'distilbert-base-uncased'
USE_PRETRAINED_WEIGHTS = True  # True - загрузить предобученные веса модели, False - учить модель с нуля
USE_WORD_PROCESS = False  # True - обрабатывать сырые данные, False - загружать предобработанные данные


DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

SEED = 103

# Загрузка и обработка данных

In [2]:
def adding_rank(orders_file: str) -> pd.DataFrame:
    # Добавление ранга к значениям из train_ancestors.csv
    
    new_columns = {'id': [], 'cell_order': [], 'rank': []}
    order_df = pd.read_csv(orders_file)
    
    for i in range(order_df.shape[0]):
        orders = order_df.at[i, 'cell_order'].split()
        new_columns['cell_order'].extend(orders)
        new_columns['id'].extend([order_df.at[i, 'id']] * len(orders))
        new_columns['rank'].extend(range(len(orders)))
    
    return pd.DataFrame(new_columns)


if not USE_PRETRAINED_WEIGHTS:
    orders_df = adding_rank(TRAIN_ORDER_CSV)
    orders_df.head()

In [3]:
# Создание нового датафрейма с целевыми значениями 

def target_frame(training_files: list, orders_N: pd.DataFrame, 
                 random_percent: float = 0.05) -> pd.DataFrame:
    
    target_df = pd.DataFrame({
        'id': pd.Series(dtype='object'),
        'source_markdown': pd.Series(dtype='object'),
        'source_code': pd.Series(dtype='object'),
        'lable': pd.Series(dtype='int')
    })
    
    for i in training_files:
        id_local = i.split('.')[0]
        or_N = orders_N[orders_N['id'] == id_local]
        json_df = pd.read_json(TRAIN_DATA_FILE + i).reset_index().rename(columns={"index": "cell_order"})
        or_N = or_N.merge(json_df, on='cell_order', how='left')

        for j in range(1, or_N.shape[0]):
            if or_N.at[j-1, 'cell_type'] == 'markdown' and or_N.at[j, 'cell_type'] == 'code':
                # Выбор целевых ячеек из датафрейма
                new_row = {'id': id_local, 'source_markdown': or_N.at[j-1, 'source'],
                           'source_code': or_N.at[j, 'source'], 'lable': 1}
                target_df = target_df.append(new_row, ignore_index=True)

            elif np.random.random() > 1 - random_percent:
                # Выбор случайных ячеек из датафрейма
                random_row_md = or_N[or_N['cell_type'] == 'markdown'].sample(n=1, ignore_index=True, random_state=SEED)
                random_row_code = or_N[or_N['cell_type'] == 'code'].sample(n=1, ignore_index=True, random_state=SEED)
                
                new_row = {'id': id_local, 'source_markdown': random_row_md.at[0, 'source'],
                           'source_code': random_row_code.at[0, 'source'], 'lable': 0}
                target_df = target_df.append(new_row, ignore_index=True)
                
    return target_df
  

def multp_target_frame(base_part: str) -> pd.DataFrame:
    return target_frame(base_part, orders_df)


In [4]:
all_train_files = os.listdir(TRAIN_DATA_FILE)
ancestor_df = pd.read_csv(ANCESTORS_CSV)

# Размер шага для разделения файлов
file_cell_size = 1000

   
if __name__ == '__main__':
    if USE_WORD_PROCESS:
        with Pool(processes=4) as pool:
            processed_cells = list(tqdm(
                pool.imap(
                    multp_target_frame,                                  # len(all_train_files)
                    [all_train_files[i: i+file_cell_size] for i in range(0, 60001, file_cell_size)]
                ), 
                total=60001 // file_cell_size))
            # Ограничил размеры датасета 60001

        df = pd.concat(processed_cells, ignore_index=True)\
            .merge(ancestor_df[['id', 'ancestor_id']], on='id', how='left')
#         df.to_csv("result_df.csv", index=False)

## Обработка ячеек кода

In [5]:
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text: str) -> str:
        # Удаление спецсимволов
        text = re.sub(r'\W', ' ', str(text))

        # Удаление одиночных букв
        text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

        # Удаление одиночных символов в начале
        text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)

        # Замена нескольких пробелов одним
        text = re.sub(r'\s+', ' ', text, flags=re.I)

        # Удаление префикса 'b'
        text = re.sub(r'^b\s+', '', text)

        # Перевод в нижний регистр
        text = text.lower()

        # Лемметизация
        doc = nlp(text)
        tokens = filter(lambda x: len(x) > 3, [token.lemma_ for token in doc])

        return ' '.join(tokens)
    
    
def worker(x: int) -> str:
    return preprocess_text(df.at[x, 'source_markdown'])


In [6]:
# Обработка ячеек типа markdown

if __name__ == '__main__':
    if USE_WORD_PROCESS:
        with Pool(processes=4) as pool:
            processed_cells = list(tqdm(
                pool.imap(worker, range(df.shape[0])), total=df.shape[0]))

        df.source_markdown = processed_cells
    else:
        df = pd.read_csv('../input/model-state-dict1/result_df2.csv')

    df.head()
# df.to_csv("result_df2.csv", index=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
drop_ind = []
for i in range(df.shape[0]):
    if df.at[i, 'lable'] not in ('0', '1', 1, 0):
        drop_ind.append(i)
df.drop(index=drop_ind, inplace=True)
df = df.astype({'lable': 'int32'}).reset_index(drop=True)

# Создание PyTorch Dataset

In [8]:
class AIDataset(Dataset):
    
    def __init__(self, data, max_len=512):
        super().__init__()
        
        self.tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL)

        # Максимальная длинна токенов для BERT
        self.max_len = max_len
        self.data = data
        
    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, index):
        
        row = self.data.iloc[index]
        text = str(row.source_markdown) + '[SEP]' + str(row.source_code)
        
        
        # Токенизация текста
        inputs = self.tokenizer(text,
                                max_length=self.max_len,
                                padding="max_length",
                                truncation=True)
        
        ids = torch.tensor(inputs['input_ids'], dtype=torch.long)
        mask = torch.tensor(inputs['attention_mask'], dtype=torch.long)
        target = torch.tensor(row.lable, dtype=torch.float)
        
        
        return {"input_ids" : ids,
                "attention_mask" : mask,
                "target" : target}
    

In [9]:
# Проверка работы генератора данных
dataset = AIDataset(df[:10])
dataloader = DataLoader(dataset, batch_size=3, shuffle=False)

for k, data in enumerate(dataloader):
    ids, mask, target = data.values()
    print(f"Batch: {k}", "\n" +
          "Ids:", ids, "\n" +
          "Mask:", mask, "\n" +
          "Target:", target, "\n" +
          "="*50)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Batch: 0 
Ids: tensor([[  101,  9699,  2678,  ...,  1011,  6187,   102],
        [  101,  9699,  2678,  ...,     0,     0,     0],
        [  101, 13590,  2951,  ...,     0,     0,     0]]) 
Mask: tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]) 
Target: tensor([0., 1., 1.]) 
Batch: 1 
Ids: tensor([[  101,  3853,  3556,  ...,     0,     0,     0],
        [  101,  6366, 18162,  ...,     0,     0,     0],
        [  101,  2981,  8023,  ...,     0,     0,     0]]) 
Mask: tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]) 
Target: tensor([1., 1., 1.]) 
Batch: 2 
Ids: tensor([[  101,  7790,  8023,  ...,     0,     0,     0],
        [  101,  7790,  8023,  ...,     0,     0,     0],
        [  101, 23755,  2819,  ...,     0,     0,     0]]) 
Mask: tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]) 
Target: tensor([1., 1., 1.]) 
Batch: 

# Создание модели

In [10]:
class BertModel(nn.Module):
    
    def __init__(self, bert_model, layer_size, drop_out=0.15):
        super(BertModel, self).__init__()
    
        self.bert_model = DistilBertModel.from_pretrained(bert_model)
        self.dropout = nn.Dropout(drop_out)
        self.linear = nn.Linear(layer_size, 1)
        
    def forward(self, ids, mask):
        '''
        '''
        text = self.bert_model(ids, mask)[0]
        out = self.linear(text)
        out = self.dropout(out)
        out = out[:, 0, :]
        return out

# Тренировачная и валидационная выборка

In [11]:
# Разбиение выборки на тренировочную и валидационную

gss = GroupShuffleSplit(n_splits=1, train_size=.9, random_state=SEED)
ind_train, ind_valid = next(gss.split(df, groups=df.ancestor_id))

train_df = df.loc[ind_train].reset_index(drop=True)
valid_df = df.loc[ind_valid].reset_index(drop=True)

In [12]:
def train_test_loader(train: pd.DataFrame, test: pd.DataFrame, batch_size: int = 16) -> DataLoader:
#     Загрузчик для тренировочной и валидационной выборки
    dataset_train = AIDataset(train)
    dataset_test = AIDataset(test)
    
    dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
    dataloader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)
    
    return dataloader_train, dataloader_test


# Тренировка нейросети

In [13]:
def train_model(path: str, epochs: int = 3) -> None:
    
    train_load, valid_load = train_test_loader(train_df, valid_df)
    model = BertModel(BERT_MODEL, 768).to(DEVICE)
    
    num_training_steps = epochs * len(train_load)
    optimizer = AdamW(model.parameters(), lr=5e-5)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )
    
    loss_fn = nn.BCEWithLogitsLoss()
    base_res = 10
    
    for epoch in range(epochs):
        model.train()
        loss_epoch = []
        
        for data in tqdm(train_load, desc=f'Эпоха:{epoch + 1}, тренировочная'):
            ids, mask, target = data.values()
            optimizer.zero_grad()
            out = model(ids.to(DEVICE), mask.to(DEVICE))
            loss = loss_fn(out, target.unsqueeze(-1).to(DEVICE))
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            loss_epoch.append(loss.cpu().detach().numpy().tolist())
            
        print('Средняя потеря за эпоху(тренировочные данные):', np.mean(loss_epoch))
        
        model.eval()
        valid_preds, valid_targets = [], []
        
        for data in tqdm(valid_load, desc=f'Эпоха:{epoch + 1}, тестовая'):
            ids, mask, target = data.values()
            with torch.no_grad():
                out = model(ids.to(DEVICE), mask.to(DEVICE))
            valid_preds.append(torch.sigmoid(out).detach().cpu().numpy().ravel())
            valid_targets.append(target.unsqueeze(-1).detach().cpu().numpy().ravel())
            
        valid_preds = np.concatenate(valid_preds)
        valid_targets = np.concatenate(valid_targets)
        l_los = log_loss(valid_targets, valid_preds)
        if base_res > l_los:
            # Сохранение модели с лучшими параметрами
            torch.save(model.state_dict(), path)
            base_res = l_los
        print('Средняя потеря за эпоху(валидационные данные):', l_los)
    
    
if USE_PRETRAINED_WEIGHTS:
    model = BertModel(BERT_MODEL, 768).to(DEVICE)
    model.load_state_dict(torch.load('../input/model-state-dict1/state_dict_model.pth',
                                 map_location=torch.device(DEVICE)))
else:
    train_model(PATH_TO_SAVE)
    model = BertModel(BERT_MODEL, 768).to(DEVICE)
    model.load_state_dict(torch.load(PATH_TO_SAVE,
                                 map_location=torch.device(DEVICE)))

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Предсказание

In [14]:
tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL)

def tokeniz_test_df(markdown: str, code: str) -> float:
#     Токенизация тестовых данных
    text = markdown + '[SEP]' + code
    inputs = tokenizer(text,
                        max_length=512,
                        padding="max_length",
                        truncation=True)
    
    ids = torch.tensor(inputs['input_ids'], dtype=torch.long).unsqueeze(0)
    mask = torch.tensor(inputs['attention_mask'], dtype=torch.long).unsqueeze(0)
    
    model.eval()
    with torch.no_grad():
        out = model(ids.to(DEVICE), mask.to(DEVICE))

    return torch.sigmoid(out).detach().cpu().item()

In [15]:
def cell_arrangement(code_df: pd.DataFrame, md_df: pd.DataFrame) -> list:
    # Составление списка с предсказанной последовательностью 
    # кода и ячеек комментария
    result_code_list = code_df.cell_order.to_list()

    for i in md_df.itertuples():
        md_cell = i.cell_order
        comment = i.source

        result_dict = {
            j.cell_order: tokeniz_test_df(comment, j.source) 
            for j in code_df.itertuples()
        }
        max_address = max(result_dict, key=result_dict.get)
        result_code_list.insert(result_code_list.index(max_address), md_cell)
        
    return result_code_list

In [16]:
def submission_proc(test_dir: str, submission_dir: str) -> pd.DataFrame:
    # Загрузка файля для предсказания последовательности
    submission_df = pd.read_csv(submission_dir)
    
    for test_file in tqdm(os.listdir(test_dir)):
        name_file = test_file.split('.')[0]
        
        # Загрузка данных
        test_df = pd.read_json(test_dir + test_file).assign(id=name_file)\
                .reset_index().rename(columns={"index": "cell_order"})
        
        # Разделение датафрейма на два: с кодом и с комментариями
        test_df_code = test_df[test_df['cell_type'] == 'code']
        test_df_md = test_df[test_df['cell_type'] == 'markdown']
        test_df_md['source'] = test_df_md['source'].apply(preprocess_text)

        ind = submission_df[submission_df.id == name_file].index
        submission_df.loc[ind, 'cell_order'] = ' '.join(
            cell_arrangement(test_df_code, test_df_md)
        )
    return submission_df
        
        
        
result_df = submission_proc(TEST_DATA_FILE, SUBMISSION_CSV)
result_df.to_csv("submission.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
100%|██████████| 4/4 [00:15<00:00,  3.92s/it]
