In [48]:
import torch
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from transformers import Trainer, AutoTokenizer, AutoModel, AutoConfig
from sklearn.model_selection import KFold
import pandas as pd
import tqdm

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:
class CppDataset(Dataset):
    """Dataset of (tokenized code, tokenized text) pairs read from a TSV file.

    The TSV is loaded with pandas, grouped into samples by ``read_code`` and
    tokenized once in ``__init__``; ``__getitem__`` only converts the stored
    token lists into tensors.

    Args:
        path: Location of the tab-separated training file.
        max_length: Number of token ids every code input and every text
            target is truncated / padded to. 512 matches the 514-row position
            table of the encoder (2 positions are reserved by RoBERTa).
    """

    def __init__(self, path="Datasets/spoc-train.tsv", max_length=512):
        self.max_length = max_length

        # Read the raw rows and load the two tokenizers.
        self.code_df = pd.read_csv(path, sep='\t')
        self.code_tokenizer = AutoTokenizer.from_pretrained("neulab/codebert-cpp")
        self.text_tokenizer = AutoTokenizer.from_pretrained("openai-gpt")

        codes, texts = read_code(self.code_df)
        self.x = [self.tokenize_code(code) for code in codes]  # tokenized code (encoder inputs)
        self.y = [self.tokenize_text(text) for text in texts]  # tokenized text (padded target ids)

    def __getitem__(self, index):
        """Return ``(inputs, targets)`` for one sample.

        ``inputs`` is a dict with ``input_ids`` and ``attention_mask``;
        ``targets`` is the padded list of text token ids. All tensors are
        ``torch.long`` and live on the module-level ``device`` so that inputs
        and targets can be combined without a device mismatch.
        """
        # NOTE(review): tensors are created directly on `device`; with a CUDA
        # device this is only safe while the DataLoader uses num_workers=0.
        encoded = self.x[index]
        inputs = {
            "input_ids": torch.tensor(encoded["input_ids"], dtype=torch.long, device=device),
            "attention_mask": torch.tensor(encoded["attention_mask"], dtype=torch.long, device=device),
        }
        targets = torch.tensor(self.y[index], dtype=torch.long, device=device)
        return inputs, targets

    def __len__(self):
        """Number of samples."""
        return len(self.x)

    def tokenize_code(self, code):
        """Tokenize one code string, truncated and padded to ``max_length``."""
        # max_length is passed explicitly instead of relying on the
        # tokenizer's own model_max_length, which may not be 512.
        return self.code_tokenizer(
            code,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
        )

    def tokenize_text(self, text):
        """Tokenize one text string into exactly ``max_length`` token ids."""
        tokenized = self.text_tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
        )["input_ids"]
        # Right-pad with id 0 so every target has the same length.
        # NOTE(review): confirm id 0 is a suitable padding id for this
        # tokenizer and that padded positions are ignored by the loss.
        return tokenized + [0] * (self.max_length - len(tokenized))
    

def read_code(code_df: pd.DataFrame):
    """Group the rows of ``code_df`` into per-sample code and text strings.

    Rows are read in order. A row continues the current sample when its
    ``line`` value is exactly one more than the line counter of the previous
    row; any other value starts a new sample (and resets the counter to 0).
    Each non-missing ``code`` / ``text`` cell is prefixed with ``indent`` tab
    characters and terminated with a newline before being appended.

    Rows are iterated positionally, so the frame's index labels are
    irrelevant (a filtered or re-indexed frame works as well).

    Args:
        code_df: Frame with the columns ``line``, ``indent``, ``code`` and
            ``text``.

    Returns:
        ``(x, y)``: two equally long lists of strings, the concatenated code
        and the concatenated text of each sample. A sample whose cells are
        all missing yields an empty string.
    """
    x = []
    y = []

    prev_line = -2  # sentinel: a first row with line 0 always opens a new sample
    rows = zip(code_df['line'], code_df['indent'], code_df['code'], code_df['text'])
    for line_no, indent_level, code, text in rows:
        indent = '\t' * int(indent_level)
        code_piece = "" if pd.isna(code) else indent + code + '\n'
        text_piece = "" if pd.isna(text) else indent + text + '\n'

        if line_no == prev_line + 1:
            # Continuation of the current sample.
            x[-1] += code_piece
            y[-1] += text_piece
            prev_line += 1
        else:
            # Line numbering restarted: open a new sample.
            x.append(code_piece)
            y.append(text_piece)
            prev_line = 0

    return x, y

In [46]:
class C2PSModel(torch.nn.Module):
    """Encoder -> linear bridge -> decoder network.

    * ``encoder``: ``AutoModel`` loaded from the "neulab/codebert-cpp"
      checkpoint.
    * ``linear``: projects encoder hidden states to the decoder's embedding
      width.
    * ``decoder``: ``AutoModel`` built from the "openai-gpt" configuration.
      NOTE(review): ``from_config`` builds the architecture from the config
      only, without loading pretrained weights; confirm that a randomly
      initialized decoder is intended.
    """

    def __init__(self):
        super().__init__()
        self.encoder = AutoModel.from_pretrained("neulab/codebert-cpp")
        self.gpt_config = AutoConfig.from_pretrained("openai-gpt")
        # Sizes come from the two configs instead of a hard-coded 768 so the
        # bridge stays correct if either checkpoint is swapped.
        self.linear = torch.nn.Linear(
            self.encoder.config.hidden_size,
            self.gpt_config.n_embd,
            bias=True,
        )
        self.decoder = AutoModel.from_config(self.gpt_config)

    def forward(self, inputs):
        """Run the encoder, the bridge and the decoder.

        Args:
            inputs: Mapping of keyword arguments for the encoder (the dataset
                provides ``input_ids`` and ``attention_mask``).

        Returns:
            The decoder's output object.
        """
        # The encoder returns an output object, not a tensor; the linear
        # layer needs the per-token hidden states.
        hidden = self.encoder(**inputs).last_hidden_state
        bridged = self.linear(hidden)
        # Forward the padding mask (if any) so the decoder does not attend to
        # padded encoder positions.
        return self.decoder(
            inputs_embeds=bridged,
            attention_mask=inputs.get("attention_mask"),
        )

In [47]:
# Instantiate the model once; as the last expression of the cell, its repr
# (the module tree) is displayed as the cell output.
C2PSModel()

Some weights of the model checkpoint at neulab/codebert-cpp were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at neulab/codebert-cpp and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


C2PSModel(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,),

In [None]:
def train(batch_size, epochs, lr=1e-05):
    # K-fold training scaffold (exercise template: the lines marked TODO are
    # intentionally left for the reader, so this cell does not run as-is).
    #
    # batch_size: batch size used for both the training and validation loaders.
    # epochs:     number of epochs run for every fold.
    # lr:         learning rate; not used by the visible code, presumably meant
    #             for the optimizer that is still to be defined.
    #
    # For each of the 5 folds a fresh model is created, trained, and its
    # state_dict is written to 'ckpt_fold{fold}.pth'.
    dataset = CppDataset()
    # NOTE(review): shuffle=True without random_state, so the fold assignment
    # differs between runs.
    splits = KFold(n_splits=5, shuffle=True)
    
    for fold, (train_index, valid_index) in enumerate(splits.split(dataset)):
        
        model = C2PSModel().to(device) # a fresh model is created for every fold
        
        # Build the data loaders from the indices produced by the k-fold split
        train_sampler = SubsetRandomSampler(train_index)
        valid_sampler = SubsetRandomSampler(valid_index)
        
        dataloader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
        val_dataloader = DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)
        
        loss_fn = ## TODO: define the loss function
        optimizer = ## TODO: define the optimizer
        
        # train
        for epoch in range(epochs):
            progress = tqdm.tqdm(dataloader, total=len(dataloader)) # progress bar shown during training
            
            for i, (inputs, targets) in enumerate(progress):
                ## TODO: implement the training step from here (exercise)
            
            
            # validation check: no train
            with torch.no_grad():
                ## TODO: implement the validation pass here as well (exercise)
                ## TODO: print the accuracy
                
            ## Optional: print the per-epoch loss
        
            ## Optional: apply early stopping
            
        ## TODO: print the per-fold loss
        
        ## Save the model of each fold
        torch.save(model.state_dict(), f'ckpt_fold{fold}.pth')

In [None]:
# main
# Entry point of the notebook: fill in the hyperparameters below, then run
# the k-fold training. The assignments are placeholders (exercise template).

BATCH_SIZE = ## TODO: set the batch size
EPOCHS = ## TODO: set the number of epochs
lr = ## TODO: set the learning rate; a decay schedule may be applied, otherwise just use a constant

train(BATCH_SIZE, EPOCHS, lr=lr)