In [10]:
import torch

# Use the first CUDA GPU when PyTorch reports one; otherwise run on the CPU.
# Later cells read this notebook-level `device` when moving models/tensors.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cpu


In [11]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import random
import torch
from transformers import AutoModel, AutoTokenizer

class PairDataset(Dataset):
    """Dataset of labelled snippet pairs for training a Siamese network.

    Each item is ``(emb0, emb1, label)`` where ``emb0``/``emb1`` are the outputs
    of the pretrained checkpoint for two rows of the ``snippet`` column and
    ``label`` is 1 when both rows share the same ``is_defect`` value and 0 when
    they differ.

    Args:
        data_filepath: Path to a CSV file, or an already loaded DataFrame.
            A CSV is filtered to rows whose ``kinds`` column equals ``kind``;
            a DataFrame is used as-is (``kind`` is ignored in that case).
        params: Mapping with the keys ``'checkpoint'`` (Hugging Face model
            name) and ``'num_sample'`` (total number of pairs to draw).
        kind: Value of the ``kinds`` column to keep when reading a CSV.
    """

    def __init__(self, data_filepath, params, kind='train'):
        if not isinstance(data_filepath, pd.DataFrame):
            self.df = pd.read_csv(data_filepath, header=0)
            self.df = self.df[self.df['kinds'] == kind].reset_index()
        else:
            self.df = data_filepath

        checkpoint = params['checkpoint']
        # `device` is the notebook-level torch.device chosen in an earlier cell;
        # keep a reference so the other methods do not depend on the global.
        self.device = device
        # NOTE(review): trust_remote_code=True executes code shipped with the
        # checkpoint repository - only use checkpoints you trust.
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
        self.model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True).to(self.device)
        # Make inference mode explicit so cached embeddings are deterministic.
        self.model.eval()
        self.length = params['num_sample']
        self.data = self.df['snippet']
        self.params = params
        # Row position -> embedding; filled lazily by _embed().
        self._emb_cache = {}
        self.pair_index = self.sampling()

    def sampling(self):
        """Draw the pair list: ~1/4 bug/bug, ~1/4 clean/clean, the rest mixed.

        Returns:
            list of ``(row_a, row_b, label)`` tuples where the rows are
            *positional* indices into ``self.data`` (suitable for ``.iloc``).
        """
        # Positions rather than index labels: __getitem__ reads with .iloc, so
        # labels would point at the wrong rows for a non-default index.
        rows_bug = np.flatnonzero((self.df['is_defect'] == 1).to_numpy()).tolist()
        rows_clean = np.flatnonzero((self.df['is_defect'] == 0).to_numpy()).tolist()

        # Integer targets; the mixed group absorbs the remainder so the total
        # is exactly `length` even when it is not divisible by 4.
        total = int(self.length)
        n_same = total // 4
        n_cross = total - 2 * n_same

        selected_pairs = self.make_pairs(rows_bug, rows_bug, n_same, 1)
        selected_pairs |= self.make_pairs(rows_clean, rows_clean, n_same, 1)
        selected_pairs |= self.make_pairs(rows_bug, rows_clean, n_cross, 0)

        return list(selected_pairs)

    def make_pairs(self, list1, list2, num_sample, label):
        """Randomly pick ``num_sample`` distinct unordered pairs.

        Args:
            list1: Candidate rows for one side of the pair.
            list2: Candidate rows for the other side of the pair.
            num_sample: Number of distinct pairs to return (truncated to int).
            label: Value stored as the third element of every tuple.

        Returns:
            set of ``(smaller_row, larger_row, label)`` tuples.

        Raises:
            ValueError: If fewer than ``num_sample`` distinct pairs exist.
        """
        target = int(num_sample)
        pairs = set()
        if target <= 0:
            return pairs

        # Count the distinct unordered pairs {a, b} with a in list1, b in list2
        # and a != b, so an impossible request fails instead of spinning.
        unique1, unique2 = set(list1), set(list2)
        shared = len(unique1 & unique2)
        max_unique = len(unique1) * len(unique2) - shared - shared * (shared - 1) // 2
        if target > max_unique:
            raise ValueError(
                f"cannot sample {target} distinct pairs; only {max_unique} exist"
            )

        # Rejection sampling: duplicates are absorbed by the set.
        while len(pairs) < target:
            i = random.randint(0, len(list1) - 1)
            j = random.randint(0, len(list2) - 1)
            if list1[i] == list2[j]:
                continue
            first, second = sorted((list1[i], list2[j]))
            pairs.add((first, second, label))
        return pairs

    def __len__(self):
        """Number of sampled pairs."""
        return len(self.pair_index)

    def _embed(self, row):
        """Return the model output for the snippet at positional index ``row``.

        The result is computed once per row and cached, because the same row
        appears in many pairs and is requested again on every epoch.
        NOTE(review): the cache lives in this process only; with DataLoader
        worker processes each worker would fill its own copy.
        """
        cached = self._emb_cache.get(row)
        if cached is None:
            token_ids = self.tokenizer.encode(
                self.data.iloc[row], return_tensors="pt", truncation=True
            ).to(self.device)
            with torch.no_grad():
                # First element of the model output - presumably the snippet
                # embedding vector for this checkpoint; confirm on the model card.
                cached = self.model(token_ids)[0]
            self._emb_cache[row] = cached
        return cached

    def __getitem__(self, index):
        """Return ``(emb0, emb1, label)`` for the pair at ``index``."""
        row0, row1, label = self.pair_index[index]
        return self._embed(row0), self._embed(row1), torch.tensor(label)
        
        
    
# Mini-batch size used when batching the training pairs.
batch_size = 64

# Dataset settings: how many pairs to sample in total and which Hugging Face
# checkpoint produces the snippet embeddings.
params = {
    'num_sample': 40000,
    'checkpoint': "Salesforce/codet5p-110m-embedding",
}

# Build the training pairs from the rows of the CSV marked as 'train'.
train_dataset = PairDataset(
    '../data/camel_filtered_data.csv',
    params,
    'train',
)

In [14]:
import torch.optim as optim
import torch.nn as nn
# Siamese network model class
class SiameseModel(nn.Module):
    """Shared-weight MLP encoder applied to both members of a pair.

    Args:
        input_dim: Size of each input vector (default 256).
        hidden_dims: Widths of the hidden Linear layers, each followed by ReLU
            (default ``(1024, 512)``).
        output_dim: Size of the produced feature vector (default 256).

    With the defaults the network is Linear(256, 1024) -> ReLU ->
    Linear(1024, 512) -> ReLU -> Linear(512, 256), and the ``state_dict`` keys
    are ``encoder.0``, ``encoder.2`` and ``encoder.4``.
    """

    def __init__(self, input_dim=256, hidden_dims=(1024, 512), output_dim=256):
        super().__init__()

        layers = []
        in_features = input_dim
        for width in hidden_dims:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width
        # Final projection has no activation.
        layers.append(nn.Linear(in_features, output_dim))
        self.encoder = nn.Sequential(*layers)

    def forward_once(self, x):
        """Encode a single batch ``x`` with the shared encoder."""
        return self.encoder(x)

    def forward(self, x1, x2):
        """Encode both inputs with the same weights and return ``(z1, z2)``."""
        z1 = self.forward_once(x1)
        z2 = self.forward_once(x2)
        return z1, z2


# Loss function
class ContrastiveLoss(nn.Module):
    """Contrastive loss over pairs of feature vectors.

    For each pair with Euclidean distance ``d`` and label ``y``:
    ``loss = (y * d**2 + (1 - y) * max(margin - d, 0)**2) / 2``,
    averaged over the batch. ``y == 1`` pulls a pair together, ``y == 0``
    pushes it apart until it is at least ``margin`` away.

    Args:
        margin: Distance beyond which a ``y == 0`` pair contributes no loss.
        eps: Lower bound applied to the squared distance before the square
            root. It only affects (near-)identical pairs.
    """

    def __init__(self, margin=1.0, eps=1e-12):
        super().__init__()
        self.margin = margin
        self.eps = eps

    def forward(self, z1, z2, y):
        """Return the mean contrastive loss of the batch.

        Args:
            z1: First feature batch; summed over dim 1, so at least 2-D.
            z2: Second feature batch, same shape as ``z1``.
            y: Per-pair labels (1 = similar, 0 = dissimilar), any numeric dtype.
        """
        difference = z1 - z2
        distance_squared = torch.sum(torch.pow(difference, 2), 1)
        # Integer labels are cast so the arithmetic below stays in float.
        y = y.to(distance_squared.dtype)
        # d/dx sqrt(x) is infinite at x == 0, and inf * 0 during backprop turns
        # every gradient of the batch into NaN. Clamping gives zero-distance
        # pairs a zero gradient through the root instead.
        # (Original author's note on this distance: mean 0.813, max 1.663,
        # min 0.023, median 0.492 - presumably measured on one run.)
        distance = torch.sqrt(torch.clamp(distance_squared, min=self.eps))
        negative_distance = torch.clamp(self.margin - distance, min=0.0)
        loss = (y * distance_squared + (1 - y) * torch.pow(negative_distance, 2)) / 2.0
        loss = torch.sum(loss) / z1.size()[0]
        return loss


In [15]:
import copy
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

batch_size = 64
train_loader = DataLoader(
    train_dataset,          # dataset yielding (embedding, embedding, label)
    batch_size=batch_size,  # mini-batch size
    shuffle=True            # draw the pairs in random order every epoch
)

# Instantiate the model; `.to(device)` is what moves it to the GPU when one is used.
model = SiameseModel().to(device)
# Optimizer: stochastic gradient descent (SGD) with learning rate 0.05.
optimizer = optim.SGD(model.parameters(), lr=0.05)

# Loss function; `margin=...` may be passed, the default is margin=1.0.
criterion = ContrastiveLoss()
# Training settings
repeat = 10   # number of epochs
losses = []   # per-batch loss values of the applied updates, for plotting

# Snapshots taken before training. They are not read anywhere in this cell;
# kept because other cells may refer to them.
prev_model = copy.deepcopy(model)
prev_optimizer = copy.deepcopy(optimizer)
model.train()  # training mode
for epoch in range(repeat):
    print(f"epoch={epoch}")
    nan_count = 0      # batches skipped because loss or gradients were non-finite
    normal_count = 0   # batches whose update was applied
    train_loss = 0
    n_train = 0

    for X1, X2, y in tqdm(train_loader):
        # Feature vectors for both members of each pair.
        output1, output2 = model(X1.to(device), X2.to(device))

        # Loss for this batch.
        loss = criterion(output1, output2, y.to(device))

        # Reset gradients, then back-propagate.
        optimizer.zero_grad()
        loss.backward()

        # A NaN/inf loss or gradient would overwrite every weight with NaN on
        # step(); skip such batches and count them instead.
        grads_finite = all(
            bool(torch.isfinite(p.grad).all())
            for p in model.parameters()
            if p.grad is not None
        )
        if not (bool(torch.isfinite(loss)) and grads_finite):
            nan_count += 1
            optimizer.zero_grad()
            continue

        # Apply the parameter update.
        optimizer.step()
        normal_count += 1

        # Record the loss for plotting and for the epoch average.
        loss_value = loss.item()
        losses.append(loss_value)
        train_batch_size = len(y)
        train_loss += loss_value * train_batch_size
        n_train += train_batch_size

    # Guard against an epoch in which no batch could be used.
    avg_train_loss = train_loss / n_train if n_train else float('nan')
    print(f'avg_train_loss:{avg_train_loss}')
    print(f"nan/normal: {nan_count}/{normal_count}")
plt.plot(losses)  # show how the loss evolved
torch.save(model.state_dict(), 'model.pth')

epoch=0


  0%|          | 0/625 [00:00<?, ?it/s]

KeyboardInterrupt: 