In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import japanize_matplotlib
from tqdm.notebook import tqdm
from IPython.display import display
from sklearn.model_selection import train_test_split

In [2]:
# warning表示off
import warnings
warnings.simplefilter('ignore')

# デフォルトフォントサイズ変更
plt.rcParams['font.size'] = 14

# デフォルトグラフサイズ変更
plt.rcParams['figure.figsize'] = (6,6)

# デフォルトで方眼表示ON
plt.rcParams['axes.grid'] = True

# numpyの表示桁数設定
np.set_printoptions(suppress=True, precision=5)

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Subset
from sklearn.model_selection import KFold
from torch.utils.data.sampler import WeightedRandomSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
import torch.multiprocessing as mp

from learning_tool import *
import sys
sys.path.append('..')

from model.model import *

In [4]:
# デバイスの割り当て

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# device = torch.device('cpu')

cuda:0


In [5]:
# PyTorch乱数固定用

def torch_seed(seed=123):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.use_deterministic_algorithms = True
    
# 乱数初期化
torch_seed()

In [6]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, df, features, labels):
        self.features_values = df[features].values
        self.labels = df[labels].values
        
    # len()を使用すると呼ばれる
    def __len__(self):
        return len(self.features_values)

    # 要素を参照すると呼ばれる関数    
    def __getitem__(self, idx):
        features_x = torch.LongTensor(self.features_values[idx])
        labels = torch.as_tensor(self.labels[idx])
        return features_x, labels

In [7]:
# 出力次元数
# 分類先クラス数　今回は2になる
n_output = 2

# 隠れ層のノード数
n_hidden = 100

# 結果確認
print(f'n_hidden: {n_hidden} n_output: {n_output}')

n_hidden: 100 n_output: 2


In [8]:
net = CNN(n_output, n_hidden).to(device)
print(net)

CNN(
  (relu): ReLU(inplace=True)
  (embedding): Embedding(2001, 128)
  (conv1): Conv2d(1, 128, kernel_size=(3, 128), stride=(1, 1))
  (maxpool1): MaxPool2d(kernel_size=(1998, 1), stride=(1998, 1), padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(1, 128, kernel_size=(4, 128), stride=(1, 1))
  (maxpool2): MaxPool2d(kernel_size=(1997, 1), stride=(1997, 1), padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(1, 128, kernel_size=(5, 128), stride=(1, 1))
  (maxpool3): MaxPool2d(kernel_size=(1996, 1), stride=(1996, 1), padding=0, dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (l1): Linear(in_features=384, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (features1): Sequential(
    (0): Conv2d(1, 128, kernel_size=(3, 128), stride=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=(1998, 1), stride=(1998, 1), padding=0, dilation=1, ceil_mode=False)
  )
  (features2): Sequential(
    (0): Conv2d(1, 128, kernel_

In [9]:
def make_weighted_random_sampler(train_set):
    numDataPoints = len(train_set)
    data_dim = len(train_set[0][0])

    data = torch.FloatTensor(numDataPoints, data_dim)
    target = np.zeros(0)
    target = (np.hstack(data[1].numpy() for data in train_set))
    # target
    # print(target)
    print ('target train 0/1: {}/{}'.format(
        len(np.where(target == 0)[0]), len(np.where(target == 1)[0])))

    class_sample_count = np.array(
        [len(np.where(target == t)[0]) for t in np.unique(target)])
    weight = 1. / class_sample_count
    samples_weight = np.array([weight[t] for t in target])

    samples_weight = torch.from_numpy(samples_weight)
    samples_weight = samples_weight.double()
    # print(samples_weight)
    sampler = WeightedRandomSampler(samples_weight, len(samples_weight), replacement=True)
    return sampler

In [10]:
def make_dataset(filename):
    df = pd.read_csv(filename, index_col=0)
    columns = df.columns.values
    features_columns = columns[:-1]
    labels_column = columns[-1]
    dataset = MyDataset(df, features_columns, labels_column)
    return dataset

In [11]:
def record_history(df, filename, seed, each_history, num_splits):
    # print(df)
    score_type = ['train_loss','train_acc','test_loss','test_acc','auc_score']
    for num_kf in range(num_splits):
        for i in range(5):
            tmp_list = list(each_history[num_kf,:,i+1])
            tmp_list[:0] = [filename, seed, num_kf+1, score_type[i]]
            # print(tmp_list)
            df.loc[len(df)] = tmp_list
    # print(df)

In [12]:
# class BaseModel(pl.LightningModule):
#     def __init__(self, model, classes, lr):
#         super().__init__()
#         self.model = model(num_classes=2, weights=None)
#         self.loss = nn.CrossEntropyLoss()
#         self.save_hyperparameters()
        
#     def forward(self, x):
#         return self.model(x)
    
#     def training_step(self, batch):
        
#     def configure_optimizers(self):
#         return optim.Adam(self.parameters(), lr=lr=self.hparams.lr, weight_decay=0.001)

IndentationError: expected an indented block (1401245152.py, line 13)

In [13]:
# 学習率
lr = 0.0001

# 繰り返し回数
num_epochs = 50

# 評価結果記録用
cv_history = np.zeros((0,6))

batch_size = 66
num_splits = 10

columns_list = [i for i in range(1,num_epochs+1)]
columns_list[:0] = ['filename','seed', 'num_kf', 'score_type']
columns_list
df = pd.DataFrame(columns=columns_list)

for i in range(10,11):
    filename = 'txt_vec_and_label_camel_'+str(i)+'.csv'
    copy_df = df.copy()
    path = '../resource/' + filename
    dataset = make_dataset(path)
    print(filename)
    
    random_seed_list = [100,101,102,103,104,105,106,107,108,109,110]
    # random_seed_list = [100]
    # random_seed_list = [102,103,104,105,106,107,108,109]
    
    for seed in random_seed_list:
        print(seed)
        kf = KFold(n_splits=num_splits, shuffle=True, random_state=seed)
        cv_cnt = 0
        torch_seed(seed)
        each_history = np.zeros((0,num_epochs,6))

        for train_index, test_index in kf.split(dataset):
            cv_cnt += 1
            print(f'cv: {cv_cnt}')
            history = np.zeros((0,6))
            train_dataset = Subset(dataset, train_index)
            sampler = make_weighted_random_sampler(train_dataset)
            train_loader = DataLoader(train_dataset, batch_size, sampler=sampler, num_workers=2)
            test_dataset   = Subset(dataset, test_index)
            test_loader = DataLoader(test_dataset, batch_size, shuffle=False, num_workers=2)
            
            # モデルインスタンス生成
            net = CNN(n_output, n_hidden).to(device)
            dist.init_process_group(backend='nccl')
            model = DDP(model, device_ids=[torch.cuda.current_device()])
            
            # 最適化関数: 勾配降下法
            optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=0.001)
            # optimizer = torch.optim.SGD(net.parameters(), lr=lr)
            # 損失関数： 交差エントロピー関数
            criterion = nn.CrossEntropyLoss()

            #学習
            history = fit(net, optimizer, criterion, num_epochs, train_loader, test_loader, device, history, test_dataset)

            #1交差ごとの記録
            each_history = np.vstack((each_history, [history]))
            
        record_history(copy_df, filename, seed, each_history, num_splits)
        print(copy_df)
    copy_df.to_csv('../result/threshold_2000_epoch_50_cv_10_2_weighted_random_camel_'+str(i)+'.csv', mode='w',index=False)

txt_vec_and_label_camel_10.csv
100
cv: 1
target train 0/1: 14699/8626


ValueError: Error initializing torch.distributed using env:// rendezvous: environment variable RANK expected, but not set

In [None]:
# each_history

In [None]:
# copy_df.to_csv('../result/threshold_2000_epoch_50_cv_10_seed_101_weighted_random_camel_'+str(i)+'.csv', mode='w',index=False)

In [None]:
evaluate_history(each_history[0,:,:])

In [None]:
# np.save('../result/camel_3_cv_5_epoch_10.npy',each_history)

In [None]:
# np.save('../result/camel_3_cv_5_cv_hisotry.npy', cv_history)

In [None]:
# torch.save(net.state_dict(), '../model_weight.pth')

In [None]:
# cv_history = np.zeros((0,0,5))
# cv_history.shape

In [None]:
# cv_history = np.zeros((0,5))


In [None]:
# cv_cnt = 1
# num_epoch = 10
# each_history = np.zeros((0,num_epoch,5))
# history = np.zeros((0,5))
# item = np.arange(5).reshape(1,5)
# history = np.vstack((history, item))
# history
# history = np.arange(num_epoch*5).reshape((num_epoch,5))
# each_history = np.vstack((each_history, [history]))
# each_history

In [None]:
# history = np.arange(num_epoch*5).reshape((num_epoch,5))
# each_history = np.vstack((each_history, [history]))
# each_history

In [None]:
# history = np.zeros((0,5))
# history