In [1]:
import argparse
import logging
import sys

import albumentations as A
import numpy as np
import torch
from albumentations.pytorch import ToTensorV2
from tqdm import tqdm

# Dataset root directory and the JSON file of each split inside it.
dataset_path = '/opt/ml/input/data'
train_path = f'{dataset_path}/train.json'
val_path = f'{dataset_path}/val.json'
test_path = f'{dataset_path}/test.json'

def __get_logger():
    """Return the shared 'logger' instance, configured for console output.

    The logger is set to DEBUG level and carries one stream handler that
    formats records as
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s".

    The handler is attached only when the logger has no handlers yet.
    ``logging.getLogger`` always returns the same object for a given name, so
    without this guard every additional call (for example re-running this
    notebook cell) would add another handler and print each message once more.

    Returns:
        logging.Logger: the logger registered under the name 'logger'.
    """
    __logger = logging.getLogger('logger')

    if not __logger.handlers:
        # Define the log format.
        formatter = logging.Formatter(fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        # Create the stream handler and give it the format.
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)
        # Attach the handler to the logger instance.
        __logger.addHandler(stream_handler)

    # Set the log level.
    __logger.setLevel(logging.DEBUG)

    return __logger

logger = __get_logger()


def collate_fn(batch):
    """Transpose a batch of samples into per-field tuples.

    Given an iterable of equally structured samples, e.g.
    ``[(a1, b1), (a2, b2)]``, return ``((a1, a2), (b1, b2))``.
    An empty batch yields an empty tuple.
    """
    per_field = zip(*batch)
    return tuple(per_field)

In [3]:
# Pick the compute device (GPU when CUDA is available, otherwise CPU) and
# log the choice between two banner lines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

logger.info("*************************************")
if device.type == "cuda":
    logger.info(f'There are {torch.cuda.device_count()} GPU(s) available.')
    logger.info(f'We will use the GPU:{torch.cuda.get_device_name(0)}')
else:
    logger.info('No GPU available, using the CPU instead.')
logger.info("*************************************\n")

2021-05-08 05:15:47,521 - logger - INFO - *************************************
2021-05-08 05:15:47,522 - logger - INFO - There are 1 GPU(s) available.
2021-05-08 05:15:47,525 - logger - INFO - We will use the GPU:Tesla P40
2021-05-08 05:15:47,525 - logger - INFO - *************************************



In [4]:
import pandas as pd

# Load the two prediction files that get combined further down:
# data1 holds the best model's output, data2 the second set of predictions.
data1, data2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/opt/ml/submission/ensemble_ch2.csv",  # best model
        "/opt/ml/submission/hs_filter.csv",
    )
)

In [5]:
# Preview the first rows of the best-model predictions.
data1.head()

Unnamed: 0,image_id,PredictionString
0,batch_01_vt/0021.jpg,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,batch_01_vt/0028.jpg,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
2,batch_01_vt/0031.jpg,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
3,batch_01_vt/0032.jpg,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,batch_01_vt/0070.jpg,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...


In [6]:
# Preview the first rows of the second prediction file (used as the filter).
data2.head()

Unnamed: 0,image_id,PredictionString
0,batch_01_vt/0021.jpg,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,batch_01_vt/0028.jpg,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
2,batch_01_vt/0031.jpg,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
3,batch_01_vt/0032.jpg,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,batch_01_vt/0070.jpg,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...


In [7]:
# Character length of the first PredictionString in data1
# (length of the string itself, not the number of labels it contains).
len(data1.iloc[0]['PredictionString'])

131071

In [8]:
# Number of rows in data2.
len(data2)

837

In [9]:
# Character length of the first PredictionString in data2, for comparison with data1.
len(data2.iloc[0]['PredictionString'])

131071

In [15]:
# Load the sample submission; the filtering loop further down adds its rows to this frame.
# NOTE(review): re-running that loop without re-running this cell adds the rows again.
submission = pd.read_csv('/opt/ml/submission/sample_submission.csv', index_col=None)

## 기존 모델 산출물에 이진분류 모델 filtering 적용

In [18]:
from tqdm.notebook import tqdm

def _apply_binary_filter(base_prediction, filter_prediction):
    """Zero out base-model labels wherever the filter model predicts 0.

    Args:
        base_prediction: space-separated integer labels from the base model.
        filter_prediction: space-separated integer labels from the filter
            model; every position equal to 0 forces the output label to 0.

    Returns:
        str: the filtered labels, space-separated, in the original order.

    Raises:
        ValueError: if the two strings hold a different number of labels or
            contain a token that is not an integer.
    """
    base_labels = np.array(base_prediction.split(' '), int)      # base model
    filter_labels = np.array(filter_prediction.split(' '), int)  # filter
    if base_labels.shape != filter_labels.shape:
        raise ValueError(
            f"label count mismatch: {base_labels.size} vs {filter_labels.size}"
        )
    filtered_labels = np.where(filter_labels == 0, 0, base_labels)
    return ' '.join(str(label) for label in filtered_labels.tolist())


# The two prediction files are paired row by row, so they must describe the
# same images in the same order; fail loudly instead of mixing up images.
if len(data1) != len(data2):
    raise ValueError(f"row count mismatch: {len(data1)} vs {len(data2)}")

filtered_rows = []
for i in tqdm(range(len(data1))):
    base_row = data1.iloc[i]
    filter_row = data2.iloc[i]
    file_name = base_row['image_id']
    if file_name != filter_row['image_id']:
        raise ValueError(
            f"row {i}: image_id mismatch ({file_name!r} vs {filter_row['image_id']!r})"
        )
    filtered_rows.append({
        "image_id": file_name,
        "PredictionString": _apply_binary_filter(
            base_row['PredictionString'], filter_row['PredictionString']
        ),
    })

# Add all rows in one concat: DataFrame.append was removed in pandas 2.0 and
# copied the whole frame on every iteration.
submission = pd.concat(
    [submission, pd.DataFrame(filtered_rows, columns=["image_id", "PredictionString"])],
    ignore_index=True,
)


In [105]:
# Preview the first rows of the filtered submission.
submission.head()

Unnamed: 0,image_id,PredictionString
0,batch_01_vt/0021.jpg,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,batch_01_vt/0028.jpg,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
2,batch_01_vt/0031.jpg,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
3,batch_01_vt/0032.jpg,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,batch_01_vt/0070.jpg,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...


In [106]:
# Character length of the first filtered PredictionString, for comparison with the inputs.
len(submission.iloc[0]['PredictionString'])

131071

In [107]:
# Save the submission frame as a CSV file, without writing the index column.
submission.to_csv("/opt/ml/submission/" + "ensemble2_filter2_hs2" + ".csv", index=False)

In [67]:
# Compare the first row's PredictionString in the submission with the one in data1.
# Only row 0 is checked here.
tt = submission.iloc[0]['PredictionString'] == data1.iloc[0]['PredictionString']

In [68]:
# False means the row-0 string in the submission differs from data1's.
tt # final check that the filter was applied

False