In [None]:
!pip install pandas 
!pip install scikit-learn 
!pip install numba 
!pip install tqdm 
!pip install matplotlib 
!pip install transformers
!pip install rank-bm25

## 라이브러리 Import

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import random
import os, re

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from tqdm import tqdm

from transformers import logging
from transformers import AutoTokenizer, RobertaForSequenceClassification, AutoModelForSequenceClassification

import matplotlib as mpl
import matplotlib.pyplot as plt

logging.set_verbosity_error()
import warnings
warnings.filterwarnings(action='ignore')

from itertools import combinations
from rank_bm25 import BM25L

import time
import datetime
import pickle
import argparse

# 데이터 생성
- 월간 데이콘 코드 유사성 판단 AI 경진대회 시즌 1 - 청소님 코드 참고하였습니다!
- 전처리 코드 CPP에 맞게 수정하였습니다
- 약 1억 건의 데이터가 생성됩니다

## 데이터 불러오기

In [None]:
# Zero-padded problem ids "001".."500"; each names a ./data/train_code/problemXXX folder.
numbers = [str(i).zfill(3) for i in range(1, 501)]
all_code_list = []

# Read every C++ solution, grouped per problem: all_code_list[k] holds the
# sources found in the folder of problem numbers[k].
for i in numbers:
    problem_dir = f"./data/train_code/problem{i}"
    cpp_code_list = []

    # os.listdir() returns entries in arbitrary order. Sorting makes the row
    # order (and therefore the seeded train/valid split built from it later)
    # identical across machines and runs.
    for file in sorted(os.listdir(problem_dir)):
        if file.endswith(".cpp"):
            with open(os.path.join(problem_dir, file), "r") as f:
                cpp_code_list.append(f.read())
    all_code_list.append(cpp_code_list)

In [None]:
# Preprocessing step 1: remove comments
# One pass over the source that recognises string/char literals as well as
# comments, so that "//" or "/*" inside a literal is never mistaken for a comment.
_CPP_COMMENT_OR_LITERAL = re.compile(
    r'"(?:\\.|[^"\\\n])*"'      # string literal   -> kept
    r"|'(?:\\.|[^'\\\n])*'"     # character literal -> kept
    r"|//[^\n]*"                # line comment      -> dropped
    r"|/\*.*?\*/",              # block comment     -> dropped
    re.DOTALL,
)


def clean_data(text):
    """Strip C/C++ comments from ``text`` and trim surrounding whitespace.

    Line comments (``// ...``) and block comments (``/* ... */``, possibly
    spanning lines) are replaced by the empty string. String and character
    literals are copied through untouched, so ``"http://host"`` keeps its
    ``//``.

    Args:
        text: Source code as a single string.

    Returns:
        The source without comments, with leading/trailing whitespace removed.
        Lines that become empty are left in place.
    """
    def _drop_comment(match):
        token = match.group(0)
        # Both comment forms start with "/"; literals start with a quote.
        return "" if token.startswith("/") else token

    return _CPP_COMMENT_OR_LITERAL.sub(_drop_comment, text.strip()).strip()

# Preprocessing step 2: remove blank lines
def get_rid_of_empty(c):
    """Return ``c`` without its empty or whitespace-only lines.

    The text is split on ``'\\n'``; surviving lines keep their original
    indentation and order and are joined back with ``'\\n'``.
    """
    return '\n'.join(line for line in c.split('\n') if line.strip())

In [None]:
# Run preprocessing: strip comments, then blank lines, for every solution of
# every problem. Iterating over the lists themselves (instead of a fixed
# 500 x 500 index grid) avoids an IndexError when a problem folder holds
# fewer files than expected.
all_code_list_clean = [
    [get_rid_of_empty(clean_data(code)) for code in problem_codes]
    for problem_codes in all_code_list
]

In [None]:
# Configuration for the dataset-generation stage

class cfg:
    """Settings needed while generating the pair dataset.

    Only the checkpoint name is active; the learning rate, epoch count,
    label count and batch size that used to sit here were left disabled.
    """

    def __init__(self):
        # Hugging Face checkpoint; its tokenizer is loaded via args.checkpoint_path.
        self.checkpoint_path = 'neulab/codebert-cpp'


args = cfg()

# Pair-building helper
def get_pairs(input_df, tokenizer, random_state=None):
    """Build a shuffled table of similar / dissimilar code pairs.

    Positive pairs (``similar == 1``) are all 2-combinations of the solutions
    of one problem. Negative pairs (``similar == 0``) combine each solution
    with codes of *other* problems, taken in descending BM25L score against
    the problem's first solution. Each solution receives
    ``len(positive_pairs) // len(solutions)`` negatives, and the ranking is
    consumed continuously across the solutions of a problem (a candidate is
    used at most once per problem).

    Args:
        input_df: DataFrame with a ``'code'`` column (source text) and a
            ``'problem_num'`` column (problem id). Rows are addressed by
            position, so any index is accepted.
        tokenizer: Object exposing ``tokenize(str) -> list`` (used for BM25).
        random_state: Forwarded to ``DataFrame.sample`` for the final shuffle.
            ``None`` (default) keeps the previous, unseeded behaviour.

    Returns:
        DataFrame with columns ``code1``, ``code2``, ``similar`` in random
        row order and a fresh 0..n-1 index.

    Notes:
        * Problems with fewer than two solutions yield no pairs and are
          skipped (this used to raise ``IndexError``).
        * If the ranking runs out of foreign codes, fewer negatives are
          produced instead of raising ``IndexError``.
    """
    codes = input_df['code'].to_list()

    # Row positions of every problem, gathered in a single pass.
    positions_by_problem = {}
    for position, problem in enumerate(input_df['problem_num'].to_list()):
        positions_by_problem.setdefault(problem, []).append(position)
    problems = sorted(positions_by_problem)

    tokenized_corpus = [tokenizer.tokenize(code) for code in codes]
    bm25 = BM25L(tokenized_corpus)

    total_positive_pairs = []
    total_negative_pairs = []

    for problem in tqdm(problems):
        solution_positions = positions_by_problem[problem]
        if len(solution_positions) < 2:
            continue  # no pair can be formed from a single solution

        solution_codes = [codes[position] for position in solution_positions]
        positive_pairs = list(combinations(solution_codes, 2))
        negatives_per_code = len(positive_pairs) // len(solution_codes)

        # Rank the whole corpus against the first solution (already tokenized
        # above), best match first, and lazily skip the problem's own rows.
        own_positions = set(solution_positions)  # O(1) membership test
        scores = bm25.get_scores(tokenized_corpus[solution_positions[0]])
        ranking = scores.argsort()[::-1]
        hard_negatives = (
            codes[position]
            for position in ranking.tolist()
            if position not in own_positions
        )

        for solution_code in solution_codes:
            for _ in range(negatives_per_code):
                negative_code = next(hard_negatives, None)
                if negative_code is None:
                    break  # ranking exhausted: keep what we have
                total_negative_pairs.append((solution_code, negative_code))

        total_positive_pairs.extend(positive_pairs)

    all_pairs = total_positive_pairs + total_negative_pairs
    pair_data = pd.DataFrame(data={
        'code1': [first for first, _ in all_pairs],
        'code2': [second for _, second in all_pairs],
        'similar': [1] * len(total_positive_pairs) + [0] * len(total_negative_pairs),
    })
    return pair_data.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [None]:
# Build the labelled corpus: one row per cleaned solution, tagged with the
# problem number it solves.
preproc_scripts = []
problem_nums = []

# Pair each problem id with its own list of cleaned sources; no fixed
# 500 x 500 grid, so problems with a different number of files are handled.
for problem_num, problem_codes in zip(numbers, all_code_list_clean):
    preproc_scripts.extend(problem_codes)
    problem_nums.extend([problem_num] * len(problem_codes))

data_df = pd.DataFrame(data={'code': preproc_scripts, 'problem_num': problem_nums})

In [None]:
# Hold out 10% of the solutions, stratified by problem so every problem is
# represented in both parts. The label outputs are not used afterwards.
train_code, valid_code, train_label, valid_label = train_test_split(
    data_df,
    data_df['problem_num'],
    random_state=42,
    test_size=0.1,
    stratify=data_df['problem_num']
)

# get_pairs() compares BM25 ranking positions with index labels, so the index
# must be a clean 0..n-1 range after the shuffle done by the split.
train_code = train_code.reset_index(drop=True)
valid_code = valid_code.reset_index(drop=True)

tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path)
tokenizer.truncation_side = 'left'

final_train_df = get_pairs(train_code, tokenizer)
# final_valid_df = get_pairs(valid_code, tokenizer)

# Persist the generated pairs
final_train_df.to_pickle("./data/train.pkl")
# final_valid_df.to_pickle("./data/val.pkl")

In [None]:
# Build the test set: apply the same cleaning as for training to both sides
# of every pair, then cache the result.
test_df = pd.read_csv("./data/test.csv")

code1 = test_df['code1'].values
code2 = test_df['code2'].values

processed_code1 = [get_rid_of_empty(clean_data(raw)) for raw in code1]
processed_code2 = [get_rid_of_empty(clean_data(raw)) for raw in code2]

processed_test = pd.DataFrame(list(zip(processed_code1, processed_code2)), columns=["code1", "code2"])
processed_test.to_pickle("./data/processed_test.pkl")

## 데이터 분할

In [None]:
# Load the pair dataset written by the generation step above

train_data = pd.read_pickle("./data/train.pkl")
# val_data = pd.read_pickle("./data/val.pkl")

In [None]:
# Draw a balanced random sample
def data_splitter(df, size, random_state=None):
    """Sample ``size // 2`` rows of each label from ``df``.

    Args:
        df: DataFrame with a ``'similar'`` column holding 0 / 1 labels.
        size: Total number of rows wanted; half are drawn from each label
            (an odd ``size`` therefore yields ``size - 1`` rows).
        random_state: Forwarded to ``DataFrame.sample`` for both draws.
            ``None`` (default) keeps the previous, unseeded behaviour.

    Returns:
        ``(sampled_idx, sampled_df)``: the index labels of the drawn rows as
        a list (label-0 rows first, then label-1 rows) and the rows
        themselves in the same order.

    Raises:
        ValueError: Propagated from ``DataFrame.sample`` when a label has
            fewer than ``size // 2`` rows.
    """
    per_label = size // 2
    label_0_df = df[df['similar'] == 0].sample(per_label, random_state=random_state)
    label_1_df = df[df['similar'] == 1].sample(per_label, random_state=random_state)

    sampled_df = pd.concat([label_0_df, label_1_df], axis=0)
    # concat keeps the original index labels, in label-0-then-label-1 order
    sampled_idx = list(sampled_df.index)

    return sampled_idx, sampled_df

# Remove the rows that were just sampled from the remaining pool
def splitted_original(origin, idx):
    """Return ``origin.drop(idx)``: the frame without the rows labelled ``idx``."""
    return origin.drop(idx)

In [None]:
# Rows per shard (data_splitter draws half of them from each label)
train_size = 5000000

# Write 8 disjoint shards: each round samples from what is left and then
# removes the sampled rows from the pool.
# NOTE(review): train_data is rebound on every iteration, so this cell is not
# idempotent -- re-running it without reloading train.pkl samples from the
# already reduced pool.
for i in range(1, 9):
    train_idx, train_df = data_splitter(train_data, train_size)
    train_data = splitted_original(train_data, train_idx)
    train_df.to_pickle(f'./data/train{i}.pkl')

In [None]:
# val_size = 100000

# valid_idx, valid_df = data_splitter(val_data, val_size)
# val_data = splitted_original(val_data, valid_idx)
# valid_df.to_pickle(f'./data/valid.pkl')

# Train
- 모델은 neulab/codebert-cpp입니다.
- 서로 다른 8개의 데이터 셋으로 모델을 각각 파인튜닝합니다.
- DataParallel을 사용할 경우 loss 계산 부분과 모델 저장 부분을 수정해야 하므로 주의해야 합니다.

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

## 하이퍼 파라미터 설정

In [None]:
# Show the working directory: the relative ./data/... paths used throughout
# this notebook are resolved against it. (os is already imported at the top.)
import os
os.getcwd()

In [None]:
class config:
    """Hyper-parameters and paths for fine-tuning one model on one training shard."""

    def __init__(self):
        # --- model / tokenizer ---
        self.checkpoint_path = 'neulab/codebert-cpp'
        self.num_labels = 2
        self.source_len = 512      # max_length handed to the tokenizer

        # --- optimisation ---
        self.epochs = 1
        self.learning_rate = 2e-5
        self.batch_size = 32
        self.shuffle = True
        self.seed = 2022

        # --- data ---
        # Shard used for this run. For the other models define train_path2 ..
        # train_path8 as './data/train2.pkl' .. './data/train8.pkl', or, to
        # pull the shards from the Hugging Face Hub, hf_data_path1 ..
        # hf_data_path8 as 'emaeon/train1' .. 'emaeon/train8'.
        self.train_path1 = './data/train1.pkl'


cfg = config()

## 랜덤 시드 고정

In [None]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Only affects Python processes started after this point; the hash seed of
    # the running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # multi-GPU runs; no-op without CUDA
    torch.backends.cudnn.deterministic = True
    # The auto-tuner may pick a different convolution algorithm on each run,
    # so it has to be off for reproducibility. (The former line assigned to a
    # misspelled attribute and therefore had no effect.)
    torch.backends.cudnn.benchmark = False

seed_everything(cfg.seed) # fix all RNG seeds before the model is created

## 모델, 토크나이저 호출

In [None]:
# Load the pretrained checkpoint from the Hugging Face Hub with a fresh
# 2-class classification head, and the matching tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(cfg.checkpoint_path, num_labels=cfg.num_labels, output_hidden_states=False).to(device)
tokenizer = AutoTokenizer.from_pretrained(cfg.checkpoint_path,)

# Over-long inputs are truncated from the left side.
tokenizer.truncation_side = "left"
# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))
'''DataParallel이 필요할 경우에는 아래 코드를 실행해야 합니다'''
# (The string above says: run the line below when DataParallel is needed.)
# model = nn.DataParallel(model).to(device)

In [None]:
model # display the model architecture

## Dataset 커스터마이징

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one code pair per item.

    Each item is ``(features, label)`` where ``features`` holds the
    ``input_ids`` and ``attention_mask`` of the pair and ``label`` is the
    similarity label. All returned tensors are ``torch.long`` and are already
    placed on the module-level ``device``.
    """

    def __init__(self, data_a, data_b, labels, tokenizer, source_len):
        """Store copies of the inputs.

        Args:
            data_a: First code of each pair (must support ``.copy()``).
            data_b: Second code of each pair (must support ``.copy()``).
            labels: Similarity label of each pair (must support ``.copy()``).
            tokenizer: Callable tokenizer accepting ``text`` / ``text_pair``.
            source_len: ``max_length`` used for padding and truncation.
        """
        self.tokenizer = tokenizer
        self.source_len = source_len
        self.data_a = data_a.copy()
        self.data_b = data_b.copy()
        self.labels = labels.copy()

    def __getitem__(self, index):
        """Tokenize pair ``index`` and return ``(features_dict, label_tensor)``."""
        # Passing the second code as text_pair lets the tokenizer build one
        # input sequence with the two codes separated by its separator token.
        encoded = self.tokenizer(
            text=self.data_a[index],
            text_pair=self.data_b[index],
            max_length=self.source_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The model is fed input_ids and attention_mask only; squeeze() drops
        # the batch dimension added by return_tensors='pt'.
        features = {
            key: encoded[key].squeeze().to(device, dtype=torch.long)
            for key in ('input_ids', 'attention_mask')
        }
        target = torch.tensor(self.labels[index]).to(device, dtype=torch.long)

        return features, target

    def __len__(self):
        """Number of pairs in the dataset."""
        return len(self.labels)

In [None]:
# Training shard for this run. To train one of the other models, read
# cfg.train_path2 .. cfg.train_path8 instead (after defining them in config).
traindf = pd.read_pickle(cfg.train_path1)


In [None]:
"""허깅페이스에 올려놓은 데이터를 가져오려면 다음과 같이 해야 합니다"""
# from datasets import load_dataset
# dataset = load_dataset(cfg.hf_data_path1)
# traindf = dataset['train'].to_pandas()


# dataset = load_dataset(cfg.hf_data_path2)
# dataset = load_dataset(cfg.hf_data_path3)
# dataset = load_dataset(cfg.hf_data_path4)
# dataset = load_dataset(cfg.hf_data_path5)
# dataset = load_dataset(cfg.hf_data_path6)
# dataset = load_dataset(cfg.hf_data_path7)
# dataset = load_dataset(cfg.hf_data_path8)

In [None]:
# Preview the loaded shard (columns code1, code2, similar)
traindf

In [None]:
# Wrap the shard in the tokenizing dataset and batch it for training.
train_data = CustomDataset(
    data_a=list(traindf['code1']),
    data_b=list(traindf['code2']),
    labels=list(traindf['similar']),
    tokenizer=tokenizer,
    source_len=cfg.source_len,
)
train_loader = DataLoader(
    train_data,
    batch_size=cfg.batch_size,
    shuffle=cfg.shuffle,
    # CustomDataset.__getitem__ already puts tensors on `device`;
    # presumably that is why loading stays in the main process.
    num_workers=0,
)

# Train 함수 정의

In [None]:
def format_time(elapsed):
    """Render a duration given in seconds as the ``str()`` of a ``timedelta``.

    The value is rounded to whole seconds first, e.g. ``3661.4 -> '1:01:01'``.
    """
    whole_seconds = int(round(elapsed))
    return str(datetime.timedelta(seconds=whole_seconds))

In [None]:
def train(epoch, model, optimizer, loader):
    """Run one training epoch and record its average loss and accuracy.

    Reads the module-level ``t0`` (epoch start time, for the elapsed-time
    print) and appends the epoch averages to the module-level ``loss_dic``
    under ``'train_loss'`` and ``'train_acc'``.

    Args:
        epoch: Epoch number, used only in log messages.
        model: Model called as ``model(**inputs, labels=labels)`` whose output
            exposes ``.loss`` and ``.logits``.
        optimizer: Optimizer stepped once per batch.
        loader: Sized iterable yielding ``(inputs_dict, labels)`` batches that
            are already on the model's device.

    Side effects:
        Prints progress every 50 steps and writes an intermediate checkpoint
        to ``./data/{step}batch_trained_cppbert1.pt`` every 15625 steps.
    """
    model.train()
    total_loss, total_accuracy = 0, 0
    nb_train_steps = 0

    # tqdm wraps the loader itself (not enumerate) so the bar knows the total.
    for step, (inputs, labels) in enumerate(tqdm(loader)):

        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        # Under nn.DataParallel one loss per replica is returned, so it has
        # to be averaged instead:
        # loss = outputs.loss.mean()

        # Batch accuracy computed on-device with a single host sync.
        pred = outputs.logits.argmax(dim=-1)
        acc = (pred == labels).sum().item() / len(labels)

        if step % 50 == 0 and step != 0:  # progress report every 50 iterations
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(loader), elapsed))
            print('  current average loss = {}'.format(
                total_loss / step))
            print(f'Epoch : {epoch}, train_{step}_step_loss : {loss.item()}')
            print(f'{epoch}_{step}_step_정확도 :{acc}')
        if step % 15625 == 0 and step != 0:
            # Intermediate checkpoint so a runtime failure does not lose the
            # model. Written next to the other artefacts in ./data (the former
            # absolute '/data/' path did not match the rest of the notebook).
            torch.save(model.state_dict(), f'./data/{step}batch_trained_cppbert1.pt')
            # Under nn.DataParallel save the wrapped module instead:
            # torch.save(model.module.state_dict(), f'./data/{step}batch_trained_cppbert.pt')

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        total_accuracy += acc
        nb_train_steps += 1

    avg_loss = total_loss/len(loader)
    avg_acc = total_accuracy/nb_train_steps
    t_test_avg_acc = total_accuracy/len(loader)
    print(f'Epoch:{epoch}, train_{step}_stepLoss:{avg_loss}')
    print(f'Epoch:{epoch}, train_{step}_stepacc:{avg_acc}')
    print(f'Epoch:{epoch}, train_{step}_stepacc:{t_test_avg_acc}')
    loss_dic['train_loss'].append(avg_loss)
    loss_dic['train_acc'].append(avg_acc)

# RUN

In [None]:
optimizer = torch.optim.AdamW(params=model.parameters(), lr=cfg.learning_rate)
# eta_min is the floor of the cosine schedule and must stay below the base
# learning rate. The former value (0.01) was far above cfg.learning_rate
# (2e-5), which would have made the schedule raise the LR on every step.
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2, eta_min=0.0, last_epoch=-1)

In [None]:
# Per-epoch history; train() appends to 'train_loss' and 'train_acc'.
loss_dic = {'epoch':[],'train_loss':[], 'validation_loss':[],'train_acc':[],'val_acc':[]}



for epoch in tqdm(range(1,cfg.epochs+1)):
    t0 = time.time()  # read by train() for its elapsed-time print
    train(epoch, model, optimizer, train_loader)
    # Save the weights after every epoch. When training on another shard,
    # change the file name to trained_cppbert2.pt .. trained_cppbert8.pt.
    torch.save(model.state_dict(), './data/trained_cppbert1.pt')

    '''DataParallel후 저장하는 방법이 약간 다릅니다'''
    # (The string above says: saving differs slightly under DataParallel.)
    # torch.save(model.module.state_dict(), './data/trained_cppbert1.pt')

    # The learning-rate schedule advances once per epoch.
    scheduler.step()


# Test
- 월간 데이콘 코드 유사성 판단 AI 경진대회 시즌 1 - Gmin47님 코드 참고하였습니다!
- Soft Voting을 위해 각 모델의 Logit 값을 저장해야 합니다.

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

## 하이퍼 파라미터 설정

In [None]:
class config:
    """Settings for the inference stage (replaces the training ``config``)."""

    def __init__(self):
        # --- model / tokenizer ---
        self.checkpoint_path = 'neulab/codebert-cpp'
        self.num_labels = 2
        self.source_len = 512

        # --- batching ---
        self.batch_size = 16
        self.shuffle = True
        self.seed = 2022

        # --- files ---
        # Fine-tuned weights to evaluate. For the other models define
        # load_path2 .. load_path8 as './data/trained_cppbert2.pt' ..
        # './data/trained_cppbert8.pt', or use a Hub checkpoint
        # 'emaeon/trained_cppbert1' .. 'emaeon/trained_cppbert8' as hf_load_path1.
        self.load_path1 = './data/trained_cppbert1.pt'
        self.test_path = './data/processed_test.pkl'


cfg = config()

## 모델, 토크나이저 호출

In [None]:
# Re-imported so the test section can be run on its own after a kernel restart.
from transformers import AutoTokenizer, RobertaForSequenceClassification, AutoModelForSequenceClassification

# Rebuild the same architecture as in training; the fine-tuned weights are
# loaded into it in the next cell.
model = AutoModelForSequenceClassification.from_pretrained(cfg.checkpoint_path, num_labels=cfg.num_labels, output_hidden_states=False,ignore_mismatched_sizes=True).to(device)
tokenizer = AutoTokenizer.from_pretrained(cfg.checkpoint_path)
# Same tokenizer settings as in training: truncate from the left.
tokenizer.truncation_side = "left"
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Load the fine-tuned weights. map_location remaps tensors saved from a GPU
# onto the current device, so the checkpoint also loads on a CPU-only machine.
# To evaluate another model, load cfg.load_path2 .. cfg.load_path8 instead.
model.load_state_dict(torch.load(cfg.load_path1, map_location=device))
model.eval()


In [None]:
"""local이 아닌 허깅페이스에 저장된 모델 불러오려면 다음과 같이 해야 합니다"""
# model = AutoModelForSequenceClassification.from_pretrained(cfg.hf_load_path1, num_labels=cfg.num_labels, output_hidden_states=False).to(device)
# tokenizer = AutoTokenizer.from_pretrained(cfg.checkpoint_path,)

# tokenizer.truncation_side = "left"
# model.resize_token_embeddings(len(tokenizer))
'''DataParallel이 필요할 경우에는 아래 코드를 실행해야 합니다'''
# model = nn.DataParallel(model).to(device) 

# Test data tokenizing

In [None]:
# Preprocessed test pairs (columns code1, code2) written by the data section
test_data = pd.read_pickle(cfg.test_path)

In [None]:
c1 = test_data['code1'].values
c2 = test_data['code2'].values

N = test_data.shape[0]
MAX_LEN = 512  # must match the source_len used for training

# Pre-allocated buffers, one row per test pair.
test_input_ids = np.zeros((N, MAX_LEN), dtype=np.int64)
test_attention_masks = np.zeros((N, MAX_LEN), dtype=np.int64)
failed_rows = []  # indices whose tokenization raised; their rows stay all-zero

for i in tqdm(range(N), position=0, leave=True):
    try:
        cur_c1 = str(c1[i])
        cur_c2 = str(c2[i])
        encoded_input = tokenizer(cur_c1, cur_c2, return_tensors='pt', max_length=MAX_LEN, padding='max_length',
                                    truncation=True)
        test_input_ids[i,] = encoded_input['input_ids']
        test_attention_masks[i,] = encoded_input['attention_mask']

    except Exception as e:
        # Best effort: keep going, but remember the row so the failure is
        # visible instead of silently producing a prediction from zeros.
        failed_rows.append(i)
        print(f'row {i}: {e}')

if failed_rows:
    print(f'WARNING: {len(failed_rows)} of {N} pairs could not be tokenized; '
          f'their inputs are all zeros (first indices: {failed_rows[:10]})')

test_input_ids = torch.tensor(test_input_ids, dtype=torch.long)
test_attention_masks = torch.tensor(test_attention_masks, dtype=torch.long)


In [None]:
# Tokenizing is slow, so the token tensors are cached on disk and reloaded
# for inference (the string below says the same in Korean).
'''토큰화 작업에 시간이 오래걸려 토큰 파일만 따로 저장해두고 불러와서 추론했습니다'''
torch.save(test_input_ids, "./data/test_input_ids.pt")
torch.save(test_attention_masks, "./data/test_attention_masks.pt")


In [None]:
# Reload the cached token tensors (lets inference start without re-tokenizing)
test_input_ids=torch.load('./data/test_input_ids.pt')
test_attention_masks=torch.load('./data/test_attention_masks.pt')

## Inference

In [None]:
# model.cuda()

# Sequential batches keep the predictions aligned with the submission rows.
test_tensor = TensorDataset(test_input_ids, test_attention_masks)
test_sampler = SequentialSampler(test_tensor)
test_dataloader = DataLoader(test_tensor, sampler=test_sampler, batch_size=cfg.batch_size)

submission = pd.read_csv("./data/sample_submission.csv")

logits_list = []  # per-batch logits, kept for soft voting

# tqdm wraps the loader itself so the progress bar knows the total.
for batch in tqdm(test_dataloader, desc="Iteration", smoothing=0.05):
    b_input_ids, b_input_mask = (t.to(device) for t in batch)

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)

    # outputs[0] holds the logits (no labels are passed, so there is no loss).
    logits_list.append(outputs[0].detach().cpu())

all_logits = torch.cat(logits_list, dim=0)

# Final labels (0 / 1). Taking the argmax once over all logits gives an
# integer array; growing a float array with np.append per batch was quadratic
# and wrote the labels as 0.0 / 1.0.
preds = np.argmax(all_logits.numpy(), axis=1).flatten()
submission['similar'] = preds

In [None]:
# Keep this model's logits for the soft-voting ensemble. When evaluating
# another model, change the file name to all_logits_model2.pt ..
# all_logits_model8.pt.
torch.save(all_logits, "./data/all_logits_model1.pt")

In [None]:
# Single-model submission, written to ./data/submissionmodel1.csv. When
# evaluating another model, change the suffix to model2.csv .. model8.csv.
submission.to_csv('./data/submission' +'model1.csv', index=False)

## Ensemble(Soft Voting)

- Soft Voting 전에 각 모델의 Logit 값을 Softmax에 통과시켜 확률로 변환합니다.

In [None]:
# Imports repeated so the ensemble section can run on a fresh kernel.
import torch
import numpy as np
import pandas as pd

submission = pd.read_csv("./data/sample_submission.csv")

# Turn each model's saved logits into class probabilities. dim=1 (the class
# axis of the (rows, classes) logits) is passed explicitly; relying on the
# implicit dimension is deprecated.
(logits_1, logits_2, logits_3, logits_4,
 logits_5, logits_6, logits_7, logits_8) = [
    torch.nn.functional.softmax(torch.load(f"./data/all_logits_model{k}.pt"), dim=1)
    for k in range(1, 9)
]

In [None]:
# Soft voting: average the eight probability tables and take the most
# probable class per row. sum() starts from the first table and adds the rest
# left to right, i.e. in the same order as a chained "+" expression.
model_probs = (logits_1, logits_2, logits_3, logits_4, logits_5, logits_6, logits_7, logits_8)
logits = sum(model_probs[1:], model_probs[0]) / 8
logits_np = logits.numpy()
pred = np.argmax(logits_np, axis=1).flatten()

In [None]:
# Write the ensemble prediction as the final submission file
submission['similar'] = pred
submission.to_csv('./data/final_soft_pred.csv', index=False)