In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os
import re

def reset_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every CUDA device), and configures cuDNN to prefer
    deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not only the current one; this is a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark autotuning may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False

# Folder holding the input CSV files (a mounted Google Drive location).
DATA_PATH = "/content/drive/MyDrive/Python/semi_project/data/"
# Seed shared by the data split, the K-fold splitter and reset_seeds().
SEED = 42

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [None]:
!pip install transformers



In [None]:
df = pd.read_csv(f"{DATA_PATH}train_new_1.csv")
df.head()

Unnamed: 0,title,content,tag,date
0,버섯 대체육이 근육 형성에 미치는 영향,버섯으로 만든 햄버거 패티미국 영양학 Nutrition 잡지에 게재된 새 연구에 따...,비건 채식 비건디저트 대체육 채식식단 비건식단 비건식당 비건레시피 비건음식 비건간식,2023. 4. 12.
1,비건 레시피 콩고기보다 세이탄 어묵 대체육,대체육 좋아하시나요 사실 저는 대체육보다 채소나 콩 곡류 자체의 맛을 즐기는 편이라...,비건레시피 세이탄 대체육 집밥,2023. 3. 14.
2,식물성 햄 대체육 아이간식 추천하는 베러미트 밀레 쿠킹클래스 다녀왔어요,식물성 햄 대체육 아이간식 추천하는 베러미트 밀레 쿠킹클래스 다녀왔어요 작년 언젠,인류건강 동물복지 지구환경에 지구환경 베러미트 밀레코리아 쿠킹클래스 대체육 아이간식...,2023. 4. 13.
3,이게 고기가 아니라고 미래의 먹거리 대체육에 대한 모든 것,안녕하세요 더운 여름날 다들 잘 보내고 계신가요 캠핑도 가고 바베큐도 즐기고 계실...,IT TECH 대체육 지구온난화 온실가스 환경문제 동물복지 ESG 건강 환경 동물,2023. 7. 10.
4,대체육 관련주 TOP 5 feat 대장주,2020년 코로나가 덮쳤고 세상은 한동안 공포에 뒤덮있었습니다 그 뒤로 사람들은 안...,식물성고기관련주 식물성대체육 대체육시장,2023. 5. 23.


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24105 entries, 0 to 24104
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    24105 non-null  object
 1   content  24105 non-null  object
 2   tag      24105 non-null  object
 3   date     20325 non-null  object
dtypes: object(4)
memory usage: 753.4+ KB


In [None]:
model_name = "team-lucid/deberta-v3-base-korean"

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
from transformers import AutoModel
model = AutoModel.from_pretrained(model_name)
model

DebertaV2Model(
  (embeddings): DebertaV2Embeddings(
    (word_embeddings): Embedding(64100, 768, padding_idx=0)
    (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
    (dropout): StableDropout()
  )
  (encoder): DebertaV2Encoder(
    (layer): ModuleList(
      (0-11): 12 x DebertaV2Layer(
        (attention): DebertaV2Attention(
          (self): DisentangledSelfAttention(
            (query_proj): Linear(in_features=768, out_features=768, bias=True)
            (key_proj): Linear(in_features=768, out_features=768, bias=True)
            (value_proj): Linear(in_features=768, out_features=768, bias=True)
            (pos_dropout): StableDropout()
            (dropout): StableDropout()
          )
          (output): DebertaV2SelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
            (dropout): StableDropout()
          )
        )
        (intermediat

In [None]:
train = df['title'] + ' ' + df['content']

In [None]:
train = tokenizer(train.tolist(), max_length=512, padding=True,truncation=True)

In [None]:
# Turn the tokenizer's output fields into NumPy arrays (one row per document).
input_ids = np.array(train['input_ids'])
token_type_ids = np.array(train['token_type_ids'])
attention_mask = np.array(train['attention_mask'])

# Display the three shapes as a sanity check.
input_ids.shape, token_type_ids.shape, attention_mask.shape

((24105, 512), (24105, 512), (24105, 512))

In [None]:
input = input_ids.copy()

In [None]:
# Replace every token id equal to 1 (the CLS token) with 0, so that the value 1
# is free to act as the "tagged position" marker written by target_data().
input[input == 1] = 0

In [None]:
def tostr_tag_token(tag):
    """Encode each row's tags as strings of space-separated token ids.

    Args:
        tag: Iterable of strings; each string holds one row's tags separated
            by single spaces.

    Returns:
        A list with one entry per row. Each entry is a list of strings, one
        per tag, containing that tag's token ids joined by spaces. Tags that
        produce no tokens are dropped, and the remaining strings are ordered
        from longest to shortest (by character length).
    """
    encoded_rows = []
    for row_tags in tag:
        encoded = []
        for word in row_tags.split(' '):
            ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word))
            encoded.append(' '.join(str(token_id) for token_id in ids))
        # Drop tags that tokenized to nothing (empty strings).
        encoded = [item for item in encoded if item]
        # Longest first, so longer tags are substituted before shorter ones.
        encoded.sort(key=len, reverse=True)
        encoded_rows.append(encoded)
    return encoded_rows

In [None]:
def target_data(train, tag, max_length=512):
    """Build per-token binary labels marking the tokens that belong to a tag.

    Each row of token ids is rendered as a space-separated string, every
    occurrence of a tag's id sequence is overwritten with the marker ``1``,
    and the result is converted back into a 0/1 array.

    Args:
        train: 2-D array-like of token ids, one row per document. Rows are
            expected not to contain the id 1 already, because 1 is used as
            the marker.
        tag: For each row, a list of strings; each string is the
            space-separated token ids of one tag (as produced by
            ``tostr_tag_token``).
        max_length: Row width of the empty result returned when ``train``
            has no rows.

    Returns:
        Integer array with one row per document: 1 where the token is part
        of a matched tag, 0 elsewhere.
    """
    rows = []
    for text_token, tag_token in tqdm(zip(train, tag), total=len(train)):
        text = ' '.join(map(str, text_token))
        for token in tag_token:
            if not token:
                # An empty pattern would match at every position.
                continue
            n_ids = len(token.split(' '))
            # The lookarounds anchor the match on whole ids: "23 45" must not
            # match inside "123 45", and a tag ending the row can still match.
            pattern = rf"(?<!\d){re.escape(token)}(?!\d)"
            text = re.sub(pattern, ' '.join(['1'] * n_ids), text)
        ids = np.array(list(map(int, text.split(' '))))
        # Keep only the marker: 1 stays 1, every other id becomes 0.
        rows.append((ids == 1).astype(int))

    if not rows:
        return np.empty((0, max_length), int)
    # Stack once at the end; appending inside the loop copies the whole
    # array on every iteration.
    return np.stack(rows)

In [None]:
# Encode every row's tags as token-id strings, then mark matching positions
# in the CLS-free copy of the input ids.
tag = tostr_tag_token(df['tag'])
target = target_data(input, tag)
target.shape

  0%|          | 0/24105 [00:00<?, ?it/s]

(24105, 512)

In [None]:
# Indices of rows whose label vector contains no tagged token at all.
notag_list = [row for row in range(target.shape[0]) if not target[row].sum()]

In [None]:
len(notag_list)

2445

In [None]:
# Drop the rows listed in notag_list from every array so they stay aligned.
# NOTE(review): the arrays are rebound under the same names, so re-running this
# cell without re-running the cells above would delete further rows.
input_ids = np.delete(input_ids, notag_list, axis=0)
token_type_ids = np.delete(token_type_ids, notag_list, axis=0)
attention_mask  = np.delete(attention_mask , notag_list, axis=0)
target = np.delete(target, notag_list, axis=0)

# Display the resulting shapes as a sanity check.
input_ids.shape, token_type_ids.shape, attention_mask.shape, target.shape

((21660, 512), (21660, 512), (21660, 512), (21660, 512))

##### 데이터셋 나누기

In [None]:
def permutation_train_test_split(input_ids, token_type_ids, attention_mask, target, test_size=0.2, shuffle=True, random_state=SEED):
    """Split four row-aligned arrays into train and test parts.

    Args:
        input_ids, token_type_ids, attention_mask, target: 2-D arrays with
            the same number of rows.
        test_size: Fraction of rows placed in the test part (rounded down).
        shuffle: When true, rows are permuted (identically for all four
            arrays) before splitting.
        random_state: Seed given to NumPy's global generator before the
            permutation is drawn.

    Returns:
        Tuple ``(train_input_ids, train_token_type_ids, train_attention_mask,
        test_input_ids, test_token_type_ids, test_attention_mask,
        train_target, test_target)``.
    """
    n_rows = input_ids.shape[0]
    train_num = n_rows - int(n_rows * test_size)

    if shuffle:
        np.random.seed(random_state)
        order = np.random.permutation(n_rows)
        # Apply the same row order to every array to keep them aligned.
        input_ids, token_type_ids, attention_mask, target = (
            arr[order, :]
            for arr in (input_ids, token_type_ids, attention_mask, target)
        )

    return (
        input_ids[:train_num],
        token_type_ids[:train_num],
        attention_mask[:train_num],
        input_ids[train_num:],
        token_type_ids[train_num:],
        attention_mask[train_num:],
        target[:train_num],
        target[train_num:],
    )

In [None]:
train_input_ids, train_token_type_ids, train_attention_mask, test_input_ids, test_token_type_ids, test_attention_mask, train_target, test_target = permutation_train_test_split(input_ids, token_type_ids, attention_mask, target)

#### 학습데이터셋

In [None]:
train_input_ids.shape, train_token_type_ids.shape, train_attention_mask.shape, train_target.shape

((17328, 512), (17328, 512), (17328, 512), (17328, 512))

#### 테스트데이터셋

In [None]:
test_input_ids.shape, test_token_type_ids.shape, test_attention_mask.shape, test_target.shape

((4332, 512), (4332, 512), (4332, 512), (4332, 512))



---



In [None]:
class BlogDataset(torch.utils.data.Dataset):
    """Dataset over row-aligned token arrays with optional per-token labels.

    Args:
        input_ids: 2-D array of token ids, one row per sample.
        attention_mask: Array aligned with ``input_ids``.
        token_type_ids: Array aligned with ``input_ids``.
        y: Optional array of per-token labels aligned with ``input_ids``;
            omit it for inference.
    """

    def __init__(self, input_ids, attention_mask, token_type_ids, y=None):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.y = y

    def __len__(self):
        """Return the number of samples (rows of ``input_ids``)."""
        return self.input_ids.shape[0]

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict has the keys ``input_ids``, ``attention_mask`` and
        ``token_type_ids``, plus ``y`` (float32) when labels were supplied.
        """
        item = {
            'input_ids': torch.tensor(self.input_ids[idx]),
            'attention_mask': torch.tensor(self.attention_mask[idx]),
            'token_type_ids': torch.tensor(self.token_type_ids[idx]),
        }
        if self.y is not None:
            # torch.tensor with an explicit dtype always copies the data; the
            # legacy torch.Tensor(...) constructor treats a bare int as a size.
            item['y'] = torch.tensor(self.y[idx], dtype=torch.float32)
        return item

In [None]:
# Smoke test: wrap the training arrays in a dataset and inspect one batch of two samples.
dt = BlogDataset(train_input_ids, train_attention_mask, train_token_type_ids, train_target)
dl = torch.utils.data.DataLoader(dt, batch_size=2)
batch = next(iter(dl))
batch

{'input_ids': tensor([[    1, 41020,  4278,  ...,     0,     0,     0],
         [    1, 53071, 11159,  ...,  7273,  7052,     2]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'y': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.]])}

In [None]:
class Net(torch.nn.Module):
    """Token-level classifier: a pretrained encoder followed by a linear head.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        # Maps each token's hidden state to a single logit.
        self.output_layer = torch.nn.Linear(self.model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return one raw logit per token (no sigmoid applied).

        The notebook's sample batch of shape (2, 512) produces an output of
        shape (2, 512, 1).
        """
        # Keyword arguments keep the call correct regardless of the order in
        # which a given encoder's forward() declares its parameters.
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # The first element of the encoder output is fed to the head.
        return self.output_layer(outputs[0])

In [None]:
# Smoke test: run the sample batch through a freshly built Net and check the output shape.
# Note that this rebinds `model`, which previously held the bare encoder.
model = Net(model_name)
pred = model(batch['input_ids'], batch['attention_mask'], batch['token_type_ids'])
pred.shape

torch.Size([2, 512, 1])

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer, device):
    """Train the model for one pass over ``dataloader``.

    Args:
        dataloader: Yields dict batches with the keys ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``y``.
        model: Module called as ``model(input_ids, attention_mask, token_type_ids)``.
        loss_fn: Loss applied to the selected logits and labels.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        The sum of the batch losses divided by the number of batches.
    """
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader):
        attn = batch['attention_mask']
        logits = model(
            batch['input_ids'].to(device),
            attn.to(device),
            batch['token_type_ids'].to(device),
        ).flatten(1)

        # Only positions with attention_mask == 1 (non-padding) enter the loss.
        keep = attn.eq(1).to(device)
        labels = torch.masked_select(batch['y'].to(device), keep)
        scores = torch.masked_select(logits, keep)
        loss = loss_fn(scores.view(-1, 1), labels.view(-1, 1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [None]:
@torch.no_grad()
def test_loop(dataloader, model, loss_fn, device):
    model.eval()
    sig = torch.nn.Sigmoid()
    pred_list = []
    epoch_loss = 0
    for batch in dataloader:
        pred = model(batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['token_type_ids'].to(device))
        pred = pred.flatten(1)

        # padding 한 부분 제외
        if batch.get('y') is not None:
            mask = batch['attention_mask'].eq(1).to(device)
            target = torch.masked_select(batch['y'].to(device), mask)
            result = torch.masked_select(pred, mask)
            loss = loss_fn(result.view(-1, 1), target.view(-1, 1))
            epoch_loss += loss.item()

        pred = sig(pred)
        pred = pred.to('cpu').numpy()
        pred_list.append(pred)

    pred = np.concatenate(pred_list)
    epoch_loss /= len(dataloader)
    return epoch_loss, pred

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import gc

# Training hyperparameters.
batch_size = 8
# Takes raw logits; train_loop/test_loop pass the model output without a sigmoid.
loss_fn = torch.nn.BCEWithLogitsLoss()
device = "cuda" if torch.cuda.is_available() else "cpu"
# Upper bound on epochs per fold; the training cell stops earlier via its patience counter.
epochs = 100
n_splits = 5
# Shuffled K-fold splitter seeded with SEED.
cv = KFold(n_splits=n_splits, random_state=SEED, shuffle=True)

In [None]:
# When True, only the first fold is trained (hold-out style run).
is_holdout = False
# Checkpoints are written to the Drive folder that the inference cell loads
# model_{i}.pth from; the working directory is not where inference looks.
MODEL_DIR = "/content/drive/MyDrive/Python/semi_project/"
# Stop a fold after this many consecutive epochs without a new best score.
MAX_PATIENCE = 3
os.makedirs(MODEL_DIR, exist_ok=True)

reset_seeds(SEED)
best_f1_score_list = []
best_FP_avg_list = []
for i, (tri, vai) in enumerate(cv.split(train_input_ids)):
    model = Net(model_name).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    # Training split of this fold.
    train_dt = BlogDataset(train_input_ids[tri], train_attention_mask[tri], train_token_type_ids[tri], train_target[tri])
    train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True)

    # Validation split of this fold.
    valid_dt = BlogDataset(train_input_ids[vai], train_attention_mask[vai], train_token_type_ids[vai], train_target[vai])
    valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size=batch_size, shuffle=False)

    best_f1_score = 0
    best_fp_avg = 0
    patience = 0

    for epoch in tqdm(range(epochs)):
        train_loss = train_loop(train_dl, model, loss_fn, optimizer, device)
        valid_loss, pred = test_loop(valid_dl, model, loss_fn, device)
        pred = (pred > 0.5).astype(int)
        # Score only the positions whose ground-truth label is 1.
        # NOTE(review): every y_true value passed to f1_score is therefore 1,
        # so this measures how many tagged tokens were recovered; false
        # positives are tracked separately below — confirm this is intended.
        mask = train_target[vai] == 1
        f1 = f1_score(train_target[vai][mask], pred[mask], average='micro')
        # False positives: predicted 1 where the label is 0.
        fp_score = ((pred == 1) & (train_target[vai] == 0)).sum()
        # Average number of false positives per validation row.
        fp_avg = fp_score/train_target[vai].shape[0]

        patience += 1
        print(train_loss, valid_loss, f1, fp_score, fp_avg, sep="\t")
        if f1 > best_f1_score:
            patience = 0
            best_f1_score = f1
            best_fp_avg = fp_avg  # FP average recorded at the best-scoring epoch
            torch.save(model.state_dict(), f"{MODEL_DIR}model_{i}.pth")

        if patience == MAX_PATIENCE:
            break

    print(f"{i} 번째 폴드 best F1_score: {best_f1_score}")
    print(f"{i} 번째 폴드 best FP: {best_fp_avg}")
    best_f1_score_list.append(best_f1_score)
    best_FP_avg_list.append(best_fp_avg)

    # Release this fold's data loaders and model before building the next one.
    del train_dl, train_dt, valid_dl, valid_dt
    gc.collect()

    del optimizer, model
    torch.cuda.empty_cache()

    if is_holdout:
        break
print(np.mean(best_f1_score_list))
print(np.mean(best_FP_avg_list))

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/1733 [00:00<?, ?it/s]

0.10950806156978811	0.0875259608304995	0.8046519602240256	407687	117.62463935372188


  0%|          | 0/1733 [00:00<?, ?it/s]

0.07827890354901364	0.07961973204590758	0.7721339581666476	398884	115.08482400461627


  0%|          | 0/1733 [00:00<?, ?it/s]

0.0724165723527434	0.07754060912722816	0.7994970853811865	399973	115.39901904212348


  0%|          | 0/1733 [00:00<?, ?it/s]

0.06742073953005528	0.08097400299535232	0.8309863984455367	405449	116.97893825735719


  0%|          | 0/1733 [00:00<?, ?it/s]

0.06302722023377615	0.07863144019472709	0.7950851525888672	397715	114.74754760530871


  0%|          | 0/1733 [00:00<?, ?it/s]

0.05890547447938384	0.07901784110264981	0.8041833352383129	398956	115.10559723023658


  0%|          | 0/1733 [00:00<?, ?it/s]

0.05495623873188808	0.0796190240175768	0.8058178077494571	398908	115.09174841315638
0 번째 폴드 best F1_score: 0.8309863984455367
0 번째 폴드 best FP: 116.97893825735719


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/1733 [00:00<?, ?it/s]

0.11177601403649735	0.0849052572425472	0.7888979878334114	422483	121.8935372186959


  0%|          | 0/1733 [00:00<?, ?it/s]

0.0783442797550939	0.07785842440215536	0.7759943846513805	417149	120.35458742065782


  0%|          | 0/1733 [00:00<?, ?it/s]

0.07231044254990684	0.07845585358067317	0.8161675245671504	420740	121.39065204847086


  0%|          | 0/1733 [00:00<?, ?it/s]

0.06732510820469688	0.07712373278245399	0.8094174075807207	417772	120.53433352567801


  0%|          | 0/1733 [00:00<?, ?it/s]

0.06292050832262358	0.07912434487762401	0.8288839494618623	421012	121.46912867859204


  0%|          | 0/1733 [00:00<?, ?it/s]

0.05890164350090141	0.08190816954155947	0.8390617688348152	422095	121.78159261396422


  0%|          | 0/1733 [00:00<?, ?it/s]

0.05516848290974325	0.0814723981037942	0.8162962096396817	418224	120.66474321984997


  0%|          | 0/1733 [00:00<?, ?it/s]

0.05171474433878616	0.08226367602476739	0.8226017781937295	418449	120.72965954991345


  0%|          | 0/1733 [00:00<?, ?it/s]

0.04814576557840114	0.08932294746783609	0.834089845577913	421595	121.63733410271206
1 번째 폴드 best F1_score: 0.8390617688348152
1 번째 폴드 best FP: 121.78159261396422


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/1733 [00:00<?, ?it/s]

0.10744467824353555	0.08283207478851493	0.7509463501261802	406690	117.33698788228506


  0%|          | 0/1733 [00:00<?, ?it/s]

0.07836489472269531	0.07912492304868687	0.777070286942705	406419	117.25879976918638


  0%|          | 0/1733 [00:00<?, ?it/s]

0.07218977457167128	0.08001223146005572	0.8105664080755209	409317	118.09492210040392


  0%|          | 0/1733 [00:00<?, ?it/s]

0.06751558391954503	0.08098934373650957	0.8336293111505748	411422	118.70225043277553


  0%|          | 0/1733 [00:00<?, ?it/s]

0.06315537666137756	0.07981757916647443	0.8206608094214413	408489	117.85603000577034


  0%|          | 0/1733 [00:00<?, ?it/s]

0.05894026118107634	0.08529556964090618	0.845511262734835	413058	119.17426428159261


  0%|          | 0/1733 [00:00<?, ?it/s]

0.055165884606819655	0.08510489738653607	0.8149827086643612	407176	117.47720715522216


  0%|          | 0/1733 [00:00<?, ?it/s]

0.05176417292479965	0.08161779939870818	0.7935905224787363	404220	116.62435083669936


  0%|          | 0/1733 [00:00<?, ?it/s]

0.04851131077879471	0.08977716584788627	0.8026567903542389	405259	116.92412002308136
2 번째 폴드 best F1_score: 0.845511262734835
2 번째 폴드 best FP: 119.17426428159261


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/1733 [00:00<?, ?it/s]

0.10831422501271058	0.08534677104363518	0.7616014979368694	404915	116.85858585858585


  0%|          | 0/1733 [00:00<?, ?it/s]

0.07814770549067135	0.07896832938087152	0.7472693859152325	399145	115.19336219336219


  0%|          | 0/1733 [00:00<?, ?it/s]

0.07224554910318602	0.07678432174215806	0.7740843051815208	400906	115.7015873015873


  0%|          | 0/1733 [00:00<?, ?it/s]

0.0672839956161385	0.08011193410815319	0.816179105167651	405095	116.91053391053391


  0%|          | 0/1733 [00:00<?, ?it/s]

0.06303841609183207	0.07994760628274646	0.8211028791363746	404626	116.77518037518037


  0%|          | 0/1733 [00:00<?, ?it/s]

0.05913798954387277	0.07882261623023293	0.7791005443890937	398738	115.07590187590188


  0%|          | 0/1733 [00:00<?, ?it/s]

0.05509692595479499	0.08553029142302988	0.8390873680925577	406839	117.41385281385281


  0%|          | 0/1733 [00:00<?, ?it/s]

0.05166444045142741	0.08299425770507156	0.8028063199990754	401566	115.89206349206349


  0%|          | 0/1733 [00:00<?, ?it/s]

0.04841697731888834	0.08984547822998004	0.8069903720570049	403330	116.4011544011544


  0%|          | 0/1733 [00:00<?, ?it/s]

0.04552747560552156	0.08892573965155447	0.7881274633317538	399765	115.37229437229438
3 번째 폴드 best F1_score: 0.8390873680925577
3 번째 폴드 best FP: 117.41385281385281


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/1733 [00:00<?, ?it/s]

0.10822022132966742	0.08145901202846507	0.7377188940092165	23951	6.912265512265512


  0%|          | 0/1733 [00:00<?, ?it/s]

0.07827087854176536	0.07882122656819733	0.8065552995391705	28976	8.362481962481963


  0%|          | 0/1733 [00:00<?, ?it/s]

0.07224474849036437	0.07924895813422543	0.8269239631336406	30361	8.762193362193361


  0%|          | 0/1733 [00:00<?, ?it/s]

0.06741683451115947	0.07726096658868724	0.8129377880184332	27182	7.844733044733045


  0%|          | 0/1733 [00:00<?, ?it/s]

0.06291419548394703	0.07879207175104849	0.8247695852534562	27939	8.063203463203463


  0%|          | 0/1733 [00:00<?, ?it/s]

0.059001926806703235	0.08439713504920769	0.8444700460829493	32047	9.248773448773449


  0%|          | 0/1733 [00:00<?, ?it/s]

0.05479705451508856	0.08269071292715825	0.8332834101382488	28832	8.320923520923522


  0%|          | 0/1733 [00:00<?, ?it/s]

0.0514331194714563	0.08862804916305339	0.843721198156682	30948	8.93160173160173


  0%|          | 0/1733 [00:00<?, ?it/s]

0.0481884076829142	0.09172902277494836	0.838294930875576	30594	8.829437229437229
4 번째 폴드 best F1_score: 0.8444700460829493
4 번째 폴드 best FP: 9.248773448773449
0.8398233688381387
96.91948428310806


#### 추론하기

In [None]:
# Unlabelled test dataset: no `y` is passed, so test_loop only returns probabilities.
test_dt = BlogDataset(test_input_ids, test_attention_mask, test_token_type_ids)
test_dl = torch.utils.data.DataLoader(test_dt, batch_size=batch_size, shuffle = False)

pred_list = []
for i in range(n_splits):
    model = Net(model_name).to(device)
    # map_location lets checkpoints saved on a GPU load on whatever device is
    # active now (for example a CPU-only runtime).
    state_dict = torch.load(f"/content/drive/MyDrive/Python/semi_project/model_{i}.pth", map_location=device) # load the fold's weights
    model.load_state_dict(state_dict) # apply the weights to the model

    _, pred = test_loop(test_dl, model, loss_fn, device)
    pred_list.append(pred)

# Ensemble: average the fold probabilities, then threshold at 0.5.
pred = np.mean(pred_list, axis=0)
pred = (pred > 0.5).astype(int)

In [None]:
len(pred_list)

5

In [None]:
# Computed from pred_list directly so this cell also runs on a fresh kernel,
# where no `pred_arr` variable exists yet at this point.
np.array(pred_list).shape

(5, 4332, 512)

In [None]:
# Stack the per-fold probability arrays, then wrap each of the five folds in its own DataFrame.
pred_arr = np.array(pred_list)
pred_df_1, pred_df_2, pred_df_3, pred_df_4, pred_df_5 = (
    pd.DataFrame(pred_arr[fold]) for fold in range(5)
)

In [None]:
# Write each fold's probabilities to pred_1.csv ... pred_5.csv in the Drive project folder.
for fold_no, fold_df in enumerate((pred_df_1, pred_df_2, pred_df_3, pred_df_4, pred_df_5), start=1):
    fold_df.to_csv(f'/content/drive/MyDrive/Python/semi_project/pred_{fold_no}.csv', index=False)

#### f1_score, FP 계산

In [None]:
# Select only the positions whose ground-truth label is 1.
mask = test_target == 1
# NOTE(review): every y_true value passed below is 1, so this score reflects how
# many tagged tokens were recovered rather than a precision-aware F1 — confirm
# this is the intended metric.
f1 = f1_score(test_target[mask], pred[mask], average='micro')
# False positives: predicted 1 where the label is 0. No attention mask is applied
# here, so every column of every row is counted.
fp_score = ((pred == 1) & (test_target == 0)).sum()
# Average number of false positives per test row.
fp_avg = fp_score / test_target.shape[0]
print(f"F1_SCORE : {f1}")
print(f"FP_AVG : {fp_avg}")

F1_SCORE : 0.8378436667791905
FP_AVG : 118.0039242843952


#### 추론한 토큰 역 토큰화

In [None]:
def convert_to_tokens(tokenizer, test_input_ids, pred, skip_special_tokens=True):
    """Decode the positions predicted as tags back into token strings.

    Args:
        tokenizer: Object exposing
            ``convert_ids_to_tokens(ids, skip_special_tokens=...)``.
        test_input_ids: 2-D array of token ids, one row per sample.
        pred: 2-D array aligned with ``test_input_ids``; positions equal to 1
            are treated as predicted tag tokens.
        skip_special_tokens: When true (default), special tokens such as
            ``[PAD]`` are left out of the result. Pass ``False`` to keep
            every selected token.

    Returns:
        A list with one list of token strings per row of ``pred``.
    """
    tag_tokens = []
    for i in range(len(pred)):
        # Keep only the ids at positions predicted as 1.
        selected = np.asarray(test_input_ids[i])[np.asarray(pred[i]) == 1]
        tag_token = tokenizer.convert_ids_to_tokens(
            selected.tolist(), skip_special_tokens=skip_special_tokens
        )
        tag_tokens.append(tag_token)
    return tag_tokens

In [None]:
# TODO: the decoded sub-word tokens still need to be joined back into whole tags afterwards.
tag_tokens = convert_to_tokens(tokenizer, test_input_ids, pred)

In [None]:
tag_tokens[1]

['비건',
 '감자',
 '인슐린',
 '비건',
 '감자',
 '인슐린',
 '저항성',
 '감자',
 '인슐린',
 '저항성',
 '감자',
 '인슐린',
 '저항성',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '