## GPU 확인

```
사용 환경
- OS: 우분투 / macOS / 윈도우
- CUDA, cuDNN 설치
- 파이썬 3.7~3.10, 아나콘다 가상환경
```



In [None]:
# Print the GPU, driver and CUDA information of the current runtime.
!nvidia-smi

Sun Nov 19 12:23:57 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    54W / 400W |   1863MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Setup: 라이브러리 설치 및 불러오기

In [None]:
!pip install -q peft transformers datasets evaluate
!pip install wandb



In [None]:
!pip install einops



In [None]:
# 모델
from transformers import (
    AutoModelForSequenceClassification,
    RobertaConfig, RobertaModel, RobertaTokenizer,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    AutoModelForCausalLM
)
# 파인튜닝
from peft import (
    LoraConfig,
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PromptEncoderConfig,
)
# 파이토치 데이터셋
from datasets import load_dataset
import evaluate
import torch
import os

# Checkpoint name used to load both the tokenizer and the encoder backbone.
model_name_or_path = "klue/roberta-small"
# Other candidate checkpoints, kept for reference:
# model_name_or_path = "Hyeonseo/ko_roberta_small_model" # token classification; not bad
# model_name_or_path = "kyujinpy/KO-Platypus2-7B-ex"
# model_name_or_path = "xlm-roberta-base" # not bad

# model_name_or_path = "EleutherAI/polyglot-ko-1.3b"
# model_name_or_path = "google/mt5-base" # slow tokenizer
# model_name_or_path = "beomi/llama-2-ko-7b"
# model_name_or_path = "microsoft/phi-1_5"






## Dataset


```
로컬 환경 시 해당 경로 사용
```



In [None]:
# Colab only: mount Google Drive under /content/drive so the dataset and the
# checkpoint paths used later in this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd "/content/drive/MyDrive/dataset"

/content/drive/MyDrive/dataset


In [None]:
import pandas as pd

# Review/tag rows and the tag-type table, read from the current working directory.
df = pd.read_csv("./review_tags.csv", encoding="UTF-8", index_col=False)
# NOTE: `id` shadows the Python builtin; the name is kept because later cells use it.
id = pd.read_csv("./review_tag_types.csv", encoding="UTF-8", index_col=False)

# Select the columns with a *list*: a set has no defined order and set indexers
# trigger a FutureWarning (and are rejected by recent pandas versions).
df = df.loc[:, ['review_id', 'brand_code', 'review_tag_type_id', 'message']]

# Keep only the rows of a single brand.
df = df[df['brand_code'] == "topten10mall.com"]

  df = df.loc[:,{'brand_code','review_id','message','review_tag_type_id'}]


In [None]:
# Keep only tag types flagged visible that belong to NLP category 2.
id = id[(id['visible'] == 1) & (id['nlp_category_type_id'] == 2)]
# List (not set) indexer: deterministic column order and no FutureWarning.
id = id.loc[:, ['id', 'name']]
df, id

  id = id.loc[:,{'id','name'}]


(         review_id        brand_code  review_tag_type_id  \
 0           838412  topten10mall.com                  28   
 1           838412  topten10mall.com                  18   
 2           838412  topten10mall.com                  25   
 3           838412  topten10mall.com                  30   
 4           865458  topten10mall.com                  23   
 ...            ...               ...                 ...   
 3539497    2047424  topten10mall.com                  23   
 3539498    2047425  topten10mall.com                  28   
 3539499    2047426  topten10mall.com                  22   
 3539500    2047426  topten10mall.com                  28   
 3539501    2047426  topten10mall.com                  19   
 
                                                    message  
 0        한달 입어본 후 리뷰\n사이즈가 34보다 크게 나와서\n허리끈 졸라 매지 않으면 밖...  
 1        한달 입어본 후 리뷰\n사이즈가 34보다 크게 나와서\n허리끈 졸라 매지 않으면 밖...  
 2        한달 입어본 후 리뷰\n사이즈가 34보다 크게 나와서\n허리끈 졸라 매지 않으면 밖...  
 3        한달 입어본 후

In [None]:
# Give the tag-type columns the names used for joining/labelling, attach the
# keyword to every review row, then drop the columns that are no longer needed.
id = id.rename(columns={'id': 'review_tag_type_id', 'name': 'keyword'})
new_df = df.merge(id, on="review_tag_type_id")
new_df = new_df.drop(columns=['review_tag_type_id', 'brand_code'])

# Number of missing values per column.
new_df.isna().sum()

review_id    0
message      0
keyword      0
dtype: int64

In [None]:
# Discard rows whose review text is missing.
new_df = new_df.dropna(subset=['message'])

In [None]:
# Order the rows by review id and renumber the index from 0.
new_df = new_df.sort_values('review_id').reset_index(drop=True)

## 전처리

In [None]:
# Keep the first 100,000 rows, drop `review_id`, and add a helper column `fil`
# (message text concatenated with the keyword).
new_df = (
    new_df.head(100000)
    .drop(columns='review_id')
    .assign(fil=lambda frame: frame['message'] + frame['keyword'])
)



In [None]:
# Keep the first occurrence of every `fil` value, renumber the rows from 0 and
# discard the helper column again.
new_df = (
    new_df.drop_duplicates(subset='fil', keep='first')
    .reset_index(drop=True)
    .drop(columns='fil')
)

In [None]:
# Ordered 80/20 split: the first 80% of the rows train, the remainder tests.
leng = len(new_df)
leng80 = int(round(leng * 0.8))

train_df = new_df.iloc[:leng80, :]
# Start the test slice exactly where the training slice ends; the previous
# `leng80 + 1` start silently discarded the row at position `leng80`.
test_df = new_df.iloc[leng80:, :]
# NOTE(review): one message can own several keyword rows, so the same message may
# end up on both sides of the cut — consider splitting on unique messages instead.


In [None]:
from sklearn.preprocessing import OneHotEncoder

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed later;
# try the current keyword first and fall back for older installations.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
# Fit the encoder on the training split only. The test split must reuse this
# fitted encoder through `transform` so both splits share one column layout.
train_cat = ohe.fit_transform(train_df[['keyword']])
train_cat



array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [None]:
# Categories learned by the encoder; their order is the one-hot column order.
ohe.categories_

[array(['가격', '기능성', '길이', '디자인', '라인(핏)', '마감처리', '배송', '사이즈', '색상', '소재',
        '스타일', '신축성', '착용감', '품질'], dtype=object)]

In [None]:
# Swap the single `keyword` column for one indicator column per category.
train_df1 = pd.concat(
    [
        train_df.drop(columns='keyword'),
        pd.DataFrame(train_cat, columns=list(ohe.categories_[0])),
    ],
    axis=1,
)
train_df1

Unnamed: 0,message,가격,기능성,길이,디자인,라인(핏),마감처리,배송,사이즈,색상,소재,스타일,신축성,착용감,품질
0,핏 자체도 이쁘지만 색이 너무 잘 바졌어요 ㅎ 강추합니다,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,핏 자체도 이쁘지만 색이 너무 잘 바졌어요 ㅎ 강추합니다,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,"편하고, 생각했던 재질 그대로예요!!\n근데 생각했던 것 보다 허리 둘레가 좀 커서...",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"편하고, 생각했던 재질 그대로예요!!\n근데 생각했던 것 보다 허리 둘레가 좀 커서...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,"편하고, 생각했던 재질 그대로예요!!\n근데 생각했던 것 보다 허리 둘레가 좀 커서...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60937,정말 바지가 너무나도 편하네요. 득탬이란 이러것이 아닐까 하는 생각이 들 정도 입니...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60938,화면으로 보는거보다 좀 두께감이 있구요 길이는 적당합니다 여름에 입기에 가격대비 괜...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
60939,화면으로 보는거보다 좀 두께감이 있구요 길이는 적당합니다 여름에 입기에 가격대비 괜...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60940,화면으로 보는거보다 좀 두께감이 있구요 길이는 적당합니다 여름에 입기에 가격대비 괜...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Collapse the per-keyword rows into one row per distinct message by summing
# the indicator columns.
train_df1 = train_df1.groupby('message', as_index=False).agg('sum')
train_df1

Unnamed: 0,message,가격,기능성,길이,디자인,라인(핏),마감처리,배송,사이즈,색상,소재,스타일,신축성,착용감,품질
0,\n\n\n깔끔한 티 맘에 들어요 \n소재도 오가닉이라 더 믿음가구용\n요즘 보세티...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,\n\n\n색상 디자인 좋아요 소재도 시원하고\n가슴 부분 자수 멋지네요\n색상별로...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,\n\n\n재질이 좋아여\n짱짱해보이고\n다만 사이즈 미스때문에\n다시 시켜야할거같...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
3,\n\n\n할인해서 싼가격에 샀지만 가격대비 품질 우수하고 이쁩니다\n사이즈는 한치...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,\n\n\n화이트 블랙삿는데 예쁩니다 좀 얇은감이 살짝있고 시원하게입을수있을것같아요...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20025,"흰티에 파랑색 그래픽 문자가 매우 잘 어울립니다. 시원하게 여름 나는 법, 슬럽 크...",0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
20026,히든밴딩 슬림핏 슬랙스\n재질도 고급스럽고 핏도 좋습니다\n색감도 은은하니 좋구요\...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
20027,힌색 린넨 자켓이랑 입으려고 색깔 맞춰 샀어요~~근데 바지가 저에겐 좀 짧네요ㅜㅜ ...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20028,힙이 조금 있는 편이라서 엉덩이 부분은 타이트하고 허리 부분은 잘맞습니다. 슬림핏이...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [None]:
# Renumber the test rows from 0 so they line up row-for-row with the default
# index of the one-hot frame when both are concatenated with axis=1 below.
test_df = test_df.reset_index(drop = True)
test_df

Unnamed: 0,message,keyword
0,얇고 부드러운 촉감의 소재임에도 바침이 적은편입니다. 포인트로 가슴쪽에 와펜이 있는...,디자인
1,얇고 부드러운 촉감의 소재임에도 바침이 적은편입니다. 포인트로 가슴쪽에 와펜이 있는...,착용감
2,요건 딱 맞아요. 위에 티는 아기옷 120 입어서 110 사서 넉넉하던데 바지는 좀...,사이즈
3,요건 딱 맞아요. 위에 티는 아기옷 120 입어서 110 사서 넉넉하던데 바지는 좀...,품질
4,살짝 루즈핏이라 정사이즈 시키면 딱\n이쁘게 잘 맞습니다.\n원단 너무 두껍지도 얇...,기능성
...,...,...
15229,잘 맞아요\n통이 살짝 큰 편이긴해요\n시원하게 잘 입을 수 있을듯 합니다.\n여름...,라인(핏)
15230,잘 맞아요\n통이 살짝 큰 편이긴해요\n시원하게 잘 입을 수 있을듯 합니다.\n여름...,기능성
15231,사이즈는 잘 맞으나 올이 좀잘 망가지는ㅈ것 같아보여요\n색깔도 고급스러운 느낌이긴합...,색상
15232,사이즈는 잘 맞으나 올이 좀잘 망가지는ㅈ것 같아보여요\n색깔도 고급스러운 느낌이긴합...,품질


In [None]:

# Encode the test keywords with the encoder that was fitted on the training split.
# Re-fitting here (`fit_transform`) would rebuild the category list from the test
# rows alone, so a category absent from the test split would shift the columns.
test_cat = ohe.transform(test_df[['keyword']])
test_cat



array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Swap the single `keyword` column for one indicator column per category.
test_df1 = pd.concat(
    [
        test_df.drop(columns='keyword'),
        pd.DataFrame(test_cat, columns=list(ohe.categories_[0])),
    ],
    axis=1,
)
test_df1

Unnamed: 0,message,가격,기능성,길이,디자인,라인(핏),마감처리,배송,사이즈,색상,소재,스타일,신축성,착용감,품질
0,얇고 부드러운 촉감의 소재임에도 바침이 적은편입니다. 포인트로 가슴쪽에 와펜이 있는...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,얇고 부드러운 촉감의 소재임에도 바침이 적은편입니다. 포인트로 가슴쪽에 와펜이 있는...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,요건 딱 맞아요. 위에 티는 아기옷 120 입어서 110 사서 넉넉하던데 바지는 좀...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,요건 딱 맞아요. 위에 티는 아기옷 120 입어서 110 사서 넉넉하던데 바지는 좀...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,살짝 루즈핏이라 정사이즈 시키면 딱\n이쁘게 잘 맞습니다.\n원단 너무 두껍지도 얇...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15229,잘 맞아요\n통이 살짝 큰 편이긴해요\n시원하게 잘 입을 수 있을듯 합니다.\n여름...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15230,잘 맞아요\n통이 살짝 큰 편이긴해요\n시원하게 잘 입을 수 있을듯 합니다.\n여름...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15231,사이즈는 잘 맞으나 올이 좀잘 망가지는ㅈ것 같아보여요\n색깔도 고급스러운 느낌이긴합...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
15232,사이즈는 잘 맞으나 올이 좀잘 망가지는ㅈ것 같아보여요\n색깔도 고급스러운 느낌이긴합...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Collapse the per-keyword rows into one row per distinct message by summing
# the indicator columns.
test_df1 = test_df1.groupby('message', as_index=False).agg('sum')
test_df1

Unnamed: 0,message,가격,기능성,길이,디자인,라인(핏),마감처리,배송,사이즈,색상,소재,스타일,신축성,착용감,품질
0,\nL사이즈가 성인 남자가 맞음\n나중에 늘어날거 같긴한데 .....\nm사이즈는\...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,\nL사이즈가 성인 남자가 맞음\n나중에 늘어날거 같긴한데 .....\nm사이즈는\...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,\nvery good! \n가성비 최고 재질도 맘에 들고 넘 편하게 잘 입고 있습니...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,\n가성비 좋고 재질도 좋고\n스타일도 좋고 다 만족스럽습니다!!!!!!!\n\n한...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,\n고민하다가 구매한건데\n만족스럽긴합니다\n추천해주는 사이즈로 구매했는데\n거의맞...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4869,흰티속에 입어도 안 비치니 좋네요\n쿨이라 시원하구요!\n근데 금방 느ㄹ어날 거 같...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4870,힁티에 노랑 레터링.디자인.시원해 보여서 좋네요\n얇아서 받쳐입믄 용으로도 좋고요 ...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4871,힙사이즈가 큰 편이라 일부러 크게 구매햤는데도 \n작아요 ㅠㅠㅠㅠㅠ \n그냥 입다보...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4872,힙색이라고 해서 구매했는데 생각보다 사이즈가 좀 크네요. 물건은 이것 저것 많이 들...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Label column names; their order defines the order of each target vector.
target_list = [
    '가격',
    '기능성',
    '길이',
    '디자인',
    '라인(핏)',
    '마감처리',
    '배송',
    '사이즈',
    '색상',
    '소재',
    '스타일',
    '신축성',
    '착용감',
    '품질',
]

In [None]:
# Training configuration.
MAX_LEN = 512  # every sequence is padded/truncated to this many tokens
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 64
EPOCHS = 10
LEARNING_RATE = 1e-5


In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes review text and returns multi-label targets.

    Args:
        df: DataFrame with a 'message' text column and one numeric column per label.
        tokenizer: callable tokenizer (e.g. a Hugging Face tokenizer) that returns
            'input_ids', 'attention_mask' and 'token_type_ids' tensors.
        max_len: length every sequence is padded/truncated to.
        target_cols: label column names; when omitted, the notebook-level
            `target_list` is used.
    """

    def __init__(self, df, tokenizer, max_len, target_cols=None):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['message']
        if target_cols is None:
            # Fall back to the label list defined elsewhere in the notebook.
            target_cols = target_list
        self.targets = self.df[list(target_cols)].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        # Positional lookup: `self.targets` is a positional array, so the text must
        # be fetched by position as well. Label-based `self.title[index]` only
        # agreed with it when the frame happened to carry a 0..n-1 index.
        title = str(self.title.iloc[index])
        # Collapse every run of whitespace (including newlines) to a single space.
        title = " ".join(title.split())

        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        inputs = self.tokenizer(
            title,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # flatten() removes the leading batch dimension added by return_tensors='pt'.
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
# Load the tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [None]:
# Wrap both multi-hot frames in the tokenizing dataset, using the same settings.
train_dataset, test_dataset = (
    CustomDataset(frame, tokenizer, MAX_LEN) for frame in (train_df1, test_df1)
)

In [None]:
# Inspect the first encoded test sample.
test_dataset[0]

{'input_ids': tensor([    0,    48, 12421,  2228,  2116,  6025,  3997,  2116,  1047,  2053,
          5294,  2170,  8518,  2180,   555,  2411,  2470,  2147,    18,    18,
            18,    18,    18,    80, 12421,  2228,  2259,  3811,    16,  3651,
          2031,  2170,  2318,  1521,  1047,  2069,  2180,   555,  2053,  5695,
          2116,  3790,  2170,  2259,  7321,  2578,  2289,  6791,  1199,  2227,
          2417,  2088,  4746,  8296,  3746,  2067,  3704,  2194,  2287,  2289,
             2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,   

In [None]:

# Training batches are shuffled; the evaluation loader keeps the dataset order.
train_data_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

test_data_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)


In [None]:
# Display the loader object (confirms that it was created).
test_data_loader

<torch.utils.data.dataloader.DataLoader at 0x793fad01efe0>

## 모델

In [None]:

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [None]:
import shutil
import numpy as np

In [None]:

def load_ckp(checkpoint_fpath, model, optimizer, map_location=None):
    """Restore a training checkpoint into `model` and `optimizer`.

    Args:
        checkpoint_fpath: path of the checkpoint file to load.
        model: model that receives the saved parameters ('state_dict').
        optimizer: optimizer that receives the saved state ('optimizer').
        map_location: forwarded to `torch.load`; pass e.g. 'cpu' to open a
            checkpoint written on a GPU runtime on a machine without CUDA.

    Returns:
        (model, optimizer, epoch, valid_loss_min) where `epoch` is the stored
        'epoch' value and `valid_loss_min` is returned as a Python float.
    """
    checkpoint = torch.load(checkpoint_fpath, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    # The stored loss may be a plain float (which has no `.item()`) or a
    # one-element tensor; float() handles both.
    valid_loss_min = float(checkpoint['valid_loss_min'])
    return model, optimizer, checkpoint['epoch'], valid_loss_min

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write `state` to `checkpoint_path` and optionally mirror it as the best model.

    Args:
        state: checkpoint object to serialise with `torch.save`.
        is_best: when truthy, the written file is also copied to `best_model_path`.
        checkpoint_path: destination of the checkpoint file.
        best_model_path: destination of the copy made for the best model.
    """
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    # The best-model file is a plain file copy of the checkpoint just written.
    shutil.copyfile(checkpoint_path, best_model_path)

In [None]:
class BERTClass(torch.nn.Module):
    """RoBERTa encoder followed by dropout and a linear multi-label head.

    `forward` returns raw logits (no sigmoid), one per label.

    Args:
        num_labels: number of output logits; defaults to the 14 keyword labels.
    """

    def __init__(self, num_labels=14):
        super().__init__()
        self.bert_model = RobertaModel.from_pretrained(model_name_or_path, return_dict=True)
        self.dropout = torch.nn.Dropout(0.2)
        # Size the head from the encoder config instead of hard-coding 768, so a
        # checkpoint with a different hidden size still works.
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Encode the batch and return the logits computed from the pooled output."""
        output = self.bert_model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

# Build the classifier and move its parameters to the selected device.
model = BERTClass()
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTClass(
  (bert_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy computed on raw logits (`BCEWithLogitsLoss` defaults)."""
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

# Adam over every model parameter, using the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:

# Module-level lists that the training loop fills with validation targets and
# sigmoid outputs.
val_targets, val_outputs = [], []

In [None]:
# model.state_dict()
# new_state_dict = {key: value for key, value in model.state_dict().items() if key != 'linear.weight' and key != 'linear.bias'}
# # print the new dictionary
# print(new_state_dict)

In [None]:
def train_model(n_epochs, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path,
                best_weights_path='/content/drive/MyDrive/best_model_weights.pt'):
    """Train `model` and keep the checkpoint with the lowest validation loss.

    Args:
        n_epochs: number of passes over `training_loader`.
        training_loader: yields dicts with 'input_ids', 'attention_mask',
            'token_type_ids' and 'targets'.
        validation_loader: same batch format, used for the per-epoch validation.
        model: network called as `model(ids, mask, token_type_ids)`.
        optimizer: optimizer stepping the model parameters.
        checkpoint_path: file that receives the checkpoint of every epoch.
        best_model_path: file that receives a copy of the best checkpoint.
        best_weights_path: file that receives the bare `state_dict` of the best
            model (defaults to the location this notebook has always used).

    Returns:
        The same `model` object in its state after the final epoch (the best
        weights are stored on disk and are not reloaded here).

    Uses the notebook-level names `device`, `loss_fn`, `save_ckp`, `val_targets`
    and `val_outputs`.
    """
    # Tracker for the lowest validation loss seen so far.
    valid_loss_min = float('inf')

    for epoch in range(1, n_epochs + 1):
        train_loss = 0.0
        valid_loss = 0.0

        model.train()
        print('############# Epoch {}: Training Start   #############'.format(epoch))
        for batch_idx, data in enumerate(training_loader):
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Incremental mean: after the loop this already IS the average batch loss.
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

        print('############# Epoch {}: Training End     #############'.format(epoch))

        print('############# Epoch {}: Validation Start   #############'.format(epoch))
        model.eval()

        # Keep only the predictions of the current epoch; without the reset the
        # shared lists grow every epoch and mix outputs of different model states.
        val_targets.clear()
        val_outputs.clear()

        with torch.no_grad():
            for batch_idx, data in enumerate(validation_loader, 0):
                ids = data['input_ids'].to(device, dtype=torch.long)
                mask = data['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                targets = data['targets'].to(device, dtype=torch.float)
                outputs = model(ids, mask, token_type_ids)

                loss = loss_fn(outputs, targets)
                valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
                val_targets.extend(targets.cpu().detach().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

        print('############# Epoch {}: Validation End     #############'.format(epoch))
        # The losses are running means already, so they are reported as they are;
        # dividing by the number of batches again would shrink them artificially.
        print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
        ))

        # 'epoch' stores the number of the next epoch to run.
        checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }

        # Always write the current checkpoint, but promote it to "best" only when
        # the validation loss did not get worse.
        improved = valid_loss <= valid_loss_min
        save_ckp(checkpoint, improved, checkpoint_path, best_model_path)
        if improved:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min, valid_loss))
            torch.save(model.state_dict(), best_weights_path)
            valid_loss_min = valid_loss

        print('############# Epoch {}  Done   #############\n'.format(epoch))

    return model


In [None]:
# Google Drive locations passed to the training loop: the per-epoch checkpoint
# and the copy kept for the best model.
ckpt_path = "/content/drive/MyDrive/curr_ckpt"
best_model_path = "/content/drive/MyDrive/best_model.pt"

In [None]:
# Run the training loop; `train_model` returns the model object it was given.
trained_model = train_model(EPOCHS, train_data_loader, test_data_loader, model, optimizer, ckpt_path, best_model_path)


############# Epoch 1: Training Start   #############
############# Epoch 1: Training End     #############
############# Epoch 1: Validation Start   #############
############# Epoch 1: Validation End     #############
Epoch: 1 	Avgerage Training Loss: 0.000442 	Average Validation Loss: 0.001619
Validation loss decreased (inf --> 0.001619).  Saving model ...
############# Epoch 1  Done   #############

############# Epoch 2: Training Start   #############
############# Epoch 2: Training End     #############
############# Epoch 2: Validation Start   #############
############# Epoch 2: Validation End     #############
Epoch: 2 	Avgerage Training Loss: 0.000363 	Average Validation Loss: 0.001435
Validation loss decreased (0.001619 --> 0.001435).  Saving model ...
############# Epoch 2  Done   #############

############# Epoch 3: Training Start   #############
############# Epoch 3: Training End     #############
############# Epoch 3: Validation Start   #############
############# Epo

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import classification_report


In [None]:

# Switch the model to evaluation mode before computing predictions.
trained_model.eval()

BERTClass(
  (bert_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [None]:
def validation(epoch):
    """Run `trained_model` over `test_data_loader` and collect its predictions.

    Args:
        epoch: not used; kept so existing calls such as `validation(1)` still work.

    Returns:
        (fin_outputs, fin_targets): two lists with one entry per sample — the
        sigmoid outputs of the model and the corresponding target vectors.

    Uses the notebook-level names `trained_model`, `test_data_loader` and `device`.
    """
    # Evaluation mode is set here so the result does not depend on whether a
    # separate cell switched the model beforehand.
    trained_model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for data in test_data_loader:
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            outputs = trained_model(ids, mask, token_type_ids)

            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets


In [None]:
# Collect predictions and targets for the test loader (the argument is not used
# by `validation`).
outputs, targets = validation(1)

In [None]:
# Turn the sigmoid outputs into 0/1 decisions. Iterating over the rows themselves
# removes the hard-coded label count (14), so the cell follows the model's width.
# Note: Python's round() maps exactly 0.5 to 0.
outputs = [[round(prob) for prob in row] for row in outputs]
len(outputs), len(targets)

(4874, 4874)

In [None]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

# `np.int` was deprecated in NumPy 1.20 and later removed; the builtin `int`
# is the drop-in replacement.
pprint.pprint(classification_report(targets, np.array(outputs, dtype=int)))

('              precision    recall  f1-score   support\n'
 '\n'
 '           0       0.97      0.99      0.98      1545\n'
 '           1       0.96      0.98      0.97      1313\n'
 '           2       0.94      0.96      0.95       430\n'
 '           3       0.92      0.88      0.90      1131\n'
 '           4       0.89      0.89      0.89      1271\n'
 '           5       0.95      0.36      0.53        55\n'
 '           6       0.98      0.98      0.98       364\n'
 '           7       0.96      0.97      0.97      2117\n'
 '           8       0.97      0.97      0.97      1269\n'
 '           9       0.95      0.95      0.95      2295\n'
 '          10       0.83      0.87      0.85       767\n'
 '          11       0.93      0.93      0.93       256\n'
 '          12       0.92      0.94      0.93      1517\n'
 '          13       0.88      0.87      0.88       904\n'
 '\n'
 '   micro avg       0.94      0.94      0.94     15234\n'
 '   macro avg       0.93      0.90      0.9

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  pprint.pprint(classification_report(targets,np.array(outputs,dtype=np.int)))
  _warn_prf(average, modifier, msg_start, len(result))


## 업로드

In [None]:
cd /content/drive/MyDrive/

/content/drive/MyDrive


In [None]:
# Build a RobertaModel from the checkpoint's configuration object and print its
# layer structure.
# NOTE(review): this rebinds the global `model`; the fine-tuned network stays
# reachable through `trained_model`. Constructing from a config presumably does
# not load pretrained weights — confirm before using this object for inference.
model_name_or_path = 'klue/roberta-small'
config = RobertaConfig.from_pretrained(model_name_or_path)
model = RobertaModel(config=config)
print(model)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(32000, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-5): 6 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout)

In [None]:
import torch
from transformers import RobertaModel, RobertaConfig

model_name = 'klue/roberta-small'

class BERTClass(torch.nn.Module):
    """RoBERTa encoder followed by dropout and a linear multi-label head.

    Redefinition of the training-time class with an added `push` helper; the
    parameter names are unchanged so the saved `state_dict` still loads.

    Args:
        num_labels: number of output logits; defaults to the 14 keyword labels.
    """

    def __init__(self, num_labels=14):
        super().__init__()
        self.bert_model = RobertaModel.from_pretrained(model_name, return_dict=True)
        self.dropout = torch.nn.Dropout(0.2)
        # Size the head from the encoder config instead of hard-coding 768.
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Encode the batch and return the logits computed from the pooled output."""
        output = self.bert_model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

    def push(self, path):
        """Upload the encoder to the Hugging Face Hub repository `path`.

        Only `self.bert_model` is pushed; the `linear` head is not part of the
        upload. NOTE(review): confirm that publishing the encoder without the
        classification head is intended.
        """
        self.bert_model.push_to_hub(path,  create_pr=1, use_auth_token = True)




model = BERTClass()
# Load the fine-tuned weights written during training.
# model = RobertaModel.from_pretrained(model_name_or_path, return_dict=True)
# map_location='cpu': the freshly built model lives on the CPU, and this also lets
# a file saved on a GPU runtime load on a machine without CUDA.
model_dict = torch.load('./best_model_weights.pt', map_location='cpu')

# print(model_dict)
# Copy the loaded weights into the model.
model.load_state_dict(model_dict)
model


Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTClass(
  (bert_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [None]:
# Authenticate against the Hugging Face Hub through the interactive login widget.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload through BERTClass.push to the given Hub repository.
model.push("dongseon/cllama_keyword")



model.safetensors:   0%|          | 0.00/272M [00:00<?, ?B/s]

## test

In [None]:
# Sample review to classify; swap in one of the commented alternatives to try
# another input.
# key = "가격"
# txt = "단추가 짱짱해요 "
# txt = "화면에서는 하늘색인데 진한 파랑색입니다.. 근데 세탁 후 보풀이 생겼어요"
txt = "배송이 너무너무 늦어요, 그래도 가격은 저렴하게 샀어요"
# txt = "생각보다 더 따뜻하고 프린팅도 귀엽고 색감도 이쁘고 잘산것 같아요"
# txt = "흰옷이라서 속이 좀 비치지만 몸에 딱 맞아요"
# txt = "배송이 최악이애요"

# Disabled prompt-style input:
#   f'### review: {txt} ### keyword: {key} '
inputs = tokenizer(txt, return_tensors="pt")
# del inputs['attention_mask']

In [None]:
# Bare expression; only the last expression of a cell is displayed, so this
# line produces no output.
inputs


# Re-select the device (GPU when available, otherwise CPU) for the inference below.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [None]:
# softm = torch.nn.Softmax(dim=1)
fin_outputs = []
with torch.no_grad():
    # Move every tokenizer output to the device the model runs on.
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    # Pick the three model inputs, cast to long.
    ids, mask, token_type_ids = (
        inputs[field].to(device, dtype=torch.long)
        for field in ('input_ids', 'attention_mask', 'token_type_ids')
    )
    outputs = trained_model(ids, mask, token_type_ids)
    # Per-label sigmoid probabilities as plain Python lists.
    fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())


In [None]:
# Sigmoid outputs for the sample text, one value per label.
fin_outputs

[[0.9691582322120667,
  0.005610233172774315,
  0.006224624812602997,
  0.011245891451835632,
  0.008015989325940609,
  0.010196688584983349,
  0.9878334999084473,
  0.014983088709414005,
  0.014843413606286049,
  0.011129903607070446,
  0.007920484058558941,
  0.003978283144533634,
  0.006165207363665104,
  0.03101613186299801]]

In [None]:
# Label names, shown for reading the values above position by position.
target_list

['가격',
 '기능성',
 '길이',
 '디자인',
 '라인(핏)',
 '마감처리',
 '배송',
 '사이즈',
 '색상',
 '소재',
 '스타일',
 '신축성',
 '착용감',
 '품질']