# **Basic Setting**

#### **Google Drive Mount**

In [1]:
# Mount Google Drive at /content/drive so later cells can read and write
# files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **1. COMET-atomic-2020**


#### **Install and Import**

In [None]:
!pip install transformers

In [None]:
import os
import json
import torch
import argparse

import pandas as pd

from tqdm import tqdm
from pathlib import Path
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

#### **Device Setting**

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
!nvidia-smi

Tue Nov 15 03:44:03 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8    10W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

#### **Install COMET-ATOMIC2020 with BART**

In [None]:
# Clone the COMET-ATOMIC 2020 repository into the current working directory.
!git clone https://github.com/allenai/comet-atomic-2020.git

Cloning into 'comet-atomic-2020'...
remote: Enumerating objects: 190, done.[K
remote: Counting objects: 100% (77/77), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 190 (delta 56), reused 42 (delta 39), pack-reused 113[K
Receiving objects: 100% (190/190), 7.15 MiB | 6.40 MiB/s, done.
Resolving deltas: 100% (74/74), done.


In [None]:
%cd /content/comet-atomic-2020/models/comet_atomic2020_bart
!pip install -r requirements.txt
!bash download_model.sh

In [None]:
# Run the example generation script located in the current directory.
!python ./generation_example.py

## **파일 내용 수정**
- 모델을 불러오는 방법을 알아내기 힘들어서 불가피하게 아래 내용을 복사합니다.
- 아래 내용을 'generation_example.py' 에 붙여 넣으시면 됩니다.

In [None]:
# This cell is intentionally inert: it only holds the replacement source for
# `generation_example.py` so that it can be copied into that file by hand.
# The text is wrapped in a raw ''' string because the script itself contains
# """ docstrings (which would end an outer """ string early and make this cell
# a SyntaxError) and backslash escapes that must stay literal when copied.
r'''
import os
import json
import torch
import argparse

import pandas as pd

from tqdm import tqdm
from pathlib import Path
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from utils import calculate_rouge, use_task_specific_params, calculate_bleu_score, trim_batch


def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i : i + n]


class Comet:
    """Seq2seq model plus tokenizer loaded from `model_path`, used for generation."""

    def __init__(self, model_path):
        """Load the model and tokenizer from `model_path` and move the model to GPU if available."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        task = "summarization"
        # Helper from the repository's utils module; presumably applies the
        # model config's task-specific generation settings -- TODO confirm.
        use_task_specific_params(self.model, task)
        self.batch_size = 1
        self.decoder_start_token_id = None

    def generate(
            self, 
            queries,
            decode_method="beam", 
            num_generate=5, 
            ):
        """Generate `num_generate` decoded outputs for every query.

        Args:
            queries: list of input strings.
            decode_method: accepted for interface compatibility but not used
                below; decoding always uses `num_generate` beams.
            num_generate: used both as the beam count and as the number of
                sequences returned per input.

        Returns:
            A list with one entry per batch of `self.batch_size` queries; each
            entry is the list of decoded strings for that batch.
        """

        with torch.no_grad():
            examples = queries

            decs = []
            for batch in list(chunks(examples, self.batch_size)):

                batch = self.tokenizer(batch, return_tensors="pt", truncation=True, padding="max_length").to(self.device)
                # trim_batch comes from the repository's utils module;
                # presumably it drops all-padding columns -- TODO confirm.
                input_ids, attention_mask = trim_batch(**batch, pad_token_id=self.tokenizer.pad_token_id)

                summaries = self.model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    decoder_start_token_id=self.decoder_start_token_id,
                    num_beams=num_generate,
                    num_return_sequences=num_generate,
                    )

                dec = self.tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
                decs.append(dec)

            return decs


all_relations = [
    "AtLocation",
    "CapableOf",
    "Causes",
    "CausesDesire",
    "CreatedBy",
    "DefinedAs",
    "DesireOf",
    "Desires",
    "HasA",
    "HasFirstSubevent",
    "HasLastSubevent",
    "HasPainCharacter",
    "HasPainIntensity",
    "HasPrerequisite",
    "HasProperty",
    "HasSubEvent",
    "HasSubevent",
    "HinderedBy",
    "InheritsFrom",
    "InstanceOf",
    "IsA",
    "LocatedNear",
    "LocationOfAction",
    "MadeOf",
    "MadeUpOf",
    "MotivatedByGoal",
    "NotCapableOf",
    "NotDesires",
    "NotHasA",
    "NotHasProperty",
    "NotIsA",
    "NotMadeOf",
    "ObjectUse",
    "PartOf",
    "ReceivesAction",
    "RelatedTo",
    "SymbolOf",
    "UsedFor",
    "isAfter",
    "isBefore",
    "isFilledBy",
    "oEffect",
    "oReact",
    "oWant",
    "xAttr",
    "xEffect",
    "xIntent",
    "xNeed",
    "xReact",
    "xReason",
    "xWant",
    ]

if __name__ == "__main__":
    # sample usage (reproducing AAAI)
    print("model loading ...")
    comet = Comet("./comet-atomic_2020_BART_aaai")
    comet.model.zero_grad()
    print("model loaded")

    print("data loading ...")
    data_path = '/content/drive/MyDrive/dacon_sentiment_analysis/dataset'
    data_csv = pd.read_csv(os.path.join(data_path, 'train.csv'))

    print('data loaded')

    # For every relation, query the model with "<utterance> <relation>" for each
    # row and store the first generated string in a new column named after
    # the relation.
    for rel in tqdm(all_relations, desc=f'Data Preprocessing with COMET'):
        print(f'\n---- {rel} processing ----\n')
        queries = []

        for i, row in tqdm(data_csv.iterrows()):
            query = '{} {}'.format(row['Utterance'], rel)
            queries.append(query)

        results = comet.generate(queries, decode_method='greedy', num_generate=1)
        # batch_size is 1, so each result holds the outputs of a single query.
        results = [r[0] for r in results]

        data_csv[rel] = results

        print(f'\n---- {rel} completed ----\n')

    data_csv.to_csv('./comet_train.csv', index=False)
'''

# **2. For CoMPM**

In [8]:
import os

import pandas as pd
from tqdm.auto import tqdm

In [6]:
# Directory on the mounted Google Drive that holds train.csv / test.csv and
# receives the converted text files.
data_path = '/content/drive/MyDrive/dacon_sentiment_analysis/dataset'

In [7]:
# Load the train and test splits from data_path.
train_csv = pd.read_csv(os.path.join(data_path, 'train.csv'))
test_csv = pd.read_csv(os.path.join(data_path, 'test.csv'))

In [9]:
# Preview the first rows of the training split (it includes a Target label column).
train_csv.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,neutral
1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,neutral
2,TRAIN_0002,That I did. That I did.,Chandler,0,neutral
3,TRAIN_0003,So let’s talk a little bit about your duties.,The Interviewer,0,neutral
4,TRAIN_0004,My duties? All right.,Chandler,0,surprise


In [10]:
# Preview the first rows of the test split (it has no Target column).
test_csv.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID
0,TEST_0000,Why do all the coffee cups have figures below?,Mark,0
1,TEST_0001,"Oh. It's so Monica can follow. Of this way, if...",Rachell,0
2,TEST_0002,You know what?,Rachell,0
3,TEST_0003,"Come on, Lydia, you can do it.",Joeyy,1
4,TEST_0004,To push!,Joeyy,1


In [19]:
def df2compm_form(df, file_name):
  """Write a dialogue DataFrame as a tab-separated text file for CoMPM.

  Every row becomes one line of the form ``Speaker<TAB>Utterance<TAB>Label``.
  A blank line is written whenever ``Dialogue_ID`` changes from one row to
  the next, so each dialogue forms its own paragraph.

  Args:
    df: DataFrame with ``Speaker``, ``Utterance`` and ``Dialogue_ID``
      columns. When a ``Target`` column is present it supplies the label;
      otherwise (e.g. the test split) every line is labelled ``'neutral'``.
    file_name: Name of the output file, created inside the module-level
      ``data_path`` directory.
  """
  # Decide once whether labels exist instead of catching an exception per row.
  has_target = 'Target' in df.columns

  # Utterances contain non-ASCII characters, so pin the encoding rather than
  # relying on the platform default.
  with open(os.path.join(data_path, file_name), 'w', encoding='utf-8') as f:
    past_dialogue_id = None
    for _, row in tqdm(df.iterrows(), total=len(df)):
      dialogue_id = row['Dialogue_ID']

      # Compare against the previous row's actual id (not a counter), so ids
      # that do not start at 0 or skip values still yield one separator each.
      if past_dialogue_id is not None and dialogue_id != past_dialogue_id:
        f.write('\n')
      past_dialogue_id = dialogue_id

      label = row['Target'] if has_target else 'neutral'
      f.write('\t'.join([row['Speaker'], row['Utterance'], label]) + '\n')

In [20]:
# Write the training split to compm_train.txt inside data_path.
df2compm_form(train_csv, 'compm_train.txt')

0it [00:00, ?it/s]

In [21]:
# Write the test split to compm_test.txt inside data_path; it has no Target
# column, so df2compm_form labels every line 'neutral'.
df2compm_form(test_csv, 'compm_test.txt')

0it [00:00, ?it/s]