In [1]:
import random
import numpy as np
import torch
import transformers
import pandas as pd

import os

# Expose only GPU index 0 to CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


def now():
    """Print the current working directory of the running kernel."""
    print("현재 작업 디렉토리:", os.getcwd())


# Show where relative paths used by this notebook will be resolved from.
now()

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy, ``transformers`` and PyTorch (CPU and,
    when available, CUDA), records the seed in ``PYTHONHASHSEED`` and puts
    cuDNN into deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed handed to each library.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # CPU-side generators.
    for seeder in (random.seed, np.random.seed, transformers.set_seed, torch.manual_seed):
        seeder(seed)

    # GPU-side generators (current device, then all devices).
    if torch.cuda.is_available():
        for cuda_seeder in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seeder(seed)

    # cuDNN reproducibility switches.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# Seed used for the whole notebook; applied once through set_seed().
seed = 42
set_seed(seed)


현재 작업 디렉토리: /data1/home/gyubin/EasyEdit


In [2]:
import pandas as pd
# Load the preprocessed table from the working directory.
df = pd.read_excel("preprocessed_df2.xlsx")

# Flag rows in which any of the three *_hop_num columns equals 0.
hop_count_columns = ['sbj_hop_num', 'obj_true_hop_num', 'obj_new_hop_num']
condition = df[hop_count_columns].eq(0).any(axis=1)

# Drop the flagged rows.
df_filtered = df[~condition]

# Renumber rows 0..n-1 so later cells can address them with df.loc[i, ...]
# while iterating over range(len(df)).
df = df_filtered.reset_index(drop = True)


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
# Checkpoint identifier passed to both from_pretrained() calls below.
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Tokenizer: pad on the left and reuse the EOS token id as the padding id.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="../.cache")
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eos_token_id

# Model: load through the same cache directory, then move it to the GPU.
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir="../.cache")
model = model.to('cuda')

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
from tqdm import tqdm

# Inference only: put the model in evaluation mode.
model.eval()

# Dry run of the sentence-scoring loop: the score is computed for every
# "<subject> and <word>" sentence but is not stored anywhere in this cell.
for i in tqdm(range(len(df))):
    subject = df.loc[i, 'subject']
    sbj_hops = df.loc[i, 'sbj_one_hop']

    for word in sbj_hops.split(','):
        # Sentence whose log-probability is scored.
        sentence = f"{subject} and {word}"
        # Tokenize and move the tensors to the GPU.
        inputs = tokenizer(sentence, return_tensors="pt").to('cuda')
        input_ids = inputs['input_ids']

        # Forward pass without gradient tracking.
        with torch.no_grad():
            logits = model(**inputs).logits

        # The logits at position t score the token at position t + 1, so the
        # last position is dropped and the targets are the ids shifted by one.
        # (The original indexed position t with token t, i.e. no shift.)
        # log_softmax replaces log(softmax(...)) for numerical stability.
        next_token_log_probs = torch.nn.functional.log_softmax(logits[0, :-1].float(), dim=-1)
        targets = input_ids[0, 1:].unsqueeze(-1)
        log_probs = next_token_log_probs.gather(-1, targets).squeeze(-1)

        # Sentence score: sum of per-token log-probabilities. The first token
        # has no preceding context and is therefore not scored
        # (presumably a BOS token added by the tokenizer -- TODO confirm).
        total_log_prob = log_probs.sum().item()
        #print(f"Log probability of the sentence '{sentence}': {total_log_prob}")


  0%|          | 34/21782 [00:27<4:54:46,  1.23it/s] 


KeyboardInterrupt: 

In [16]:
# Preview the first five rows of the filtered table.
df.head(n=5)

Unnamed: 0,index,subject,subject_id,sbj_one_hop,sbj_hop_num,obj_true,obj_true_id,obj_one_hop,obj_true_hop_num,obj_new,obj_new_id,obj_new_one_hop,obj_new_hop_num,view
0,0,Danielle Darrieux,Q234149,"Georges Mitsinkides,stage actor,singer,Henri D...",21,French,Q150,"Madagascar,synthetic language,Canada,Haiti,Cam...",99,English,Q1860,"voiced alveolar approximant,Grenada,Ethiopia,S...",175,222931
1,1,Edwin of Northumbria,Q348955,"English,saint,Hatfield Chase,Whitby Abbey,Germ...",12,Christianity,Q5043,"Bible,Christentum,Jesus,christentum,Small Broc...",25,Islam,Q432,"hadith,Marifa,idea,Ancient Semitic religion,Ne...",76,177019
2,2,Toko Yasuda,Q7813654,"Japan,Touch and Go Records,bass guitar,keyboar...",13,guitar,Q6607,"Instrument of the Year,neck,fretboard,mahogany...",32,piano,Q5994,"Bartolomeo Cristofori,kpf,Metropolitan Museum ...",11,74330
3,3,Autonomous University of Madrid,Q788091,European Alliance for Social Sciences and Huma...,9,Spain,Q29,"Cortes Generales,Extremaduran,spanyol,African ...",231,Sweden,Q34,"Scandinavian studies,svensk,African Developmen...",212,87615
4,4,Lyon,Q456,"International Cities of Refuge Network,Aleppo,...",47,Beirut,Q3820,"Athens,Yerevan,Rmeil,Bachoura,Mousseitbeh,Smal...",51,Manila,Q1461,"Navotas,Panama City,Cartagena de Indias,Ho Chi...",93,2376138


In [None]:
# Print the row label of every entry whose subject is exactly 'Tim Cook'.
# Relies on df having the 0..n-1 index produced by reset_index(drop=True).
for row_label in df.index[df['subject'] == 'Tim Cook']:
    print(row_label)

7214


In [5]:
# Preview the first five rows again before the re-ranking cell runs.
df.head(n=5)

Unnamed: 0,index,subject,subject_id,sbj_one_hop,sbj_hop_num,obj_true,obj_true_id,obj_one_hop,obj_true_hop_num,obj_new,obj_new_id,obj_new_one_hop,obj_new_hop_num,view
0,0,Danielle Darrieux,Q234149,"Georges Mitsinkides,stage actor,singer,Henri D...",21,French,Q150,"Madagascar,synthetic language,Canada,Haiti,Cam...",99,English,Q1860,"voiced alveolar approximant,Grenada,Ethiopia,S...",175,222931
1,1,Edwin of Northumbria,Q348955,"English,saint,Hatfield Chase,Whitby Abbey,Germ...",12,Christianity,Q5043,"Bible,Christentum,Jesus,christentum,Small Broc...",25,Islam,Q432,"hadith,Marifa,idea,Ancient Semitic religion,Ne...",76,177019
2,2,Toko Yasuda,Q7813654,"Japan,Touch and Go Records,bass guitar,keyboar...",13,guitar,Q6607,"Instrument of the Year,neck,fretboard,mahogany...",32,piano,Q5994,"Bartolomeo Cristofori,kpf,Metropolitan Museum ...",11,74330
3,3,Autonomous University of Madrid,Q788091,European Alliance for Social Sciences and Huma...,9,Spain,Q29,"Cortes Generales,Extremaduran,spanyol,African ...",231,Sweden,Q34,"Scandinavian studies,svensk,African Developmen...",212,87615
4,4,Lyon,Q456,"International Cities of Refuge Network,Aleppo,...",47,Beirut,Q3820,"Athens,Yerevan,Rmeil,Bachoura,Mousseitbeh,Smal...",51,Manila,Q1461,"Navotas,Panama City,Cartagena de Indias,Ho Chi...",93,2376138


In [6]:
from tqdm import tqdm

def sentence_log_prob(sentence, model, tokenizer, device='cuda'):
    """Return the summed next-token log-probability of ``sentence``.

    The sentence is tokenized, run through ``model`` once, and every token
    except the first is scored with the distribution predicted from the
    tokens before it.

    Args:
        sentence: Text to score.
        model: Causal language model whose output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``; called with
            ``return_tensors="pt"``.
        device: Device the tokenized inputs are moved to.

    Returns:
        Python float: sum of the per-token log-probabilities.
    """
    inputs = tokenizer(sentence, return_tensors="pt").to(device)
    input_ids = inputs['input_ids']

    with torch.no_grad():
        logits = model(**inputs).logits

    # The logits at position t score the token at position t + 1, so the last
    # position is dropped and the targets are the ids shifted by one.
    # (The original indexed position t with token t, i.e. no shift.)
    # log_softmax replaces log(softmax(...)) for numerical stability.
    next_token_log_probs = torch.nn.functional.log_softmax(logits[0, :-1].float(), dim=-1)
    targets = input_ids[0, 1:].unsqueeze(-1)
    return next_token_log_probs.gather(-1, targets).squeeze(-1).sum().item()


# Inference only: put the model in evaluation mode.
model.eval()

# For every row, score "<subject> and <word>" for each comma-separated entry
# of 'sbj_one_hop' and rewrite the cell with the entries ordered from the
# highest to the lowest score.
for i in tqdm(range(len(df))):
    subject = df.loc[i, 'subject']
    sbj_hops = df.loc[i, 'sbj_one_hop']

    # word -> score; a word that appears twice keeps a single entry.
    x = dict()
    for word in sbj_hops.split(','):
        x[word] = sentence_log_prob(f"{subject} and {word}", model, tokenizer)

    sorted_keys = sorted(x, key=x.get, reverse=True)
    df.loc[i, 'sbj_one_hop'] = ','.join(sorted_keys)

# Persist the re-ordered table; written only after the whole loop finishes.
df.to_excel("ex_f_pre2.xlsx", index = False)

  0%|          | 0/21782 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
  0%|          | 11/21782 [00:07<4:23:11,  1.38it/s]


KeyboardInterrupt: 

: 