In [3]:
import pandas as pd
from datasets import load_dataset
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments, AutoTokenizer, DistilBertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# 데이터셋 로드
dataset = load_dataset("dair-ai/emotion")

# 데이터프레임으로 변환 (데이터 확인용, 선택사항)
df_train = pd.DataFrame(dataset['train'])
df_validation = pd.DataFrame(dataset['validation'])
df_test = pd.DataFrame(dataset['test'])

print("Train set:")
print(df_train.head())

print("\nValidation set:")
print(df_validation.head())

print("\nTest set:")
print(df_test.head())

# 토크나이저 로드 및 데이터 전처리
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 데이터셋 크기를 40분의 1로 줄임
train_size = len(tokenized_datasets['train']) // 20
validation_size = len(tokenized_datasets['validation']) // 20
test_size = len(tokenized_datasets['test']) // 20

small_train_dataset = tokenized_datasets['train'].select(range(train_size))
small_validation_dataset = tokenized_datasets['validation'].select(range(validation_size))
small_test_dataset = tokenized_datasets['test'].select(range(test_size))

# 모델 로드
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=6)

# 메트릭 계산 함수 정의
def compute_metrics(p):
    import numpy as np
    preds = np.argmax(p.predictions, axis=1)
    acc = accuracy_score(p.label_ids, preds)
    f1 = f1_score(p.label_ids, preds, average='weighted')
    return {"accuracy": acc, "f1": f1}

# 훈련 인자 설정
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# 트레이너 설정 및 조기 종료 콜백 추가
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_validation_dataset,
    compute_metrics=compute_metrics,  # 메트릭 함수 추가
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer.train()

validation_results = trainer.evaluate()
print("\nValidation results:")
print(validation_results)

test_results = trainer.evaluate(eval_dataset=small_test_dataset)
print("\nTest results:")
print(test_results)

# Fine-tuning 후 모델을 저장
model.save_pretrained("./results")
tokenizer.save_pretrained("./results")

# 파이프라인을 사용하여 감정 분류
from transformers import pipeline

emotion_classifier = pipeline("text-classification", model="./results", tokenizer="./results", return_all_scores=True)

# 예시 텍스트에 대한 감정 분류
example_text = "im feeling quite sad and sorry for myself but ill snap out of it soon"
predictions = emotion_classifier(example_text)

print("\nPredictions:")
print(predictions)



Train set:
                                                text  label
0                            i didnt feel humiliated      0
1  i can go from feeling so hopeless to so damned...      0
2   im grabbing a minute to post i feel greedy wrong      3
3  i am ever feeling nostalgic about the fireplac...      2
4                               i am feeling grouchy      3

Validation set:
                                                text  label
0  im feeling quite sad and sorry for myself but ...      0
1  i feel like i am still looking at a blank canv...      0
2                     i feel like a faithful servant      2
3                  i am just feeling cranky and blue      3
4  i can have for a treat or if i am feeling festive      1

Test set:
                                                text  label
0  im feeling rather rotten so im not very ambiti...      0
1          im updating my blog because i feel shitty      0
2  i never make her separate from me because i do...      0
3

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/75 [00:00<?, ?it/s]

{'loss': 1.6547, 'grad_norm': 1.771509051322937, 'learning_rate': 8.666666666666667e-05, 'epoch': 0.4}
{'loss': 1.4732, 'grad_norm': 4.145961284637451, 'learning_rate': 7.333333333333333e-05, 'epoch': 0.8}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.045021414756775, 'eval_accuracy': 0.64, 'eval_f1': 0.5135115864527628, 'eval_runtime': 53.802, 'eval_samples_per_second': 1.859, 'eval_steps_per_second': 0.074, 'epoch': 1.0}
{'loss': 1.1747, 'grad_norm': 4.593270301818848, 'learning_rate': 6e-05, 'epoch': 1.2}
{'loss': 1.053, 'grad_norm': 11.470431327819824, 'learning_rate': 4.666666666666667e-05, 'epoch': 1.6}
{'loss': 0.8536, 'grad_norm': 5.1431989669799805, 'learning_rate': 3.3333333333333335e-05, 'epoch': 2.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.713198184967041, 'eval_accuracy': 0.79, 'eval_f1': 0.7525587027914614, 'eval_runtime': 42.0753, 'eval_samples_per_second': 2.377, 'eval_steps_per_second': 0.095, 'epoch': 2.0}
{'loss': 0.6229, 'grad_norm': 2.853569269180298, 'learning_rate': 2e-05, 'epoch': 2.4}
{'loss': 0.6304, 'grad_norm': 3.51495623588562, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.8}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.630265474319458, 'eval_accuracy': 0.82, 'eval_f1': 0.8105693804403483, 'eval_runtime': 44.4288, 'eval_samples_per_second': 2.251, 'eval_steps_per_second': 0.09, 'epoch': 3.0}
{'train_runtime': 3493.0857, 'train_samples_per_second': 0.687, 'train_steps_per_second': 0.021, 'train_loss': 1.033127899169922, 'epoch': 3.0}


  0%|          | 0/4 [00:00<?, ?it/s]


Validation results:
{'eval_loss': 0.630265474319458, 'eval_accuracy': 0.82, 'eval_f1': 0.8105693804403483, 'eval_runtime': 43.6794, 'eval_samples_per_second': 2.289, 'eval_steps_per_second': 0.092, 'epoch': 3.0}


  0%|          | 0/4 [00:00<?, ?it/s]


Test results:
{'eval_loss': 0.8035635948181152, 'eval_accuracy': 0.71, 'eval_f1': 0.6939156684911834, 'eval_runtime': 45.305, 'eval_samples_per_second': 2.207, 'eval_steps_per_second': 0.088, 'epoch': 3.0}

Predictions:
[[{'label': 'LABEL_0', 'score': 0.7109132409095764}, {'label': 'LABEL_1', 'score': 0.02293119952082634}, {'label': 'LABEL_2', 'score': 0.029121406376361847}, {'label': 'LABEL_3', 'score': 0.11977823078632355}, {'label': 'LABEL_4', 'score': 0.09192276000976562}, {'label': 'LABEL_5', 'score': 0.025333072990179062}]]




In [None]:
from transformers import pipeline, AutoTokenizer

# 분류기와 토크나이저 로드
classifier = pipeline("text-classification", model='./results', tokenizer='./results', return_all_scores=True)
tokenizer = AutoTokenizer.from_pretrained("./results")

# 긴 텍스트를 나누는 함수
def chunk_text(text, tokenizer, chunk_size=510):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
    return [tokenizer.decode(chunk) for chunk in chunks]

# 긴 텍스트를 나눠 처리하고 결과를 결합하는 함수
def classify_long_text(text, classifier, tokenizer):
    chunks = chunk_text(text, tokenizer)
    all_scores = []
    
    for chunk in chunks:
        prediction = classifier(chunk)
        all_scores.extend(prediction)  # 각 청크의 예측을 결합
    
    return all_scores

# 긴 텍스트
lyrics = """I'm at a payphone trying to call home All of my change I've spent on you Where are the times gone baby It's all wrong, where are the plans we made for two Yeah, I, I know it's hard to remember The people we used to be It's even harder to picture That you're not here next to me You said it's too late to make it But is it too late to try? And then that time that you wasted All of our bridges burned down I've wasted my nights You turned out the lights Now I'm paralyzed Still stucked in that time when we called it love But even the sun sets in paradise I'm at a payphone trying to call home All of my change I've spent on you Where are the times gone baby It's all wrong, where are the plans we made for two If happy ever after did exist I would still be holding you like this And all those fairytales are full of shit One more fucking love song I'll be sick You turned your back on tomorrow Cause you forgot yesterday I gave you my love to borrow But you just gave it away You can't expect me to be a fine I don't expect you to care I know I said it before But all of our bridges burnt down I've wasted my nights You turned out the lights Now I'm paralyzed Still stucked in that time when we called it love But even the sun sets in paradise I'm at a payphone trying to call home All of my change I've spent on you Where are the times gone baby It's all wrong, where are the plans we made for two If happy ever after did exist I would still be holding you like this And all those fairytales are full of shit One more fucking love song I'll be sick Now I'm at a payphone Man fuck that shit I'll be right here spending all this money while you sitting round Wondering why wasn't you who came out from nothing Made it from the botton Now when you see me I'm struting And all of my cause a way to push up a button Telling me the chances I blew up or whatever you call it Switched the number to my phone So you never can call it Don't need my name, or my show You can tell it I'm ballin' Shish, what a shame coulda got picked Had a really good game but you missed your last shot So you talk about who you see at the top Or what you could've saw But sad to say it's over for it Phantom roll out valet open doors Where's the car way, got what you was looking for Now ask me who they want So you can go take that little piece of shit with you I'm at a payphone trying to call home All of my change I've spent on you Where are the times gone baby It's all wrong, where are the plans we made for two If happy ever after did exist I would still be holding you like this And all these fairytales are full of shit One more fucking love song I'll be sick Now I'm at a payphone"""

# 긴 텍스트 분류 및 결과 출력
predictions = classify_long_text(lyrics, classifier, tokenizer)
print(predictions)


Token indices sequence length is longer than the specified maximum sequence length for this model (656 > 512). Running this sequence through the model will result in indexing errors


[[{'label': 'LABEL_0', 'score': 0.9979704022407532}, {'label': 'LABEL_1', 'score': 0.0004122512764297426}, {'label': 'LABEL_2', 'score': 0.0005707133677788079}, {'label': 'LABEL_3', 'score': 0.00036665392690338194}, {'label': 'LABEL_4', 'score': 0.0002818174834828824}, {'label': 'LABEL_5', 'score': 0.00039819441735744476}], [{'label': 'LABEL_0', 'score': 0.9980321526527405}, {'label': 'LABEL_1', 'score': 0.0003956361033488065}, {'label': 'LABEL_2', 'score': 0.0005570072680711746}, {'label': 'LABEL_3', 'score': 0.0003561170888133347}, {'label': 'LABEL_4', 'score': 0.00027176193543709815}, {'label': 'LABEL_5', 'score': 0.0003873667155858129}]]


In [11]:
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# 모델과 토크나이저 로드
model_name = "bhadresh-savani/bert-base-uncased-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# 긴 텍스트를 나누는 함수
def chunk_text(text, tokenizer, chunk_size=510):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
    return [tokenizer.decode(chunk) for chunk in chunks]

# 긴 텍스트를 나눠 처리하고 결과를 결합하는 함수
def classify_long_text(text, classifier, tokenizer):
    chunks = chunk_text(text, tokenizer)
    all_scores = []

    for chunk in chunks:
        prediction = classifier(chunk)
        all_scores.extend(prediction)  # 각 청크의 예측을 결합
    
    # 결과를 감정별로 합치기
    final_scores = {}
    chunk_count = {label['label']: 0 for label in all_scores[0]}

    for score_list in all_scores:
        for score in score_list:
            label = score['label']
            if label not in final_scores:
                final_scores[label] = 0
            final_scores[label] += score['score']
            chunk_count[label] += 1

    # 총 점수를 평균 점수로 변환
    for label in final_scores:
        final_scores[label] /= chunk_count[label]

    return final_scores

# 긴 텍스트
lyrics = """I'm at a payphone trying to call home All of my change I've spent on you Where are the times gone baby It's all wrong, where are the plans we made for two Yeah, I, I know it's hard to remember The people we used to be It's even harder to picture That you're not here next to me You said it's too late to make it But is it too late to try? And then that time that you wasted All of our bridges burned down I've wasted my nights You turned out the lights Now I'm paralyzed Still stucked in that time when we called it love But even the sun sets in paradise I'm at a payphone trying to call home All of my change I've spent on you Where are the times gone baby It's all wrong, where are the plans we made for two If happy ever after did exist I would still be holding you like this And all those fairytales are full of shit One more fucking love song I'll be sick You turned your back on tomorrow Cause you forgot yesterday I gave you my love to borrow But you just gave it away You can't expect me to be a fine I don't expect you to care I know I said it before But all of our bridges burnt down I've wasted my nights You turned out the lights Now I'm paralyzed Still stucked in that time when we called it love But even the sun sets in paradise I'm at a payphone trying to call home All of my change I've spent on you Where are the times gone baby It's all wrong, where are the plans we made for two If happy ever after did exist I would still be holding you like this And all those fairytales are full of shit One more fucking love song I'll be sick Now I'm at a payphone Man fuck that shit I'll be right here spending all this money while you sitting round Wondering why wasn't you who came out from nothing Made it from the botton Now when you see me I'm struting And all of my cause a way to push up a button Telling me the chances I blew up or whatever you call it Switched the number to my phone So you never can call it Don't need my name, or my show You can tell it I'm ballin' Shish, what a shame coulda got picked Had a really good game but you missed your last shot So you talk about who you see at the top Or what you could've saw But sad to say it's over for it Phantom roll out valet open doors Where's the car way, got what you was looking for Now ask me who they want So you can go take that little piece of shit with you I'm at a payphone trying to call home All of my change I've spent on you Where are the times gone baby It's all wrong, where are the plans we made for two If happy ever after did exist I would still be holding you like this And all these fairytales are full of shit One more fucking love song I'll be sick Now I'm at a payphone"""

# 파이프라인을 사용하여 감정 분류
emotion_classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)

# 긴 텍스트 분류 및 결과 출력
predictions = classify_long_text(lyrics, emotion_classifier, tokenizer)
print(predictions)


Token indices sequence length is longer than the specified maximum sequence length for this model (656 > 512). Running this sequence through the model will result in indexing errors


{'sadness': 0.6237108707427979, 'joy': 0.3251424548216164, 'love': 0.01956860619247891, 'anger': 0.01791606517508626, 'fear': 0.01211931649595499, 'surprise': 0.0015426334575749934}
