# 0. Install & import libraries

In [1]:
# 0. Install & import libraries
from IPython.display import clear_output 

!pip install transformers==4.8.2
!pip install sentencepiece==0.1.96
!pip install tensorflow_addons
!pip install dill==0.2.8.2

clear_output()

import sklearn
import tensorflow
import transformers
import tensorflow_addons

print(sklearn.__version__) # 1.0.2 
print(tensorflow.__version__) # 2.8.0  
print(transformers.__version__) # 4.8.2  
print(tensorflow_addons.__version__) # 0.16.1 

import sentencepiece

import pandas as pd
import numpy as np

import os
import re
import pickle 
import dill # for saving a function as a file(.pkl)
import logging # for changing the tf's logging level
import urllib.request
from tqdm import tqdm

from sklearn import model_selection
from sklearn.metrics import accuracy_score

import tensorflow as tf
import tensorflow_addons as tfa # for using Rectified-Adam optimizer (instead of Adam optimizer) 
from tensorflow.keras import layers, initializers, losses, optimizers, metrics, callbacks

import transformers
from transformers import TFBertModel # BertTokenizer 제외

import sentencepiece as spm
import ast  # str list to list

 The versions of TensorFlow you are currently using is 2.12.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


1.2.2
2.12.0
4.8.2
0.19.0


In [2]:
# Fix the random seeds (TensorFlow and NumPy)
tf.random.set_seed(1234)
np.random.seed(1234)

# Change the Transformers logging level (WARNING -> ERROR) @ https://huggingface.co/transformers/main_classes/logging.html
transformers.logging.set_verbosity(transformers.logging.ERROR)

# Change the TensorFlow logging level to ERROR
tf.get_logger().setLevel(logging.ERROR)

In [3]:
# Load KoBERT: clone the KoBERT-Transformers repository and move its tokenizer module into /content
!git clone https://github.com/monologg/KoBERT-Transformers.git
!mv KoBERT-Transformers/kobert_transformers/tokenization_kobert.py /content
clear_output() # clear the output

# Create the tokenizer
# (this import has to come after the clone/move above, which puts tokenization_kobert.py in place)
from tokenization_kobert import KoBertTokenizer 

tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

Downloading:   0%|          | 0.00/371k [00:00<?, ?B/s]

Downloading: 0.00B [00:00, ?B/s]

Downloading:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

# Import Google Drive

In [4]:
# Mount Google Drive at /gdrive (requires the Colab runtime, which provides google.colab)
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [5]:
# Display the current working directory
os.getcwd()

'/content'

In [6]:
# Only these paths need to be changed!
path = '/gdrive/MyDrive/omz'# google drive project path
data_path = 'colab_data/temp_data/' # model data path
checkpoint_path = 'colab_data/saved_models/' # saved model path

# Make the project folder the working directory; the relative paths above are resolved against it
os.chdir(path)

# Load the Data

In [7]:
# Read the full song table; '\n' is passed explicitly as the row terminator
df_allsongs_original = pd.read_csv('song_data/df_allsongs.csv', lineterminator='\n')

In [8]:
# Preview the first three rows of the raw table
df_allsongs_original.head(3)

Unnamed: 0.1,Unnamed: 0,Title,SongID,Artist,Date,Genre,Lyric,preprocess_Lyric_ver2,preprocess_Lyric_ver3
0,0,사랑하기 싫어,36206208,지아,2023.02.24,발라드,"<div class=""lyric"" id=""d_video_summary""><!-- h...",숨을 내쉴 때마다 눈을 감을 때마다 니 생각이 나 먹먹해진다 하루 열두 시간이라면 ...,"['숨을 내쉴 때마다', '눈을 감을 때마다', '니 생각이 나 먹먹해진다', '하..."
1,1,나비무덤,36235518,포맨 (4MEN),2023.03.01,발라드,"<div class=""lyric"" id=""d_video_summary""><!-- h...",Standing beside you as sleep Wipe my tears as ...,"['Standing beside you as sleep', 'Wipe my tear..."
2,2,사랑인가 봐,34657844,멜로망스,2022.02.18,"발라드, 국내드라마","<div class=""lyric"" id=""d_video_summary""><!-- h...",너와 함께 하고 싶은 일들을 상상하는 게 요즘 내 일상이 되고 너의 즐거워하는 모습...,"['너와 함께 하고 싶은 일들을', '상상하는 게', '요즘 내 일상이 되고', '..."


In [9]:
# Keep only the columns needed for tagging and take rows 1750..1999 by position.
# The explicit .copy() makes df_allsongs an independent frame: result columns are
# added to it and filled in later, and writing into a slice of a temporary frame
# is what triggers pandas' SettingWithCopyWarning.
song_columns = ["Title", "Artist", "Date", "Genre", "preprocess_Lyric_ver3"]
df_allsongs = df_allsongs_original.loc[:, song_columns].iloc[1750:2000].copy()
df_allsongs

Unnamed: 0,Title,Artist,Date,Genre,preprocess_Lyric_ver3
1750,디카페인(decaffeine) (Feat. BB),Option,2022.08.20,"R&B/Soul, 인디음악","['이제 내 차례야 비켜 비켜 디카페인', '예전 것들은 잊어 잊어 lady', '..."
1751,달라진 니 마음,정효빈,2019.09.05,"발라드, 인디음악","['더 가지 말자 우리 여기까지만', '니 식어버린 마음들이', '날 초라하게 해 ..."
1752,깜빡 (Feat. 카더가든),유라 (youra),2019.03.05,"인디음악, 일렉트로니카","['깜빡 하는 가로등 속에', '우리 둘이 안고 있어요', '예쁜 잔에 먹고 싶은 ..."
1753,25,볼빨간사춘기,2019.09.10,"인디음악, 록/메탈","[""It's kinda lonely lonely"", '사람들 틈에 껴 있는 거', ..."
1754,입춘,한로로,2022.03.14,"인디음악, 록/메탈","['얼어붙은 마음에', '누가 입 맞춰줄까요', '봄을 기다린다는 말', '그 말의..."
...,...,...,...,...,...
1995,그냥 (Just),Zion.T,2015.02.02,R&B/Soul,"['내가 안쓰러워 보여', '인사하는 거라면', '내 마음 다칠까 걱정 말고', '..."
1996,사진을 보다가,바이브,2003.11.28,R&B/Soul,"['사진을 보다가 한쪽을 찢었어 ', '지금 우리처럼 한쪽을 찢었어 ', '난 남자..."
1997,지금이 아닌데,BIG Naughty (서동현),2022.11.18,"R&B/Soul, 국내드라마","['결국 이렇게 다', '부질없게', '만들 거였다면', '그때 알았다면', '결국..."
1998,Bambi,백현 (BAEKHYUN),2021.03.30,R&B/Soul,"['Feel it like 메말라 있던 맘 위로', '스며든 단비', '답이 필요 ..."


In [10]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" line to the output.
df_allsongs.info()
print(df_allsongs.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 1750 to 1999
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Title                  250 non-null    object
 1   Artist                 250 non-null    object
 2   Date                   250 non-null    object
 3   Genre                  250 non-null    object
 4   preprocess_Lyric_ver3  250 non-null    object
dtypes: object(5)
memory usage: 9.9+ KB
None
            Title             Artist        Date     Genre  \
count         250                250         250       250   
unique        249                150         217         9   
top     Vancouver  BIG Naughty (서동현)  2023.02.28  R&B/Soul   
freq            2                 15           4       159   

                                    preprocess_Lyric_ver3  
count                                                 250  
unique                                                250  

In [11]:
# Inspect one lyric entry (13th row of the slice); the value is a string containing a list literal of lyric lines
df_allsongs.preprocess_Lyric_ver3.iloc[12]

"['내일이면 나를 버릴 사람들 ', '걱정하는게 아니에요 ', '내일이면 난 다시  ', '바다 건너에 홀로 남을  ', '그대는 괜찮나요 ', '내 귓가에 노래를 불러 넣어줘요 ', '다른 새소리가 들려오지 않게 ', '유일했던 사랑을 두고 가는 내게 ', '숨겨뒀던 손수건을 흔들어줘요 ', 'hey let your bright light shine  ', 'on me ', 'can you love me unconditionally', 'and sing a million lullabies  ', 'on a sleepy day ', 'hey let your sea breeze blow on me ', 'when i am sailing internationally ', 'and whisper all your prayers  ', 'on a stormy day ', '그대 입안에  ', '내 숨을 불어 넣어줬죠 ', '그 작은 심장이 내려앉을 때마다 ', '내일이면 날 잡을 수도 없어요 ', '홀로 남을 그대는 괜찮나요', 'hey let your bright light shine  ', 'on me ', 'can you love me unconditionally ', 'and sing a million lullabies  ', 'on a sleepy day ', 'hey let your sea breeze blow on me ', 'when i am sailing internationally ', 'and whisper all your prayers  ', 'on a stormy day ', 'hey 눈을 붉혀선 안돼요 ', '우리 다시 만나는 날에는 ', '같이 늙고 싶다고 ', '약 약속을 해줄께요', 'hey let your bright light shine ', 'on me', 'can you love me unconditionally', 'and sing a million lullabies ', 'on a sleepy day', 'hey let you

# Load saved Model

In [13]:
def create_model(max_length=300):
    """Build and compile the KoBERT-based 4-class classifier.

    Args:
        max_length: Length of each of the three integer input sequences
            (token ids, attention masks, segment ids).

    Returns:
        A compiled ``tf.keras.Model`` that takes
        ``[input_token_ids, input_masks, input_segments]`` and outputs a
        4-unit softmax.
    """
    # Pretrained KoBERT encoder, loaded from its PyTorch weights.
    kobert = TFBertModel.from_pretrained("monologg/kobert", from_pt=True)

    # Three int32 inputs, all of the same fixed length.
    sequence_shape = (max_length,)
    input_token_ids = layers.Input(sequence_shape, dtype=tf.int32, name='input_token_ids')  # tokens_tensor
    input_masks = layers.Input(sequence_shape, dtype=tf.int32, name='input_masks')          # masks_tensor
    input_segments = layers.Input(sequence_shape, dtype=tf.int32, name='input_segments')    # segments_tensor
    model_inputs = [input_token_ids, input_masks, input_segments]

    # Element 1 of the BERT outputs is the pooled output ('pooler_output').
    pooled = kobert(model_inputs)[1]
    pooled = layers.Dropout(0.2)(pooled)

    # Classification head: 4-way softmax.
    classifier_head = layers.Dense(
        units=4,
        activation='softmax',
        kernel_initializer=initializers.TruncatedNormal(stddev=0.02),
        name="classifier",
    )
    class_probabilities = classifier_head(pooled)

    model = tf.keras.Model(inputs=model_inputs, outputs=class_probabilities)

    # Rectified Adam (from tensorflow_addons) with weight decay and warm-up.
    rectified_adam = tfa.optimizers.RectifiedAdam(
        learning_rate=1e-5,
        weight_decay=0.0025,
        warmup_proportion=0.05,
    )
    model.compile(
        optimizer=rectified_adam,
        loss=losses.SparseCategoricalCrossentropy(),
        metrics=[metrics.SparseCategoricalAccuracy()],
    )

    return model

In [14]:
# 1) Load the Model-builder (function)
# NOTE(review): this rebinds `create_model`, replacing the function defined in the previous cell.
# NOTE(review): unpickling can execute code stored in the file - only load .pkl files from a trusted source.
with open(data_path + 'model_BERTfunction_v1.pkl', 'rb') as f:
    create_model = pickle.load(f) # presumably the function was saved with dill (see the `import dill` comment) - TODO confirm

# 2) Load the Bert-tokenizer 
# NOTE(review): this rebinds `tokenizer`, replacing the one created with KoBertTokenizer.from_pretrained earlier.
with open(data_path + 'tokenizer-bert.pkl', 'rb') as f:
    tokenizer = pickle.load(f) 


# 3) Create the model & load the Model-weights (from checkpoint file)
# The TPU address is read from the COLAB_TPU_ADDR environment variable (KeyError if it is not set).
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver) 
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.TPUStrategy(resolver) # Obsolete : tf.distribute.experimental.TPUStrategy()

# Build the model inside the TPU strategy scope
with strategy.scope(): 
    model = create_model(max_length=300)

# Restore the weights from the .h5 checkpoint file
model.load_weights(filepath=checkpoint_path + 'best_bert_weights.h5')

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


Downloading:   0%|          | 0.00/426 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/369M [00:00<?, ?B/s]

# Classifier as a function

In [15]:
def predict_sentiment_element(sentence, tokenizer, model, max_length=300):
    """Return the class probabilities (in percent) predicted for one sentence.

    Args:
        sentence: Text of a single lyric line.
        tokenizer: Object with an ``encode_plus`` method that returns a mapping
            containing 'input_ids', 'attention_mask' and 'token_type_ids'.
        model: Object with a ``predict`` method; it receives a tuple of three
            arrays (token ids, attention masks, segment ids), each reshaped to
            a single row.
        max_length: Length every sentence is padded/truncated to. Defaults to
            300, the value that used to be hard-coded here; it presumably has
            to match the length the model was built with - confirm when
            changing it.

    Returns:
        The output of ``model.predict`` multiplied by 100 and rounded to two
        decimals. The caller in this notebook reads it as one row holding the
        classes in the order 분노혐오, 놀람공포, 슬픔, 행복.
    """
    # Strip everything that is not whitespace, a digit, a Latin letter or Hangul.
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    cleaned_sentence = re.sub(r"[^\s0-9a-zA-Zㄱ-ㅎㅏ-ㅣ가-힣]", "", sentence)

    # Tokenizing / Tokens to sequence numbers / Padding
    encoded_dict = tokenizer.encode_plus(text=cleaned_sentence,
                                         padding='max_length',
                                         truncation=True,
                                         max_length=max_length)

    # Each encoded sequence becomes a batch of one: shape (1, sequence length).
    token_ids = np.array(encoded_dict['input_ids']).reshape(1, -1)
    token_masks = np.array(encoded_dict['attention_mask']).reshape(1, -1)
    token_segments = np.array(encoded_dict['token_type_ids']).reshape(1, -1)

    new_inputs = (token_ids, token_masks, token_segments)

    # Prediction, converted from fractions to percentages
    pred = model.predict(new_inputs)
    all_pred_proba = np.round(pred * 100, 2)

    return all_pred_proba

In [16]:
def predict_sentiment_lyrics(lyrics, verbose=False):
    """Aggregate sentence-level sentiment into one distribution for a song.

    ``predict_sentiment_element`` is applied to every non-empty lyric line
    (using the notebook-level ``tokenizer`` and ``model``); the per-line
    percentages are summed over all lines and normalised so the four values
    add up to about 1.

    Earlier, each line's values overwrote the running totals and were then
    doubled (``anger += anger``), so the result reflected only the last line.
    The totals are now truly accumulated across lines.

    Args:
        lyrics: String containing a Python list literal of lyric lines.
        verbose: When True, print the number of lines and the lines
            themselves (debug output that used to be printed unconditionally).

    Returns:
        Tuple ``(total_array, top_pred_class, top_pred_prob)``:
        ``total_array`` has shape (1, 4) with the normalised scores rounded to
        two decimals, in the order 분노혐오, 놀람공포, 슬픔, 행복;
        ``top_pred_class`` is the label with the highest score and
        ``top_pred_prob`` is that score. If the lyrics contain no non-empty
        line, ``(array of NaN, None, NaN)`` is returned instead of failing.
    """
    class_names = ['분노혐오', '놀람공포', '슬픔', '행복']

    # str list to list; empty elements (left over from splitting on <br/>) are dropped
    sentences = [line for line in ast.literal_eval(lyrics) if line != '']

    if verbose:
        print(len(sentences))
        print(sentences)

    # Nothing to classify: return an explicit "no result" instead of crashing.
    if not sentences:
        return np.full((1, len(class_names)), np.nan), None, np.nan

    # Sum the per-class percentages over all lines.
    class_totals = np.zeros(len(class_names), dtype=float)
    for sentence in sentences:
        all_pred_proba = predict_sentiment_element(sentence, tokenizer, model)
        class_totals += np.asarray(all_pred_proba, dtype=float)[0]

    # Normalise by the grand total so the scores form a distribution.
    total_array = np.round(class_totals / class_totals.sum(), 2).reshape(1, -1)

    top_index = int(np.argmax(total_array, axis=1)[0])
    top_pred_prob = total_array[0][top_index]
    top_pred_class = class_names[top_index]

    return total_array, top_pred_class, top_pred_prob

# Apply model

In [20]:
# Work on an independent copy so the assignments below never target a view of another frame.
df_allsongs = df_allsongs.copy()

# Result columns start out as empty strings and are filled in song by song.
df_allsongs['Emotion'] = ""
df_allsongs['Probability'] = ""
df_allsongs['분노혐오'] = ""
df_allsongs['놀람공포'] = ""
df_allsongs['슬픔'] = ""
df_allsongs['행복'] = ""

# df_allsongs.index holds row labels (1750..1999 for this slice), not positions,
# so rows are addressed with label-based .loc; positional .iloc[i] is out of
# bounds on a 250-row frame. A single .loc[row, column] write also avoids the
# chained assignment df[col].iloc[i] = ..., which may write to a temporary.
for i in tqdm(df_allsongs.index):
    lyrics = df_allsongs.loc[i, 'preprocess_Lyric_ver3']
    total_array, top_pred_class, top_pred_prob = predict_sentiment_lyrics(lyrics)
    df_allsongs.loc[i, 'Emotion'] = top_pred_class
    df_allsongs.loc[i, 'Probability'] = top_pred_prob
    df_allsongs.loc[i, '분노혐오'] = total_array[0][0]
    df_allsongs.loc[i, '놀람공포'] = total_array[0][1]
    df_allsongs.loc[i, '슬픔'] = total_array[0][2]
    df_allsongs.loc[i, '행복'] = total_array[0][3]

    # The CSV is rewritten after every song (presumably so partial results
    # survive an interrupted session).
    df_allsongs.to_csv('songtagged_1750_2000.csv')

clear_output()