In [18]:
!pip install transformers





In [19]:
import transformers
transformers.__version__

'4.8.2'

In [20]:
import pandas as pd
import numpy as np
import urllib.request
import os
from tqdm import tqdm
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

In [21]:
data = pd.read_csv("realdata.csv")

In [22]:
data.drop('Unnamed: 0' ,axis=1, inplace=True)
data.columns=['document','label']
data.head()

Unnamed: 0,document,label
0,"어, 청소 니가 대신 해 줘!",4
1,둘 다 청소 하기 싫어. 귀찮아.,4
2,둘 다 하기 싫어서 화내.,0
3,그럼 방세는 어떡해.,1
4,권태긴줄 알았는데 다른 사람이 생겼나보더라고.,1


In [23]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data, test_size=0.2, random_state=0)

print(len(train_data)) #35192 출력 test_size가 0.2일 때
print(len(test_data)) #8799 출력

35192
8799


In [24]:
train_data = train_data.dropna(how = 'any') # Null 값이 존재하는 행 제거
train_data = train_data.reset_index(drop=True)
print(train_data.isnull().values.any()) # Null 값이 존재하는지 확인

False


In [25]:
test_data = test_data.dropna(how = 'any') # Null 값이 존재하는 행 제거
test_data = test_data.reset_index(drop=True)
print(test_data.isnull().values.any()) # Null 값이 존재하는지 확인

False


In [26]:
tokenizer = BertTokenizer.from_pretrained('klue/bert-base')

In [27]:
max_seq_len = 128

In [28]:
def convert_examples_to_features(examples, labels, max_seq_len, tokenizer):

    input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []

    for example, label in tqdm(zip(examples, labels), total=len(examples)):
        input_id = tokenizer.encode(example, max_length=max_seq_len, pad_to_max_length=True)
        padding_count = input_id.count(tokenizer.pad_token_id)
        attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count
        token_type_id = [0] * max_seq_len

        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        data_labels.append(label)

    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)

    data_labels = np.asarray(data_labels, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), data_labels

In [29]:
train_X, train_y = convert_examples_to_features(train_data['document'], train_data['label'], max_seq_len=max_seq_len, tokenizer=tokenizer)

  0%|                                                                                        | 0/35192 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████████████████████████████████████████████████████████████████████| 35192/35192 [00:18<00:00, 1942.91it/s]


In [30]:
test_X, test_y = convert_examples_to_features(test_data['document'], test_data['label'], max_seq_len=max_seq_len, tokenizer=tokenizer)

100%|████████████████████████████████████████████████████████████████████████████| 8799/8799 [00:04<00:00, 1909.76it/s]


In [31]:
#최대 길이: 128
input_id = train_X[0][0]
attention_mask = train_X[1][0]
token_type_id = train_X[2][0]
label = train_y[0]

print('단어에 대한 정수 인코딩 :',input_id)
print('어텐션 마스크 :',attention_mask)
print('세그먼트 인코딩 :',token_type_id)
print('각 인코딩의 길이 :', len(input_id))
print('정수 인코딩 복원 :',tokenizer.decode(input_id))
print('레이블 :',label)

단어에 대한 정수 인코딩 : [    2  5442 14023  2155  2088  6863  2116   864   814 15351   858  2203
    18  4224   720  3824  3611  2069  6490  3628  2069  3818  2088  1513
  2051    18     3     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
어텐션 마스크 : [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [32]:
model = TFBertModel.from_pretrained("klue/bert-base", from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'bert.embeddings.position_ids', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [33]:
max_seq_len = 128

In [34]:
input_ids_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
attention_masks_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
token_type_ids_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)

outputs = model([input_ids_layer, attention_masks_layer, token_type_ids_layer])

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


In [35]:
print(outputs)

TFBaseModelOutputWithPooling(last_hidden_state=<KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'tf_bert_model')>, pooler_output=<KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'tf_bert_model')>, hidden_states=None, attentions=None)


In [None]:
'''
# TPU 작동을 위한 코드 TPU 작동을 위한 코드
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
'''

In [None]:
#strategy = tf.distribute.experimental.TPUStrategy(resolver)

In [None]:
'''
with strategy.scope():
  model = TFBertForSequenceClassification("klue/bert-base")
  optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
  loss = tf.keras.losses.BinaryCrossentropy()
  model.compile(optimizer=optimizer, loss=loss, metrics = ['accuracy'])
  '''

In [41]:
from transformers import TFBertForSequenceClassification
model = TFBertForSequenceClassification.from_pretrained("klue/bert-base", num_labels=7, from_pt=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics = ['accuracy'])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
#model.fit(train_X, train_y, epochs=2, batch_size=64)

Epoch 1/2


AttributeError: in user code:

    File "C:\Users\hi\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\engine\training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\hi\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\engine\training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\hi\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\engine\training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\hi\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\engine\training.py", line 1081, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\hi\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\engine\training.py", line 1139, in compute_loss
        return self.compiled_loss(
    File "C:\Users\hi\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\hi\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\losses.py", line 142, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\hi\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\losses.py", line 268, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\hi\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\engine\training.py", line 1139, in compute_loss  **
        return self.compiled_loss(
    File "C:\Users\hi\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\engine\compile_utils.py", line 263, in __call__
        y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw)
    File "C:\Users\hi\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\engine\compile_utils.py", line 840, in match_dtype_and_rank
        if (y_t.dtype.is_floating and y_p.dtype.is_floating) or (

    AttributeError: 'NoneType' object has no attribute 'dtype'


In [None]:
#results = model.evaluate(test_X, test_y, batch_size=1024)
#print("test loss, test acc: ", results)

In [43]:
from transformers import TFBertForSequenceClassification
model = TFBertForSequenceClassification.from_pretrained("saving_folder")

Some layers from the model checkpoint at saving_folder were not used when initializing TFBertForSequenceClassification: ['dropout_74']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at saving_folder.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [45]:
def sentiment_predict(new_sentence):
    input_id = tokenizer.encode(new_sentence, max_length=max_seq_len, pad_to_max_length=True)

    padding_count = input_id.count(tokenizer.pad_token_id)
    attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count
    token_type_id = [0] * max_seq_len

    input_ids = np.array([input_id])
    attention_masks = np.array([attention_mask])
    token_type_ids = np.array([token_type_id])

    encoded_input = [input_ids, attention_masks, token_type_ids]
    score = list(model.predict(encoded_input)[0][0])

    result = score.index(max(score))

    if result == 0:
        return print(score, "\n 결과: ", result, " -> 분노")
    elif result == 1:
        return print(score, "\n 결과: ", result, " -> 슬픔")
    elif result == 2:
        return print(score, "\n 결과: ", result, " -> 공포")
    elif result == 3:
        return print(score, "\n 결과: ", result, " -> 역겨움")
    elif result == 4:
        return print(score, "\n 결과: ", result, " -> 중립")
    elif result == 5:
        return print(score, "\n 결과: ", result, " -> 놀라움")
    elif result == 6:
        return print(score, "\n 결과: ", result, " -> 행복")

In [None]:
'''
분노 0
슬픔 1
공포 2
역겨움 3
중립 4
놀라움 5
행복 6
'''

In [46]:
sentiment_predict("오늘 헤어져서 너무 슬퍼") #1이 제일 큼 -> 슬픔

[-0.9159959, 5.6754003, -2.5743465, -2.2595897, 1.3701192, -2.8120313, -1.0159005] 
 결과:  1  -> 슬픔


In [47]:
sentiment_predict("나 월급받아서 너무 좋아") #6이 제일 큼 -> 행복

[-1.4873897, 0.89078754, -2.5881941, -3.0522127, 2.4245567, -1.3927115, 3.779524] 
 결과:  6  -> 행복


In [48]:
sentiment_predict("뭐? 뷔랑 제니 사귄다고? 대박") #5가 제일 큼 -> 놀라움

[-1.4311104, 0.35705167, -1.0862205, -2.5811336, 1.7252817, 2.3951845, 0.9323495] 
 결과:  5  -> 놀라움


In [49]:
sentiment_predict("뒤에서 누가 자꾸 따라와") #0이 제일 큼 -> 분노 근데 공포가 나왔으면 좋겠음

[1.4606305, -0.16641602, 0.0649231, -0.73745155, 1.372321, -0.18250062, -1.97378] 
 결과:  0  -> 분노


In [50]:
sentiment_predict("뒤에서 누가 자꾸 따라와 무서워") #2가 제일 큼 -> 공포

[-1.5102355, 0.9171264, 4.3872776, -2.7098854, 1.9778476, -0.74672234, -1.8644539] 
 결과:  2  -> 공포


In [52]:
!pip install flask





In [None]:
from flask import Flask  ## flask 라이브러리에서 Flask import
app = Flask(__name__)
 
@app.route('/')
def hello_world():
    return 'Hello, World!'
 
if __name__ == "__main__":
    app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [10/Sep/2023 15:43:37] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [10/Sep/2023 15:43:37] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [10/Sep/2023 15:43:59] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [10/Sep/2023 15:44:14] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [10/Sep/2023 15:44:57] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [10/Sep/2023 15:45:09] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [10/Sep/2023 15:45:26] "GET / HTTP/1.1" 200 -
