From 7cd81552fe28cd27040e6f80474693ca588f9bad Mon Sep 17 00:00:00 2001
From: Jungae Park <46885199+jungae-park@users.noreply.github.com>
Date: Thu, 16 Feb 2023 17:48:36 +0900
Subject: [PATCH 1/7] Create nlp_imdb_review_inference.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Inference code that classifies a positive or negative movie-review sentence
from the IMDB data with the RNN and LSTM models.
---
 CNN/nlp_imdb_review_inference.py | 40 ++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 CNN/nlp_imdb_review_inference.py

diff --git a/CNN/nlp_imdb_review_inference.py b/CNN/nlp_imdb_review_inference.py
new file mode 100644
index 0000000..3f542fa
--- /dev/null
+++ b/CNN/nlp_imdb_review_inference.py
@@ -0,0 +1,40 @@
+import tensorflow as tf
+import numpy as np
+import re
+from tensorflow.keras.datasets import imdb
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+model = tf.keras.models.load_model('./model/rnn_imdb_model.h5')
+
+negative_input = "This movie was just way too overrated. The fighting was not professional and in slow motion. I was expecting more from a 200 million budget movie. The little sister of T.Challa was just trying too hard to be funny. The story was really dumb as well. Don't watch this movie if you are going because others say its great unless you are a Black Panther fan or Marvels fan."
+
+positive_input = "I was lucky enough to be included in the group to see the advanced screening in Melbourne on the 15th of April, 2012. And, firstly, I need to say a big thank-you to Disney and Marvel Studios. \
+Now, the film... how can I even begin to explain how I feel about this film? It is, as the title of this review says a 'comic book triumph'. I went into the film with very, very high expectations and I was not disappointed. \
+Seeing Joss Whedon's direction and envisioning of the film come to life on the big screen is perfect. The script is amazingly detailed and laced with sharp wit a humor. The special effects are literally mind-blowing and the action scenes are both hard-hitting and beautifully choreographed."
+
+word_to_index = imdb.get_word_index()
+
+# Strip everything except letters and digits, then lowercase
+new_sentence = re.sub('[^0-9a-zA-Z ]', '', positive_input).lower()
+encoded = []
+
+# Tokenize on whitespace, then integer-encode
+for word in new_sentence.split():
+    try :
+        # Restrict the vocabulary size to 10,000.
+        if word_to_index[word] <= 10000:
+            encoded.append(word_to_index[word]+3)
+        else:
+            # Indices above 10,000 are mapped to the OOV token (2).
+            encoded.append(2)
+    # Words missing from the vocabulary are mapped to the OOV token (2).
+    except KeyError:
+        encoded.append(2)
+
+pad_sequence = pad_sequences([encoded], maxlen=500)
+score = float(model.predict(pad_sequence)) # prediction
+
+if(score > 0.5):
+    print("Positive review with {:.2f}% probability.".format(score * 100))
+else:
+    print("Negative review with {:.2f}% probability.".format((1 - score) * 100))
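
Note: the +3 offset and the hard-coded 2 in the encoding loop above follow
Keras's IMDB convention, where indices 0, 1, and 2 are reserved for padding,
start-of-sequence, and out-of-vocabulary markers, so every raw word_index
value is shifted up by 3. A minimal standalone sketch of that convention,
decoding a stored review back to text (only the public imdb API is assumed;
it downloads the dataset on first use, and is not part of the patch):

    from tensorflow.keras.datasets import imdb

    word_to_index = imdb.get_word_index()
    # Shift by +3 and fill in the three reserved indices.
    index_to_word = {index + 3: word for word, index in word_to_index.items()}
    index_to_word.update({0: "<pad>", 1: "<sos>", 2: "<unk>"})

    (X_train, _), _ = imdb.load_data(num_words=10000)
    # Decode the first 20 token ids of the first training review.
    print(" ".join(index_to_word.get(i, "<unk>") for i in X_train[0][:20]))
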
From 65a411553b94c10b63785558b65210759ccf727d Mon Sep 17 00:00:00 2001
From: Jungae Park <46885199+jungae-park@users.noreply.github.com>
Date: Fri, 17 Feb 2023 14:01:38 +0900
Subject: [PATCH 2/7] Create nlp_raw_inference.py

nlp raw sentence inference
---
 NLP/nlp_raw_inference.py | 99 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)
 create mode 100644 NLP/nlp_raw_inference.py

diff --git a/NLP/nlp_raw_inference.py b/NLP/nlp_raw_inference.py
new file mode 100644
index 0000000..717607b
--- /dev/null
+++ b/NLP/nlp_raw_inference.py
@@ -0,0 +1,99 @@
+import tensorflow as tf
+import numpy as np
+import time
+import pandas as pd
+
+# Check GPU availability
+device_name = tf.test.gpu_device_name()
+if not device_name:
+    print('Cannot find GPU. Running on CPU')
+else:
+    print('Found GPU at: {}'.format(device_name))
+
+# Global variables
+model = None
+load_model_time = None
+X_test = None
+
+
+# Load the model
+def load_model(saved_model_dir):
+
+    global load_model_time
+    global model
+
+    load_model_time = time.time()
+    model = tf.keras.models.load_model(saved_model_dir)
+    load_model_time = time.time() - load_model_time
+
+# Serve the test data in batches
+def load_test_batch(batch_size):
+
+    global X_test
+
+    num_words = 15000
+    maxlen = 130
+
+    (X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=num_words)
+
+    X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test,
+                                                           value= 0,
+                                                           padding = 'pre',
+                                                           maxlen = maxlen )
+
+    test_batch = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)
+
+    return test_batch
+
+def inference(batch_size):
+
+    # Store predicted and actual labels for the whole dataset
+    pred_labels = []
+    real_labels = []
+
+
+    # Load the test data in batches
+    load_dataset_time = time.time()
+    test_batch = load_test_batch(batch_size)
+    load_dataset_time = time.time() - load_dataset_time
+
+    # Start inference over the whole dataset
+    inference_time = time.time()
+    # Consume the data batch by batch (one batch inference per loop iteration)
+    for i, (X_test_batch, y_test_batch) in enumerate(test_batch):
+
+        # Classify one batch of the dataset
+        raw_inference_start = time.time()
+        y_pred_batch = model(X_test_batch)
+        raw_inference_time = time.time() - raw_inference_start
+
+        # Store the actual labels for this batch
+        real_labels.extend(y_test_batch.numpy())
+        # Store the predicted labels for this batch
+        y_pred_batch = np.where(y_pred_batch > 0.5, 1, 0)
+        y_pred_batch = y_pred_batch.reshape(-1)
+        pred_labels.extend(y_pred_batch)
+
+        break
+    inference_time = time.time() - inference_time
+
+    # Compare actual and predicted labels over all data, then compute accuracy
+    accuracy = np.sum(np.array(real_labels) == np.array(pred_labels))/len(real_labels)
+
+    print('accuracy' , accuracy)
+    print('load_model_time', load_model_time)
+    print('load_dataset_time' , load_dataset_time)
+    print('total_inference_time', inference_time)
+    print('raw_inference_time', raw_inference_time / len(pred_labels))
+    print('ips' , len(pred_labels) / (load_model_time + load_dataset_time + inference_time))
+    print('ips(inf)' , len(pred_labels) / inference_time)

+# Path where the model is saved / will be saved
+model_name = 'rnn_imdb'
+saved_model_dir=f'./model/{model_name}_model.h5'
+
+# Load the saved model, if one exists
+load_model(saved_model_dir)
+
+# Run inference batch by batch
+inference(1)
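
Note: the ips figure above folds model-load and dataset-load time into one
throughput number, while ips(inf) counts only the inference loop. A
standalone sketch of that bookkeeping, with a stand-in predict_fn and toy
arrays (both are hypothetical placeholders, not the patch's model or data):

    import time
    import numpy as np

    def benchmark(predict_fn, batches):
        # Time the whole loop (total) and just the forward passes (raw).
        total_start = time.time()
        raw_time, n = 0.0, 0
        for x, y in batches:
            start = time.time()
            predict_fn(x)
            raw_time += time.time() - start
            n += len(y)
        total = time.time() - total_start
        return {'ips(inf)': n / total, 'raw_inference_time': raw_time / n}

    batches = [(np.zeros((1, 130)), np.zeros(1))] * 8
    print(benchmark(lambda x: np.zeros((len(x), 1)), batches))
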
From 44d4525dc34ee14ab5a37de49f8cef157aff8f22 Mon Sep 17 00:00:00 2001
From: Jungae Park <46885199+jungae-park@users.noreply.github.com>
Date: Mon, 6 Mar 2023 13:34:31 +0900
Subject: [PATCH 3/7] Update nlp_raw_inference.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Changed how the raw data is preprocessed.
---
 NLP/nlp_raw_inference.py | 114 ++++++++++++++++++---------------------
 1 file changed, 51 insertions(+), 63 deletions(-)

diff --git a/NLP/nlp_raw_inference.py b/NLP/nlp_raw_inference.py
index 717607b..c5afc88 100644
--- a/NLP/nlp_raw_inference.py
+++ b/NLP/nlp_raw_inference.py
@@ -3,30 +3,10 @@
 import time
 import pandas as pd
 
-# Check GPU availability
-device_name = tf.test.gpu_device_name()
-if not device_name:
-    print('Cannot find GPU. Running on CPU')
-else:
-    print('Found GPU at: {}'.format(device_name))
-
-# Global variables
-model = None
-load_model_time = None
 X_test = None
 
-
-# Load the model
-def load_model(saved_model_dir):
-
-    global load_model_time
-    global model
-
-    load_model_time = time.time()
-    model = tf.keras.models.load_model(saved_model_dir)
-    load_model_time = time.time() - load_model_time
-
-# Serve the test data in batches
 def load_test_batch(batch_size):
 
     global X_test
@@ -44,56 +24,64 @@ def load_test_batch(batch_size):
 
     test_batch = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)
 
     return test_batch
+
+batch_size=1
+test_batch = load_test_batch(batch_size)
 
-def inference(batch_size):
+load_dataset_time = time.time()
+for i, (X_test_batch, y_test_batch) in enumerate(test_batch):
+    raw_data=X_test_batch
+    break
+load_dataset_time = time.time() - load_dataset_time
 
-    # Store predicted and actual labels for the whole dataset
-    pred_labels = []
-    real_labels = []
+print(raw_data)
+
+model = None
+load_model_time = None
+
+
+# Load the model
+def load_model(saved_model_dir):
+
+    global load_model_time
+    global model
+
+    load_model_time = time.time()
+    model = tf.keras.models.load_model(saved_model_dir)
+    load_model_time = time.time() - load_model_time
 
-
-    # Load the test data in batches
-    load_dataset_time = time.time()
-    test_batch = load_test_batch(batch_size)
-    load_dataset_time = time.time() - load_dataset_time
-
-    # Start inference over the whole dataset
-    inference_time = time.time()
-    # Consume the data batch by batch (one batch inference per loop iteration)
-    for i, (X_test_batch, y_test_batch) in enumerate(test_batch):
-
-        # Classify one batch of the dataset
-        raw_inference_start = time.time()
-        y_pred_batch = model(X_test_batch)
-        raw_inference_time = time.time() - raw_inference_start
-
-        # Store the actual labels for this batch
-        real_labels.extend(y_test_batch.numpy())
-        # Store the predicted labels for this batch
-        y_pred_batch = np.where(y_pred_batch > 0.5, 1, 0)
-        y_pred_batch = y_pred_batch.reshape(-1)
-        pred_labels.extend(y_pred_batch)
-
-        break
-    inference_time = time.time() - inference_time
-
-    # Compare actual and predicted labels over all data, then compute accuracy
-    accuracy = np.sum(np.array(real_labels) == np.array(pred_labels))/len(real_labels)
-
-    print('accuracy' , accuracy)
-    print('load_model_time', load_model_time)
-    print('load_dataset_time' , load_dataset_time)
-    print('total_inference_time', inference_time)
-    print('raw_inference_time', raw_inference_time / len(pred_labels))
-    print('ips' , len(pred_labels) / (load_model_time + load_dataset_time + inference_time))
-    print('ips(inf)' , len(pred_labels) / inference_time)
 
 # Path where the model is saved / will be saved
 model_name = 'rnn_imdb'
 saved_model_dir=f'./model/{model_name}_model.h5'
 
 # Load the saved model, if one exists
+load_model_time = time.time()
 load_model(saved_model_dir)
-
-# Run inference batch by batch
-inference(1)
+load_model_time = time.time() - load_model_time
+
+inference_time = time.time()
+pred_labels = []
+real_labels = []
+
+raw_inference_start = time.time()
+y_pred_batch = model(raw_data)
+raw_inference_time = time.time() - raw_inference_start
+
+# Store the actual labels for this batch
+real_labels.extend(y_test_batch.numpy())
+# Store the predicted labels for this batch
+y_pred_batch = np.where(y_pred_batch > 0.5, 1, 0)
+y_pred_batch = y_pred_batch.reshape(-1)
+pred_labels.extend(y_pred_batch)
+
+accuracy = np.sum(np.array(real_labels) == np.array(pred_labels))/len(real_labels)
+inference_time = time.time() - inference_time
+
+print('accuracy' , accuracy)
+print('load_model_time', load_model_time)
+print('load_dataset_time' , load_dataset_time)
+print('total_inference_time', inference_time)
+print('raw_inference_time', raw_inference_time / len(pred_labels))
+print('ips' , len(pred_labels) / (load_model_time + load_dataset_time + inference_time))
+print('ips(inf)' , len(pred_labels) / inference_time)
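
Note: the for/break pattern above pulls a single batch out of the
tf.data.Dataset; tf.data's take() expresses the same thing more directly.
A minimal sketch with toy tensors standing in for the padded IMDB arrays
(the shapes are illustrative only):

    import tensorflow as tf

    ds = tf.data.Dataset.from_tensor_slices((tf.zeros((10, 130)), tf.zeros(10))).batch(1)
    for x_batch, y_batch in ds.take(1):
        print(x_batch.shape, y_batch.shape)  # (1, 130) (1,)
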
From 3b2ba15dabebd0aa11f84546387fb0c99a6924a5 Mon Sep 17 00:00:00 2001
From: Jungae Park <46885199+jungae-park@users.noreply.github.com>
Date: Tue, 7 Mar 2023 15:28:55 +0900
Subject: [PATCH 4/7] Rename nlp_raw_inference.py to rnn_lstm_raw_inference.py

file name change
---
 NLP/{nlp_raw_inference.py => rnn_lstm_raw_inference.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename NLP/{nlp_raw_inference.py => rnn_lstm_raw_inference.py} (100%)

diff --git a/NLP/nlp_raw_inference.py b/NLP/rnn_lstm_raw_inference.py
similarity index 100%
rename from NLP/nlp_raw_inference.py
rename to NLP/rnn_lstm_raw_inference.py

From 4a044d12ca338da7e6d857fda4e05fdfade85dbc Mon Sep 17 00:00:00 2001
From: Jungae Park <46885199+jungae-park@users.noreply.github.com>
Date: Wed, 8 Mar 2023 15:44:44 +0900
Subject: [PATCH 5/7] Update rnn_lstm_raw_inference.py

code update
---
 NLP/rnn_lstm_raw_inference.py | 71 +++++++++++------------------------
 1 file changed, 21 insertions(+), 50 deletions(-)

diff --git a/NLP/rnn_lstm_raw_inference.py b/NLP/rnn_lstm_raw_inference.py
index c5afc88..8200764 100644
--- a/NLP/rnn_lstm_raw_inference.py
+++ b/NLP/rnn_lstm_raw_inference.py
@@ -3,81 +3,52 @@
 import time
 import pandas as pd
 
-X_test = None
-
-
-def load_test_batch(batch_size):
-
-    global X_test
-
-    num_words = 15000
-    maxlen = 130
-
-    (X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=num_words)
-
-    X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test,
-                                                           value= 0,
-                                                           padding = 'pre',
-                                                           maxlen = maxlen )
+(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=15000)
 
-    test_batch = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)
+X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test,
+                                                       value= 0,
+                                                       padding = 'pre',
+                                                       maxlen = 130)
+print('X_test', X_test)
 
-    return test_batch
-
-batch_size=1
-test_batch = load_test_batch(batch_size)
+batch_data = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(1)
 
 load_dataset_time = time.time()
-for i, (X_test_batch, y_test_batch) in enumerate(test_batch):
-    raw_data=X_test_batch
+for i, (X_batch_data, y_batch_data) in enumerate(batch_data):
+    raw_data=X_batch_data
     break
 load_dataset_time = time.time() - load_dataset_time
-
 print(raw_data)
 
-model = None
-load_model_time = None
-
-
-# Load the model
-def load_model(saved_model_dir):
-
-    global load_model_time
-    global model
-
-    load_model_time = time.time()
-    model = tf.keras.models.load_model(saved_model_dir)
-    load_model_time = time.time() - load_model_time
-
-
-# Path where the model is saved / will be saved
 model_name = 'rnn_imdb'
 saved_model_dir=f'./model/{model_name}_model.h5'
 
-# Load the saved model, if one exists
-load_model_time = time.time()
-load_model(saved_model_dir)
-load_model_time = time.time() - load_model_time
+load_model_start_time = time.time()
+model = tf.keras.models.load_model(saved_model_dir)
+load_model_time = time.time() - load_model_start_time
+
 
 inference_time = time.time()
 pred_labels = []
 real_labels = []
 
 raw_inference_start = time.time()
-y_pred_batch = model(raw_data)
+y_pred = model(raw_data)
 raw_inference_time = time.time() - raw_inference_start
 
-# Store the actual labels for this batch
-real_labels.extend(y_test_batch.numpy())
-# Store the predicted labels for this batch
-y_pred_batch = np.where(y_pred_batch > 0.5, 1, 0)
-y_pred_batch = y_pred_batch.reshape(-1)
-pred_labels.extend(y_pred_batch)
+real_labels.extend(y_batch_data.numpy())
+y_pred = np.where(y_pred > 0.5, 1, 0)
+y_pred = y_pred.reshape(-1)
+pred_labels.extend(y_pred)
 
 accuracy = np.sum(np.array(real_labels) == np.array(pred_labels))/len(real_labels)
 inference_time = time.time() - inference_time
 
+print('pred_labels', pred_labels)
+print('real_labels', real_labels)
+
 print('accuracy' , accuracy)
 print('load_model_time', load_model_time)
 print('load_dataset_time' , load_dataset_time)
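
Note: the np.where(y_pred > 0.5, 1, 0) step above turns the model's sigmoid
probabilities into hard 0/1 labels. A minimal sketch of that thresholding on
made-up scores (the values are hypothetical, not model output):

    import numpy as np

    y_pred = np.array([[0.91], [0.12], [0.55]])  # hypothetical sigmoid outputs
    labels = np.where(y_pred > 0.5, 1, 0).reshape(-1)
    print(labels)  # [1 0 1]
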
From 96caa5362f122dd914097c5c89da1e196ce285d7 Mon Sep 17 00:00:00 2001
From: Jungae Park <46885199+jungae-park@users.noreply.github.com>
Date: Mon, 13 Mar 2023 12:00:37 +0900
Subject: [PATCH 6/7] Create bert_raw_inference.py

bert raw dataset inference code
---
 NLP/bert_raw_inference.py | 150 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 150 insertions(+)
 create mode 100644 NLP/bert_raw_inference.py

diff --git a/NLP/bert_raw_inference.py b/NLP/bert_raw_inference.py
new file mode 100644
index 0000000..d9f514a
--- /dev/null
+++ b/NLP/bert_raw_inference.py
@@ -0,0 +1,150 @@
+import os
+import re
+import pickle
+import time
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+import tensorflow_hub as hub
+import bert  # bert-for-tf2; provides bert_tokenization used in create_tokenizer
+from tensorflow.keras.utils import to_categorical
+from tensorflow.keras.models import Model
+from tensorflow.keras.layers import Input, Dense, Dropout
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
+from tqdm import tqdm
+from tensorflow.keras.models import load_model
+
+MAX_SEQ_LEN = 500
+
+def load_directory_data(directory):
+    data = {}
+    data["sentence"] = []
+    data["sentiment"] = []
+    for file_path in os.listdir(directory):
+        with tf.io.gfile.GFile(os.path.join(directory, file_path), "r") as f:
+            data["sentence"].append(f.read())
+            data["sentiment"].append(re.match(r"\d+_(\d+)\.txt", file_path).group(1))
+    return pd.DataFrame.from_dict(data)
+
+
+def load_dataset(directory):
+    pos_df = load_directory_data(os.path.join(directory, "pos"))
+    neg_df = load_directory_data(os.path.join(directory, "neg"))
+    pos_df["polarity"] = "positive"
+    neg_df["polarity"] = "negative"
+    return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)
+
+
+def download_and_load_datasets(force_download=False):
+    dataset = tf.keras.utils.get_file(
+        fname="aclImdb.tar.gz",
+        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
+        extract=True)
+
+    train_df = load_dataset(os.path.join(os.path.dirname(dataset),
+                                         "aclImdb", "train"))
+    test_df = load_dataset(os.path.join(os.path.dirname(dataset),
+                                        "aclImdb", "test"))
+
+    return train_df, test_df
+
+def create_tokenizer(bert_layer):
+    """Instantiate Tokenizer with vocab"""
+    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
+    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
+    tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)
+    print("Vocab size:", len(tokenizer.vocab))
+    return tokenizer
+
+def get_ids(tokens, tokenizer, MAX_SEQ_LEN):
+    """Token ids from Tokenizer vocab"""
+    token_ids = tokenizer.convert_tokens_to_ids(tokens)
+    input_ids = token_ids + [0] * (MAX_SEQ_LEN - len(token_ids))
+    return input_ids
+
+def get_masks(tokens, MAX_SEQ_LEN):
+    """Masks: 1 for real tokens and 0 for paddings"""
+    return [1] * len(tokens) + [0] * (MAX_SEQ_LEN - len(tokens))
+
+def get_segments(tokens, MAX_SEQ_LEN):
+    """Segments: 0 for the first sequence, 1 for the second"""
+    segments = []
+    current_segment_id = 0
+    for token in tokens:
+        segments.append(current_segment_id)
+        if token == "[SEP]":
+            current_segment_id = 1
+    return segments + [0] * (MAX_SEQ_LEN - len(tokens))
+
+def create_single_input(sentence, tokenizer, max_len):
+    """Create an input from a sentence"""
+    stokens = tokenizer.tokenize(sentence)
+    stokens = stokens[:max_len]
+    stokens = ["[CLS]"] + stokens + ["[SEP]"]
+    return get_ids(stokens, tokenizer, max_len+2), get_masks(stokens, max_len+2), get_segments(stokens, max_len+2)
+
+def convert_sentences_to_features(sentences, tokenizer, MAX_SEQ_LEN):
+    """Convert sentences to features: input_ids, input_masks and input_segments"""
+    input_ids, input_masks, input_segments = [], [], []
+    for sentence in tqdm(sentences, position=0, leave=True):
+        ids, masks, segments = create_single_input(sentence, tokenizer, MAX_SEQ_LEN-2)
+        input_ids.append(ids)
+        input_masks.append(masks)
+        input_segments.append(segments)
+    return [np.asarray(input_ids, dtype=np.int32), np.asarray(input_masks, dtype=np.int32), np.asarray(input_segments, dtype=np.int32)]
+
+
+
+def load_test_batch(batch_size):
+
+    x_test = None
+    y_test = None
+
+    with open(saved_dataset_dir+'_x_test.pkl','rb') as f:
+        x_test = pickle.load(f)
+    with open(saved_dataset_dir+'_y_test.pkl','rb') as f:
+        y_test = pickle.load(f)
+
+    test_batch = tf.data.Dataset.from_tensor_slices(((x_test[0],x_test[1],x_test[2]),y_test)).batch(batch_size)
+
+    return test_batch
+
+
+pred_labels = []
+real_labels = []
+
+model_name = 'bert_imdb'
+saved_model_dir=f'./model/{model_name}_model.h5'
+load_model_time = time.time()
+model = tf.keras.models.load_model(saved_model_dir, custom_objects={'KerasLayer': hub.KerasLayer})
+load_model_time = time.time() - load_model_time
+
+batch_size=1
+saved_dataset_dir=f'./dataset/{model_name}_dataset'
+test_batch = load_test_batch(batch_size)
+
+load_dataset_time = time.time()
+for i, (X_test_batch, y_test_batch) in enumerate(test_batch):
+    raw_data=X_test_batch
+    break
+load_dataset_time = time.time() - load_dataset_time
+
+
+inference_time = time.time()
+raw_inference_time = time.time()
+y_pred_batch = model(raw_data)
+raw_inference_time = time.time() - raw_inference_time
+pred_labels.extend(np.argmax(y_pred_batch.numpy(), axis=1))
+real_labels.extend(np.argmax(y_test_batch.numpy(), axis=1))
+accuracy = np.sum(np.array(real_labels) == np.array(pred_labels))/len(real_labels)
+inference_time = time.time() - inference_time
+
+
+print('accuracy',accuracy)
+print('load_model_time', load_model_time)
+print('load_dataset_time' , load_dataset_time)
+print('total_inference_time', inference_time)
+print('raw_inference_time', raw_inference_time / len(pred_labels))
+print('ips' , len(pred_labels) / (load_model_time + load_dataset_time + inference_time))
+print('ips(inf)' , len(pred_labels) / inference_time)
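
Note: get_ids, get_masks, and get_segments above build the three parallel
BERT inputs. A minimal sketch of the same three arrays for a toy sentence,
using plain lists so the shapes are easy to see (the token ids are made up
for illustration, not real vocab ids):

    MAX_SEQ_LEN = 8
    tokens = ["[CLS]", "great", "movie", "[SEP]"]
    token_ids = [101, 2307, 3185, 102]                         # hypothetical ids
    input_ids = token_ids + [0] * (MAX_SEQ_LEN - len(token_ids))   # pad with 0
    input_mask = [1] * len(tokens) + [0] * (MAX_SEQ_LEN - len(tokens))
    segments = [0] * MAX_SEQ_LEN                               # single sentence
    print(input_ids, input_mask, segments, sep="\n")
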
From 55bd44dc40b6fe24ab895a47c957496122bf2f61 Mon Sep 17 00:00:00 2001
From: Jungae Park <46885199+jungae-park@users.noreply.github.com>
Date: Mon, 13 Mar 2023 12:04:29 +0900
Subject: [PATCH 7/7] Create distilbert_raw_inference.py

distilbert raw dataset inference code
---
 NLP/distilbert_raw_inference.py | 94 +++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 NLP/distilbert_raw_inference.py

diff --git a/NLP/distilbert_raw_inference.py b/NLP/distilbert_raw_inference.py
new file mode 100644
index 0000000..a2c72cb
--- /dev/null
+++ b/NLP/distilbert_raw_inference.py
@@ -0,0 +1,94 @@
+import transformers
+import datasets
+import tensorflow as tf
+import time
+import numpy as np
+import pandas as pd
+import tqdm
+
+
+def create_bert_input_features(tokenizer, docs, max_seq_length):
+
+    all_ids, all_masks = [], []
+    for doc in tqdm.tqdm(docs, desc="Converting docs to features"):
+        tokens = tokenizer.tokenize(doc)
+        if len(tokens) > max_seq_length-2:
+            tokens = tokens[0 : (max_seq_length-2)]
+        tokens = ['[CLS]'] + tokens + ['[SEP]']
+        ids = tokenizer.convert_tokens_to_ids(tokens)
+        masks = [1] * len(ids)
+        while len(ids) < max_seq_length:
+            ids.append(0)
+            masks.append(0)
+        all_ids.append(ids)
+        all_masks.append(masks)
+    encoded = np.array([all_ids, all_masks])
+    return encoded
+
+
+
+def load_test_batch(batch_size):
+
+    X_test = None
+    y_test = None
+
+    dataset = datasets.load_dataset("glue", "sst2")
+
+    X_test = np.array(dataset['validation']["sentence"])
+    y_test = np.array(dataset['validation']["label"])
+
+
+    tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+
+    MAX_SEQ_LENGTH = 128
+
+    val_features_ids, val_features_masks = create_bert_input_features(tokenizer, X_test,
+                                                                      max_seq_length=MAX_SEQ_LENGTH)
+    valid_ds = (
+        tf.data.Dataset
+        .from_tensor_slices(((val_features_ids, val_features_masks), y_test))
+        .batch(batch_size)
+        .prefetch(tf.data.experimental.AUTOTUNE)
+    )
+
+    return valid_ds
+
+
+model_name = 'distilbert_sst2'
+saved_model_dir=f'./model/{model_name}_model.h5'
+load_model_time = time.time()
+model = tf.keras.models.load_model(saved_model_dir,custom_objects={'TFDistilBertModel': transformers.TFDistilBertModel})
+load_model_time = time.time() - load_model_time
+
+
+batch_size=1
+valid_ds = load_test_batch(batch_size)
+
+load_dataset_time = time.time()
+for i, (X_test_batch, y_test_batch) in enumerate(valid_ds):
+    raw_data=X_test_batch
+    break
+load_dataset_time = time.time() - load_dataset_time
+
+
+pred_labels = []
+real_labels = []
+
+inference_time = time.time()
+raw_inference_time = time.time()
+y_pred_batch = model(raw_data)
+raw_inference_time = time.time() - raw_inference_time
+real_labels.extend(y_test_batch.numpy())
+y_pred_batch = np.where(y_pred_batch > 0.5, 1, 0)
+y_pred_batch = y_pred_batch.reshape(-1)
+pred_labels.extend(y_pred_batch)
+accuracy = np.sum(np.array(real_labels) == np.array(pred_labels))/len(real_labels)
+inference_time = time.time() - inference_time
+
+print('accuracy', accuracy)
+print('load_model_time', load_model_time)
+print('load_dataset_time' , load_dataset_time)
+print('total_inference_time', inference_time)
+print('raw_inference_time', raw_inference_time / len(pred_labels))
+print('ips' , len(pred_labels) / (load_model_time + load_dataset_time + inference_time))
+print('ips(inf)' , len(pred_labels) / inference_time)
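
Note: the hand-rolled create_bert_input_features above mirrors what recent
versions of transformers do in a single tokenizer call. A sketch for
comparison, assuming transformers 3.x or later and network access to fetch
the tokenizer (the sample sentence is made up):

    import transformers

    tokenizer = transformers.DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    enc = tokenizer("a fine film", max_length=128, padding="max_length", truncation=True)
    # input_ids and attention_mask correspond to the ids/masks built above.
    print(enc["input_ids"][:8], enc["attention_mask"][:8])
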