diff --git a/CNN/nlp_imdb_review_inference.py b/CNN/nlp_imdb_review_inference.py
new file mode 100644
index 0000000..3f542fa
--- /dev/null
+++ b/CNN/nlp_imdb_review_inference.py
@@ -0,0 +1,40 @@
+import tensorflow as tf
+import numpy as np
+import re
+from tensorflow.keras.datasets import imdb
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+model = tf.keras.models.load_model('./model/rnn_imdb_model.h5')
+
+negative_input = "This movie was just way too overrated. The fighting was not professional and in slow motion. I was expecting more from a 200 million budget movie. The little sister of T.Challa was just trying too hard to be funny. The story was really dumb as well. Don't watch this movie if you are going because others say its great unless you are a Black Panther fan or Marvels fan."
+
+positive_input = "I was lucky enough to be included in the group to see the advanced screening in Melbourne on the 15th of April, 2012. And, firstly, I need to say a big thank-you to Disney and Marvel Studios. \
+Now, the film... how can I even begin to explain how I feel about this film? It is, as the title of this review says a 'comic book triumph'. I went into the film with very, very high expectations and I was not disappointed. \
+Seeing Joss Whedon's direction and envisioning of the film come to life on the big screen is perfect. The script is amazingly detailed and laced with sharp wit a humor. The special effects are literally mind-blowing and the action scenes are both hard-hitting and beautifully choreographed."
+
+word_to_index = imdb.get_word_index()
+
+# Strip everything except letters and digits, then lowercase
+new_sentence = re.sub('[^0-9a-zA-Z ]', '', positive_input).lower()
+encoded = []
+
+# Tokenize on whitespace, then integer-encode
+for word in new_sentence.split():
+    try:
+        # The vocabulary is capped at 10,000 words.
+        if word_to_index[word] <= 10000:
+            encoded.append(word_to_index[word] + 3)
+        else:
+            # Indices above 10,000 are mapped to the <unk> token.
+            encoded.append(2)
+    except KeyError:
+        # Words outside the vocabulary are mapped to the <unk> token.
+        encoded.append(2)
+
+pad_sequence = pad_sequences([encoded], maxlen=500)
+score = float(model.predict(pad_sequence))  # prediction
+
+if score > 0.5:
+    print("Positive review with {:.2f}% probability.".format(score * 100))
+else:
+    print("Negative review with {:.2f}% probability.".format((1 - score) * 100))
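A note on the `+3` offset above: `imdb.get_word_index()` numbers words from 1, while `imdb.load_data()` shifts every index by 3 so that 0, 1, and 2 can serve as the padding, start-of-sequence, and out-of-vocabulary markers. A minimal sketch of the inverse mapping, useful for checking what the encoder actually produced (the `decode_review` helper is illustrative, not part of the script):

```python
from tensorflow.keras.datasets import imdb

# Invert the word index, applying the same +3 shift used by imdb.load_data()
word_to_index = imdb.get_word_index()
index_to_word = {index + 3: word for word, index in word_to_index.items()}
index_to_word[0], index_to_word[1], index_to_word[2] = '<pad>', '<sos>', '<unk>'

def decode_review(encoded):
    """Map a list of integer ids back to a readable string."""
    return ' '.join(index_to_word.get(i, '<unk>') for i in encoded)
```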
diff --git a/NLP/bert_raw_inference.py b/NLP/bert_raw_inference.py
new file mode 100644
index 0000000..d9f514a
--- /dev/null
+++ b/NLP/bert_raw_inference.py
@@ -0,0 +1,149 @@
+import os
+import re
+import pickle
+import time
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+import tensorflow_hub as hub
+import bert  # bert-for-tf2, used by create_tokenizer() below
+from tensorflow.keras.utils import to_categorical
+from tensorflow.keras.models import Model
+from tensorflow.keras.layers import Input, Dense, Dropout
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
+from tqdm import tqdm
+from tensorflow.keras.models import load_model
+
+MAX_SEQ_LEN = 500
+
+def load_directory_data(directory):
+    data = {}
+    data["sentence"] = []
+    data["sentiment"] = []
+    for file_path in os.listdir(directory):
+        with tf.io.gfile.GFile(os.path.join(directory, file_path), "r") as f:
+            data["sentence"].append(f.read())
+            data["sentiment"].append(re.match(r"\d+_(\d+)\.txt", file_path).group(1))
+    return pd.DataFrame.from_dict(data)
+
+
+def load_dataset(directory):
+    pos_df = load_directory_data(os.path.join(directory, "pos"))
+    neg_df = load_directory_data(os.path.join(directory, "neg"))
+    pos_df["polarity"] = "positive"
+    neg_df["polarity"] = "negative"
+    return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)
+
+
+def download_and_load_datasets(force_download=False):
+    dataset = tf.keras.utils.get_file(
+        fname="aclImdb.tar.gz",
+        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
+        extract=True)
+
+    train_df = load_dataset(os.path.join(os.path.dirname(dataset),
+                                         "aclImdb", "train"))
+    test_df = load_dataset(os.path.join(os.path.dirname(dataset),
+                                        "aclImdb", "test"))
+
+    return train_df, test_df
+
+def create_tokenizer(bert_layer):
+    """Instantiate Tokenizer with vocab"""
+    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
+    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
+    tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)
+    print("Vocab size:", len(tokenizer.vocab))
+    return tokenizer
+
+def get_ids(tokens, tokenizer, MAX_SEQ_LEN):
+    """Token ids from Tokenizer vocab, zero-padded to MAX_SEQ_LEN"""
+    token_ids = tokenizer.convert_tokens_to_ids(tokens)
+    input_ids = token_ids + [0] * (MAX_SEQ_LEN - len(token_ids))
+    return input_ids
+
+def get_masks(tokens, MAX_SEQ_LEN):
+    """Masks: 1 for real tokens and 0 for paddings"""
+    return [1] * len(tokens) + [0] * (MAX_SEQ_LEN - len(tokens))
+
+def get_segments(tokens, MAX_SEQ_LEN):
+    """Segments: 0 for the first sequence, 1 for the second"""
+    segments = []
+    current_segment_id = 0
+    for token in tokens:
+        segments.append(current_segment_id)
+        if token == "[SEP]":
+            current_segment_id = 1
+    return segments + [0] * (MAX_SEQ_LEN - len(tokens))
+
+def create_single_input(sentence, tokenizer, max_len):
+    """Create an input from a sentence"""
+    stokens = tokenizer.tokenize(sentence)
+    stokens = stokens[:max_len]
+    stokens = ["[CLS]"] + stokens + ["[SEP]"]
+    return get_ids(stokens, tokenizer, max_len+2), get_masks(stokens, max_len+2), get_segments(stokens, max_len+2)
+
+def convert_sentences_to_features(sentences, tokenizer, MAX_SEQ_LEN):
+    """Convert sentences to features: input_ids, input_masks and
+    input_segments"""
+    input_ids, input_masks, input_segments = [], [], []
+    for sentence in tqdm(sentences, position=0, leave=True):
+        ids, masks, segments = create_single_input(sentence, tokenizer, MAX_SEQ_LEN-2)
+        input_ids.append(ids)
+        input_masks.append(masks)
+        input_segments.append(segments)
+    return [np.asarray(input_ids, dtype=np.int32), np.asarray(input_masks, dtype=np.int32), np.asarray(input_segments, dtype=np.int32)]
+
+
+def load_test_batch(batch_size):
+    # Relies on the module-level saved_dataset_dir defined below.
+    with open(saved_dataset_dir+'_x_test.pkl', 'rb') as f:
+        x_test = pickle.load(f)
+    with open(saved_dataset_dir+'_y_test.pkl', 'rb') as f:
+        y_test = pickle.load(f)
+
+    test_batch = tf.data.Dataset.from_tensor_slices(((x_test[0], x_test[1], x_test[2]), y_test)).batch(batch_size)
+
+    return test_batch
+
+
+pred_labels = []
+real_labels = []
+
+model_name = 'bert_imdb'
+saved_model_dir = f'./model/{model_name}_model.h5'
+load_model_time = time.time()
+model = tf.keras.models.load_model(saved_model_dir, custom_objects={'KerasLayer': hub.KerasLayer})
+load_model_time = time.time() - load_model_time
+
+batch_size = 1
+saved_dataset_dir = f'./dataset/{model_name}_dataset'
+test_batch = load_test_batch(batch_size)
+
+load_dataset_time = time.time()
+for i, (X_test_batch, y_test_batch) in enumerate(test_batch):
+    raw_data = X_test_batch
+    break
+load_dataset_time = time.time() - load_dataset_time
+
+
+inference_time = time.time()
+raw_inference_time = time.time()
+y_pred_batch = model(raw_data)
+raw_inference_time = time.time() - raw_inference_time
+# Predictions go into pred_labels, ground truth into real_labels.
+pred_labels.extend(np.argmax(y_pred_batch.numpy(), axis=1))
+real_labels.extend(np.argmax(y_test_batch.numpy(), axis=1))
+accuracy = np.sum(np.array(real_labels) == np.array(pred_labels)) / len(real_labels)
+inference_time = time.time() - inference_time
+
+
+print('accuracy', accuracy)
+print('load_model_time', load_model_time)
+print('load_dataset_time', load_dataset_time)
+print('total_inference_time', inference_time)
+print('raw_inference_time', raw_inference_time / len(pred_labels))
+print('ips', len(pred_labels) / (load_model_time + load_dataset_time + inference_time))
+print('ips(inf)', len(pred_labels) / inference_time)
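Since `get_masks` and `get_segments` are pure functions, their behavior is easy to sanity-check by hand. A quick illustration with a toy sequence length of 8, assuming the helpers defined in the file above are in scope:

```python
tokens = ["[CLS]", "great", "movie", "[SEP]"]

print(get_masks(tokens, 8))
# [1, 1, 1, 1, 0, 0, 0, 0] -- ones for real tokens, zeros for padding

print(get_segments(tokens, 8))
# [0, 0, 0, 0, 0, 0, 0, 0] -- a single sentence stays in segment 0 throughout
```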
diff --git a/NLP/distrilbert_raw_inference.py b/NLP/distrilbert_raw_inference.py
new file mode 100644
index 0000000..a2c72cb
--- /dev/null
+++ b/NLP/distrilbert_raw_inference.py
@@ -0,0 +1,98 @@
+import transformers
+import datasets
+import tensorflow as tf
+import time
+import numpy as np
+import pandas as pd
+import tqdm
+
+
+def create_bert_input_features(tokenizer, docs, max_seq_length):
+    all_ids, all_masks = [], []
+    for doc in tqdm.tqdm(docs, desc="Converting docs to features"):
+        tokens = tokenizer.tokenize(doc)
+        # Truncate so that [CLS] and [SEP] still fit within max_seq_length.
+        if len(tokens) > max_seq_length-2:
+            tokens = tokens[0 : (max_seq_length-2)]
+        tokens = ['[CLS]'] + tokens + ['[SEP]']
+        ids = tokenizer.convert_tokens_to_ids(tokens)
+        masks = [1] * len(ids)
+        while len(ids) < max_seq_length:
+            ids.append(0)
+            masks.append(0)
+        all_ids.append(ids)
+        all_masks.append(masks)
+    encoded = np.array([all_ids, all_masks])
+    return encoded
+
+
+def load_test_batch(batch_size):
+    dataset = datasets.load_dataset("glue", "sst2")
+
+    X_test = np.array(dataset['validation']["sentence"])
+    y_test = np.array(dataset['validation']["label"])
+
+    tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+
+    MAX_SEQ_LENGTH = 128
+
+    val_features_ids, val_features_masks = create_bert_input_features(tokenizer, X_test,
+                                                                      max_seq_length=MAX_SEQ_LENGTH)
+    valid_ds = (
+        tf.data.Dataset
+        .from_tensor_slices(((val_features_ids, val_features_masks), y_test))
+        .batch(batch_size)
+        .prefetch(tf.data.experimental.AUTOTUNE)
+    )
+
+    return valid_ds
+
+
+model_name = 'distilbert_sst2'
+saved_model_dir = f'./model/{model_name}_model.h5'
+load_model_time = time.time()
+model = tf.keras.models.load_model(saved_model_dir, custom_objects={'TFDistilBertModel': transformers.TFDistilBertModel})
+load_model_time = time.time() - load_model_time
+
+
+batch_size = 1
+load_dataset_time = time.time()
+valid_ds = load_test_batch(batch_size)
+for i, (X_test_batch, y_test_batch) in enumerate(valid_ds):
+    raw_data = X_test_batch
+    break
+load_dataset_time = time.time() - load_dataset_time
+
+
+pred_labels = []
+real_labels = []
+
+inference_time = time.time()
+raw_inference_time = time.time()
+y_pred_batch = model(raw_data)
+raw_inference_time = time.time() - raw_inference_time
+real_labels.extend(y_test_batch.numpy())
+y_pred_batch = np.where(y_pred_batch > 0.5, 1, 0)
+y_pred_batch = y_pred_batch.reshape(-1)
+pred_labels.extend(y_pred_batch)
+accuracy = np.sum(np.array(real_labels) == np.array(pred_labels)) / len(real_labels)
+inference_time = time.time() - inference_time
+
+print('accuracy', accuracy)
+print('load_model_time', load_model_time)
+print('load_dataset_time', load_dataset_time)
+print('total_inference_time', inference_time)
+print('raw_inference_time', raw_inference_time / len(pred_labels))
+print('ips', len(pred_labels) / (load_model_time + load_dataset_time + inference_time))
+print('ips(inf)', len(pred_labels) / inference_time)
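One caveat shared by all of these single-batch measurements: the first call to a freshly loaded Keras model can include one-time setup cost (graph building, kernel selection), so `raw_inference_time` mixes that overhead into the latency of a single example. A sketch of a steadier measurement, assuming `model` and `raw_data` from the script above are in scope (`n_runs` is an arbitrary choice):

```python
import time

_ = model(raw_data)  # warm-up call absorbs one-time setup cost

n_runs = 10
start = time.time()
for _ in range(n_runs):
    _ = model(raw_data)  # steady-state forward passes only
print('mean latency per batch:', (time.time() - start) / n_runs)
```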
diff --git a/NLP/rnn_lstm_raw_inference.py b/NLP/rnn_lstm_raw_inference.py
new file mode 100644
index 0000000..8200764
--- /dev/null
+++ b/NLP/rnn_lstm_raw_inference.py
@@ -0,0 +1,58 @@
+import tensorflow as tf
+import numpy as np
+import time
+import pandas as pd
+
+
+(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=15000)
+
+X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test,
+                                                       value=0,
+                                                       padding='pre',
+                                                       maxlen=130)
+print('X_test', X_test)
+
+batch_data = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(1)
+
+load_dataset_time = time.time()
+for i, (X_batch_data, y_batch_data) in enumerate(batch_data):
+    raw_data = X_batch_data
+    break
+load_dataset_time = time.time() - load_dataset_time
+print(raw_data)
+
+
+model_name = 'rnn_imdb'
+saved_model_dir = f'./model/{model_name}_model.h5'
+
+load_model_start_time = time.time()
+model = tf.keras.models.load_model(saved_model_dir)
+load_model_time = time.time() - load_model_start_time
+
+
+inference_time = time.time()
+pred_labels = []
+real_labels = []
+
+raw_inference_start = time.time()
+y_pred = model(raw_data)
+raw_inference_time = time.time() - raw_inference_start
+
+real_labels.extend(y_batch_data.numpy())
+y_pred = np.where(y_pred > 0.5, 1, 0)
+y_pred = y_pred.reshape(-1)
+pred_labels.extend(y_pred)
+
+accuracy = np.sum(np.array(real_labels) == np.array(pred_labels)) / len(real_labels)
+inference_time = time.time() - inference_time
+
+print('pred_labels', pred_labels)
+print('real_labels', real_labels)
+
+print('accuracy', accuracy)
+print('load_model_time', load_model_time)
+print('load_dataset_time', load_dataset_time)
+print('total_inference_time', inference_time)
+print('raw_inference_time', raw_inference_time / len(pred_labels))
+print('ips', len(pred_labels) / (load_model_time + load_dataset_time + inference_time))
+print('ips(inf)', len(pred_labels) / inference_time)
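The accuracy printed by each of these scripts is computed from a single example, so it can only ever be 0.0 or 1.0. For a meaningful figure, the same loop extends naturally to the whole test split; a sketch for the RNN script above, reusing its `model`, `X_test`, and `y_test` (the batch size of 32 is an arbitrary choice):

```python
full_batch = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(32)

pred_labels, real_labels = [], []
for X_b, y_b in full_batch:
    # Same 0.5 threshold on the sigmoid output as in the script above
    y_pred = np.where(model(X_b).numpy() > 0.5, 1, 0).reshape(-1)
    pred_labels.extend(y_pred)
    real_labels.extend(y_b.numpy())

print('full test accuracy:', np.mean(np.array(pred_labels) == np.array(real_labels)))
```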