In [None]:
# 전역 변수 설정
DATA_SIZE=1000
VOCAB_SIZE = 10000  # 단어 사전 크기
MAX_LEN = 200       # 패딩할 최대 문장 길이

In [None]:
import pandas as pd

df = pd.read_csv("./data/imdb.csv") # https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
df = df.sample(n=DATA_SIZE, random_state=42).reset_index() # 랜덤 1000개 추출
df = df.drop("index", axis=1)

print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     1000 non-null   object
 1   sentiment  1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB
None
                                              review sentiment
0  I really liked this Summerslam due to the look...  positive
1  Not many television shows appeal to quite as m...  positive
2  The film quickly gets to a major chase scene w...  negative
3  Jane Austen would definitely approve of this o...  positive
4  Expectations were somewhat high for me when I ...  negative


In [2]:
# HTML 태그 제거
import re
import html # HTML 엔티티 처리를 위해 import

def remove_html_tags_regex(text):
    # 1. HTML 태그 제거: <로 시작해서 >로 끝나는 모든 것을 찾음
    # <.*?> : . (모든 문자)가 * (0번 이상) 반복되는데, ? (non-greedy)
    # ?가 없으면 "<b>text</b>" 전체를 태그로 인식할 수 있음
    pattern = re.compile('<.*?>')
    cleaned_text = re.sub(pattern, ' ', text) # 태그를 공백(' ')으로 치환
    
    # 2. HTML 엔티티 변환 (e.g., &nbsp; -> 공백, &lt; -> <)
    cleaned_text = html.unescape(cleaned_text)
    
    # 3. 여러 개의 공백을 하나로 합침
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    
    return cleaned_text

df['review'] = df['review'].apply(remove_html_tags_regex)
print(df["review"][0])

I really liked this Summerslam due to the look of the arena, the curtains and just the look overall was interesting to me for some reason. Anyways, this could have been one of the best Summerslam's ever if the WWF didn't have Lex Luger in the main event against Yokozuna, now for it's time it was ok to have a huge fat man vs a strong man but I'm glad times have changed. It was a terrible main event just like every match Luger is in is terrible. Other matches on the card were Razor Ramon vs Ted Dibiase, Steiner Brothers vs Heavenly Bodies, Shawn Michaels vs Curt Hening, this was the event where Shawn named his big monster of a body guard Diesel, IRS vs 1-2-3 Kid, Bret Hart first takes on Doink then takes on Jerry Lawler and stuff with the Harts and Lawler was always very interesting, then Ludvig Borga destroyed Marty Jannetty, Undertaker took on Giant Gonzalez in another terrible match, The Smoking Gunns and Tatanka took on Bam Bam Bigelow and the Headshrinkers, and Yokozuna defended the

In [3]:
# 불용어 제거
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# 영어 불용어 리스트 로드
stop_words = set(stopwords.words('english'))

# 텍스트 정제 및 불용어 제거를 한 번에 처리하는 함수
def clean_and_remove_stopwords(text):
    
    # 소문자 변환 및 정제 (알파벳, 공백, '만 남김)
    text = text.lower()
    text = re.sub(r'[^a-z\s\']', '', text) 
    
    # 토큰화 (띄어쓰기 기준)
    tokens = text.split()
    
    # 불용어 제거
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    # 다시 하나의 문자열로 합치기
    return ' '.join(filtered_tokens)

print("전처리 시작...")
df['review'] = df['review'].apply(clean_and_remove_stopwords)

print("전처리 완료.")
print("\n--- 전처리 후 샘플 ---")
print(df['review'][0])

전처리 시작...
전처리 완료.

--- 전처리 후 샘플 ---
really liked summerslam due look arena curtains look overall interesting reason anyways could one best summerslam's ever wwf lex luger main event yokozuna time ok huge fat man vs strong man glad times changed terrible main event like every match luger terrible matches card razor ramon vs ted dibiase steiner brothers vs heavenly bodies shawn michaels vs curt hening event shawn named big monster body guard diesel irs vs kid bret hart first takes doink takes jerry lawler stuff harts lawler always interesting ludvig borga destroyed marty jannetty undertaker took giant gonzalez another terrible match smoking gunns tatanka took bam bam bigelow headshrinkers yokozuna defended world title lex luger match boring terrible ending however deserves


[nltk_data] Downloading package stopwords to /Users/cmjcm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# 1. TextVectorization 레이어 초기화
vectorize_layer = TextVectorization(
    standardize='lower_and_strip_punctuation', # 1. 소문자 변환, 2. 구두점 제거
    split='whitespace',                        # 3. 띄어쓰기 기준 토큰화
    max_tokens=VOCAB_SIZE,                     # 4. 단어 사전 크기 (OOK 토큰 포함)
    output_mode='int',                         # 5. 정수 인코딩
    output_sequence_length=MAX_LEN             # 6. 패딩 (길이 통일)
)

# 2. 전처리된 텍스트 데이터로 단어 사전(vocabulary) 생성
print("TextVectorization 레이어 어휘 학습(adapt) 시작...")
vectorize_layer.adapt(df['review'])
print("어휘 학습 완료.")

TextVectorization 레이어 어휘 학습(adapt) 시작...
어휘 학습 완료.


In [7]:
# 결과 확인하기

import numpy as np

# 1. 원본 텍스트 샘플 준비 (수동 전처리 완료된 상태)
sample_texts = df['review'].head(3).tolist()
print("--- [1] 원본 텍스트 샘플 ---")
for text in sample_texts:
    print(text)

# 2. vectorize_layer에 샘플 텍스트 통과
# (레이어 자체를 함수처럼 사용)
vectorized_output = vectorize_layer(sample_texts)

print("\n--- [2] Vectorize 결과 (정수 인코딩 + 패딩) ---")
print(vectorized_output)

# 3. numpy 배열로 변환하면 더 보기 편함
print("\n--- [3] Numpy 배열로 변환 ---")
print(vectorized_output.numpy())

# 4. 첫 번째 문장의 결과 확인 (MAX_LEN=200 가정)
print("\n--- [4] 첫 번째 문장 결과 (앞 20개, 뒤 20개) ---")
first_result = vectorized_output.numpy()[0]
print("앞부분:", first_result[:20])
print("뒷부분:", first_result[-20:])

--- [1] 원본 텍스트 샘플 ---
really liked summerslam due look arena curtains look overall interesting reason anyways could one best summerslam's ever wwf lex luger main event yokozuna time ok huge fat man vs strong man glad times changed terrible main event like every match luger terrible matches card razor ramon vs ted dibiase steiner brothers vs heavenly bodies shawn michaels vs curt hening event shawn named big monster body guard diesel irs vs kid bret hart first takes doink takes jerry lawler stuff harts lawler always interesting ludvig borga destroyed marty jannetty undertaker took giant gonzalez another terrible match smoking gunns tatanka took bam bam bigelow headshrinkers yokozuna defended world title lex luger match boring terrible ending however deserves
many television shows appeal quite many different kinds fans like farscape doesi know youngsters years oldfans male female many different countries think adore tv miniseries elements found almost every show tv character driven drama

In [8]:
# 레이어의 어휘 사전(단어 목록) 가져오기
vocab = vectorize_layer.get_vocabulary()

print(f"--- 총 어휘 사전 크기 (VOCAB_SIZE) ---")
print(len(vocab))

print("\n--- [1] 앞부분 단어 20개 ---")
# 0번: 패딩(PAD) 토큰 (표시는 안 될 수 있음)
# 1번: OOV(Out-of-Vocabulary) 토큰 ([UNK])
print(vocab[:20])

print("\n--- [2] 뒷부분 단어 20개 (빈도가 낮은 단어들) ---")
print(vocab[-20:])

print("\n--- [3] 'movie'라는 단어의 인덱스(정수) 찾기 ---")
try:
    print(vocab.index('movie'))
except ValueError:
    print("'movie'는 어휘 사전에 없습니다. (또는 VOCAB_SIZE 밖에 있음)")

--- 총 어휘 사전 크기 (VOCAB_SIZE) ---
10000

--- [1] 앞부분 단어 20개 ---
['', '[UNK]', 'movie', 'film', 'one', 'like', 'good', 'see', 'even', 'time', 'really', 'would', 'story', 'get', 'bad', 'much', 'people', 'well', 'first', 'great']

--- [2] 뒷부분 단어 20개 (빈도가 낮은 단어들) ---
['udo', 'ubisoft', 'uberrare', 'ubercoldness', 'tyrone', 'tyre', 'tyrant', 'typos', 'typo', 'typescooper', 'typecasting', 'tylos', 'tyloat', 'tykwer', 'tykes', 'tying', 'tyd', 'twos', 'tworeel', 'twoparter']

--- [3] 'movie'라는 단어의 인덱스(정수) 찾기 ---
2


In [None]:
import tensorflow as tf

# 2. 레이어를 내보내기(export) 위한 더미 모델 생성
# TextVectorization 레이어만 포함하는 모델을 만듭니다.
model_for_export = tf.keras.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorize_layer
])

# 3. 모델 저장 (이 폴더를 압축해서 전달)
model_for_export.save('model/vectorizer_layer_model.keras')

# load_model('model/vectorizer_layer_model') 을 통해 불러올 수 있습니다.