In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! pip install transformers



In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import TFBertModel, BertTokenizer
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow.keras.backend as K
import random
import nltk
import tqdm
import re, os

In [None]:
# Load the preprocessed Video Games review table from Drive and preview it.
VIDEO_GAMES_CSV = '/content/drive/MyDrive/박선우/Data/preprocessing_data/Video_Games.csv'
amzn = pd.read_csv(VIDEO_GAMES_CSV)
print(amzn.shape)  # (rows, columns)
amzn.head()

(581465, 5)


Unnamed: 0,reviewerID,asin,reviewText,overall,8000_words
0,A19GOZTT15KPG1,439381673,oredered daughter want play oregon trail year ...,5.0,daughter want play trail year ago blast laugh ...
1,A1441WFJ5KRP7J,439381673,well think since idem review would make get ga...,5.0,well think since review would make get game so...
2,ARNF05LJD98X2,439381673,though game older absolutely love nephew year ...,5.0,though game older absolutely love nephew year ...
3,A1HP7NVNPFMA4N,700026657,game bite hard get hang great,5.0,game bite hard get hang great
4,A1JGAP0185YJI6,700026657,play alright steam bite trouble move game stea...,4.0,play alright steam bite trouble move game stea...


In [None]:
# Some reviews are missing and are read as float NaN. Replace them with an empty
# string before casting: a bare astype(str) turns NaN into the literal text 'nan',
# which the tokenizers further down would treat as a real one-word review.
amzn['reviewText'] = amzn['reviewText'].fillna('').astype(str)

In [None]:
def get_review_length(train_text):
    """Measure review lengths in word tokens.

    A fresh Keras ``Tokenizer`` is fitted on ``train_text``, every text is
    converted to its integer sequence, and the sequence lengths are collected.

    Returns:
        ``(max_length, review_length_90)``: the 1.0 quantile (i.e. the longest
        sequence) and the 0.9 quantile of the lengths, both converted with
        ``int()``.
    """
    length_tokenizer = Tokenizer()
    length_tokenizer.fit_on_texts(train_text)

    token_counts = [len(seq) for seq in length_tokenizer.texts_to_sequences(train_text)]

    longest, ninetieth = (int(np.quantile(token_counts, q)) for q in (1.0, 0.9))
    return longest, ninetieth

In [None]:
# Longest review and 0.9-quantile review length (in tokens) over the whole table.
max_length, review_length_90 = get_review_length(amzn['reviewText'])

In [None]:
# Display both statistics; the recorded run printed (3004, 152).
max_length, review_length_90

(3004, 152)

In [None]:
# Keras tokenizer stored under the module-level name `tokenizer`;
# text_padding() below reads this global rather than taking it as an argument.
tokenizer = Tokenizer()
# Target sequence length passed to pad_sequences via text_padding().
max_len = 512

def tokenizing(tokenizer,df=None):
  '''
  Fit ``tokenizer`` on the preprocessed reviews and integer-encode them.

  Args:
    tokenizer: object exposing ``fit_on_texts``, ``texts_to_sequences`` and
      ``word_index`` (a Keras ``Tokenizer`` in this notebook).
    df: DataFrame with a ``reviewText`` column. Although it defaults to None,
      a DataFrame is required; None fails on the column lookup.

  Returns:
    (text_sequences, word_vocab): the integer sequence of every review, and a
    word -> index mapping extended with the padding entry ``'<PAD>': 0``.
  '''
  train_review = df['reviewText'].astype(str)
  tokenizer.fit_on_texts(train_review)
  text_sequences = tokenizer.texts_to_sequences(train_review)

  # Work on a copy: tokenizer.word_index is the tokenizer's own lookup table, and
  # inserting '<PAD>' into it directly would alter the tokenizer for later calls.
  word_vocab = dict(tokenizer.word_index)
  word_vocab['<PAD>'] = 0

  return text_sequences, word_vocab


def text_padding(max_len,df=None,text_tokenizer=None):
  '''
  Integer-encode the reviews and bring every sequence to the same length.

  Args:
    max_len: target length handed to ``pad_sequences`` as ``maxlen``.
    df: DataFrame with a ``reviewText`` column (forwarded to ``tokenizing``).
    text_tokenizer: tokenizer to fit and encode with. When omitted, the
      module-level ``tokenizer`` is used, as before. Pass it explicitly when
      that global may have been rebound to a different kind of tokenizer.

  Returns:
    (text_inputs, word_vocab): the post-padded integer array and the
    vocabulary returned by ``tokenizing``.
  '''
  if text_tokenizer is None:
    # Backward-compatible default: whatever the global `tokenizer` currently is.
    text_tokenizer = tokenizer
  text_sequences, word_vocab = tokenizing(text_tokenizer,df)
  text_inputs = pad_sequences(text_sequences, maxlen=max_len, padding='post')
  print(f'shape of train_data : {text_inputs.shape}')

  return text_inputs, word_vocab


# Integer-encode every review and pad the sequences with maxlen=max_len.
text_inputs, word_vocab = text_padding(max_len, amzn)

shape of train_data : (581465, 512)


In [None]:
# Vocabulary metadata; vocab_size is the number of entries in word_vocab.
data_configs = {
    'vocab': word_vocab,
    'vocab_size': len(word_vocab),
}

In [None]:
# Preprocessing for BERT
# 'bert-base-uncased' ships a lowercase-only vocabulary, so its tokenizer must
# lowercase the input (do_lower_case=True). With False, capitalised words do not
# match the vocabulary and degrade into unknown / fragmented word pieces.
# NOTE: this rebinds the module-level name `tokenizer` (previously the Keras Tokenizer).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', cache_dir='bert_ckpt', do_lower_case=True)

def bert_tokenizer(sent, max_len):
  '''
  Encode one sentence for BERT using the module-level ``tokenizer``.

  What ``encode_plus`` is asked to do:
  1. Tokenize the sentence.
  2. add_special_tokens=True --> put [CLS] at the start and [SEP] at the end.
  3. Convert every token to its vocabulary index.
  4. max_length + padding='max_length' --> pad every sequence to exactly ``max_len`` ids.
  5. truncation=True --> cut sequences that are longer than ``max_len``.
  6. return_attention_mask=True --> also return the attention mask.
  7. return_token_type_ids=True --> also return the segment ids
     (0 for a single sentence; 0 and 1 when a sentence pair is encoded).

  Args:
    sent: the text to encode.
    max_len: fixed output length.

  Returns:
    (input_id, attention_mask, token_type_id) taken from the encoded dict.
  '''
  encoded_dict = tokenizer.encode_plus(
                                       text = sent, add_special_tokens = True, max_length = max_len,
                                       # `pad_to_max_length=True` is deprecated; this is its replacement.
                                       padding = 'max_length',
                                       return_attention_mask = True, return_token_type_ids = True,
                                       truncation = True
                                      )

  input_id = encoded_dict['input_ids']
  attention_mask = encoded_dict['attention_mask']
  token_type_id = encoded_dict['token_type_ids']

  return input_id, attention_mask, token_type_id

def clean_text(sent):
  '''Return ``sent`` with every character other than a-z, A-Z, 0-9 replaced by one space.'''
  non_alnum = re.compile('[^a-zA-Z0-9]')
  return non_alnum.sub(' ', sent)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# 80/20 train/test split with a fixed seed so the split is reproducible.
amzn_train, amzn_test = train_test_split(amzn, test_size=0.2, shuffle=True, random_state=42)
input_ids, attention_masks = [], []
token_type_ids, data_labels = [], []
# Positions (0-based, within amzn_train) of reviews that could not be encoded.
# A skipped review produces no row in the lists above, so anything aligned with
# amzn_train row-by-row (e.g. labels) must drop these positions as well.
train_failed_positions = []

train_reviews = amzn_train['reviewText'].astype(str)
for row_pos, amzn_sent in enumerate(tqdm.tqdm(train_reviews, total=len(amzn_train))):
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer(amzn_sent, 512)
    except Exception as e:
        # Best effort: report the offending review and keep going, but remember it.
        print(e)
        print(amzn_sent)
        train_failed_positions.append(row_pos)
        continue

    input_ids.append(input_id)
    attention_masks.append(attention_mask)
    token_type_ids.append(token_type_id)

if train_failed_positions:
    print(f'WARNING: {len(train_failed_positions)} train reviews were skipped; '
          f'encoded arrays have {len(input_ids)} rows vs {len(amzn_train)} in amzn_train')

100%|██████████| 465172/465172 [10:05<00:00, 768.51it/s]


In [None]:
# Convert the collected id lists into an integer array and display it.
# (The same conversion is performed again in the following cell.)
amzn_train_input_ids = np.array(input_ids, dtype=int)
amzn_train_input_ids

array([[  101, 14212, 15493, ...,     0,     0,     0],
       [  101,  4031,  6235, ...,     0,     0,     0],
       [  101,  2521,  5409, ...,     0,     0,     0],
       ...,
       [  101,  2428,  2066, ...,     0,     0,     0],
       [  101,  5470,  7986, ...,     0,     0,     0],
       [  101, 25546,  7646, ...,     0,     0,     0]])

In [None]:
# Convert the three per-review lists into integer arrays.
amzn_train_input_ids = np.array(input_ids, dtype=int)
amzn_train_attention_masks = np.array(attention_masks, dtype=int)
amzn_train_type_ids = np.array(token_type_ids, dtype=int)
amzn_train_inputs = (amzn_train_input_ids, amzn_train_attention_masks, amzn_train_type_ids)

# Pass the path to np.save so NumPy opens AND closes the file itself; wrapping it in
# a bare open(..., 'wb') leaves the handle unclosed. The names already end in .npy,
# so np.save writes to exactly these paths.
np.save('/content/drive/MyDrive/BERT_Encoding/Video_Games_bert_train_input_ids.npy', amzn_train_input_ids)
np.save('/content/drive/MyDrive/BERT_Encoding/Video_Games_bert_train_attention_masks.npy', amzn_train_attention_masks)
np.save('/content/drive/MyDrive/BERT_Encoding/Video_Games_bert_train_type_ids.npy', amzn_train_type_ids)

In [None]:
# Reset the accumulators and encode the held-out test reviews.
input_ids, attention_masks = [], []
token_type_ids, data_labels = [], []
# Positions (0-based, within amzn_test) of reviews that could not be encoded;
# rows aligned with amzn_test must drop these positions too.
test_failed_positions = []

test_reviews = amzn_test['reviewText'].astype(str)
# The progress total is the size of the TEST split (it used to be len(amzn_train),
# which made the bar stop at ~25%).
for row_pos, amzn_sent in enumerate(tqdm.tqdm(test_reviews, total=len(amzn_test))):
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer(amzn_sent, 512)
    except Exception as e:
        # Best effort: report the offending review and keep going, but remember it.
        print(e)
        print(amzn_sent)
        test_failed_positions.append(row_pos)
        continue

    input_ids.append(input_id)
    attention_masks.append(attention_mask)
    token_type_ids.append(token_type_id)

if test_failed_positions:
    print(f'WARNING: {len(test_failed_positions)} test reviews were skipped; '
          f'encoded arrays have {len(input_ids)} rows vs {len(amzn_test)} in amzn_test')

amzn_test_input_ids = np.array(input_ids, dtype=int)
amzn_test_attention_masks = np.array(attention_masks, dtype=int)
amzn_test_type_ids = np.array(token_type_ids, dtype=int)
amzn_test_inputs = (amzn_test_input_ids, amzn_test_attention_masks, amzn_test_type_ids)

 25%|██▌       | 116293/465172 [02:28<07:26, 781.04it/s]


In [None]:
# Pass the path to np.save so NumPy opens AND closes the file itself; wrapping it in
# a bare open(..., 'wb') leaves the handle unclosed. The names already end in .npy,
# so np.save writes to exactly these paths.
np.save('/content/drive/MyDrive/BERT_Encoding/Video_Games_bert_test_input_ids.npy', amzn_test_input_ids)
np.save('/content/drive/MyDrive/BERT_Encoding/Video_Games_bert_test_attention_masks.npy', amzn_test_attention_masks)
np.save('/content/drive/MyDrive/BERT_Encoding/Video_Games_bert_test_type_ids.npy', amzn_test_type_ids)

In [None]:
# Print the shape of the encoded test matrix next to the shape of the test table.
print(amzn_test_input_ids.shape, amzn_test.shape)

(116293, 512) (116293, 5)
