In [18]:
import tensorflow as tf
from tensorflow.keras.utils import plot_model, text_dataset_from_directory
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import InputLayer, Dense, Flatten, Conv2D, MaxPool2D, Dropout, Embedding, GlobalAveragePooling1D, Activation
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy, BinaryCrossentropy

import os
import re
import shutil
import string
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Report the installed TensorFlow version alongside the notebook output.
print(f'tensorflow version is {tf.__version__}')

tensorflow version is 2.9.0


In [3]:
# Dataset name; reused below as the get_file file name and cache sub-folder.
file = 'stack_overflow_16k'
# Archive location; the archive file name is the dataset name plus '.tar.gz'.
url = f'https://storage.googleapis.com/download.tensorflow.org/data/{file}.tar.gz'

In [5]:
# Note from the original author (translated): something looks odd here -- the
# directory layout only comes out as intended after running get_file with
# cache_subdir=file and then once more with cache_subdir=''.
dataset = tf.keras.utils.get_file(
    file,
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir=file,
)

In [6]:
# Note from the original author (translated): something looks odd -- the
# directory layout only comes out as intended after calling get_file with
# cache_subdir=file and then again with cache_subdir=''.
#
# The get_file call in the previous cell already left train/, test/ and the
# archive inside ./<file> (see the directory listing further down), so the
# dataset root can be named directly. The second get_file call that used to be
# here was only needed for the path it returned, and presumably fetched the
# archive a second time because no copy exists at ./<file>.tar.gz
# (TODO confirm against get_file's caching rules).
# This assignment is idempotent: re-running the cell yields the same path.
dataset = os.path.join('.', file)

In [7]:
# Display the dataset path that the following cells build on.
dataset

'./stack_overflow_16k'

In [11]:
# List the entries found directly under the dataset path.
os.listdir(dataset)

['test', 'stack_overflow_16k.tar.gz', 'README.md', 'train']

In [12]:
# Path of the 'train' sub-folder under the dataset path.
train_dir = os.path.join(dataset, 'train')

In [13]:
# Path of the 'test' sub-folder under the dataset path.
test_dir = os.path.join(dataset, 'test')

In [16]:
# Display the training folder path as a sanity check.
train_dir

'./stack_overflow_16k/train'

In [20]:
# seed: passed to text_dataset_from_directory for the training and validation
# subsets below, so both calls use the same value.
seed = 111
# batch_size: passed to every text_dataset_from_directory call below.
batch_size = 32

In [22]:
# Training and validation subsets are both read from train_dir with the same
# split fraction and seed; only the `subset` argument differs between them.
_split_kwargs = dict(batch_size=batch_size, validation_split=0.2, seed=seed)
raw_train = text_dataset_from_directory(train_dir, subset='training', **_split_kwargs)
raw_val = text_dataset_from_directory(train_dir, subset='validation', **_split_kwargs)

# The test folder is loaded as a whole: no split and no explicit seed.
raw_test = text_dataset_from_directory(test_dir, batch_size=batch_size)

Found 8000 files belonging to 4 classes.
Using 6400 files for training.
Found 8000 files belonging to 4 classes.
Using 1600 files for validation.
Found 8000 files belonging to 4 classes.


In [23]:
def custom_standardization(input_data):
    """Lowercase the text, replace '<br />' with a space and delete punctuation.

    Used as the `standardize` callable of the TextVectorization layer
    configured in this notebook.

    Args:
        input_data: string tensor to clean; this function does not constrain
            its shape.

    Returns:
        The string tensor produced by applying, in order, tf.strings.lower,
        the '<br />' replacement and the punctuation removal.
    """
    # Character class matching any single character of string.punctuation.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_pattern, '')

In [24]:
# max_features is handed to the layer as max_tokens (vocabulary size cap);
# sequence_length is handed over as output_sequence_length, the fixed number of
# token ids produced per example (shorter inputs are padded, longer ones
# truncated, per the TextVectorization documentation).
max_features = 10000
sequence_length = 250

vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int',
)

In [26]:
def vectorize_text(text, label):
    """Run `text` through `vectorize_layer` and pass `label` through untouched.

    Args:
        text: string tensor; a trailing axis of size 1 is appended before the
            layer is called.
        label: returned exactly as received.

    Returns:
        Tuple `(vectorized_text, label)` where `vectorized_text` is the output
        of `vectorize_layer` for the expanded text.
    """
    expanded_text = tf.expand_dims(text, -1)
    vectorized_text = vectorize_layer(expanded_text)
    return vectorized_text, label

In [27]:
# Pull one (texts, labels) batch from raw_train for inspection.
text_batch, label_batch = next(iter(raw_train))

2022-06-15 08:19:10.729291: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [40]:
# Inspect one example from the batch: its raw text and the class name that its
# integer label maps to via raw_train.class_names.
first_review = text_batch[0]
first_label = label_batch[0]
print('first review: ', first_review.numpy())
print('first label: ', raw_train.class_names[first_label])

first riview:  b'"comparing 2 int arrays can anyone tell me whats wrong in this method and why it gives a nullpointerexception ?..public boolean check(){..    scanner scan = new scanner(system.in);..    int[] arr1 = new int []{1,2,3};.    int[] arr2 = new int[]{};..    for(int i = 0;i&lt;arr1.length;i++).    {.        system.out.println(""enter numbers to check"");.        arr2[i] = scan.nextint();.    }..    if(arr1 == arr2).        return true;..    return false;.}"\n'
first label:  java


In [48]:
# Keep only the text component of each (text, label) pair, then let the layer
# build its vocabulary from that training text via adapt().
text_for_word_tabel = raw_train.map(lambda text, _label: text)
vectorize_layer.adapt(text_for_word_tabel)

2022-06-15 12:35:12.831138: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [49]:
# Vectorize the single example picked above to inspect the resulting token ids.
vectorize_text(first_review, first_label)

(<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
 array([[1031,   59,   29,  657,   34,  289,  414,   74,  476,  149,    7,
           13,   66,    8,  109,   10,  400,    5, 1790,   19,  264,  184,
          245, 1267,   15,  442,   29,    1,   15,   29, 1118,   29, 5994,
           15,   29,  410,    3,    1,  547,  188,    4,  184,    1, 2154,
            1, 5994,   25,   91,   25,  101,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0, 

In [50]:
# Display the label tensor of the same example.
first_label

<tf.Tensor: shape=(), dtype=int32, numpy=1>