In [48]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
import pandas as pd
import tensorflow_datasets as tfds
import tensorflow.keras as keras
from keras import layers
from tensorflow.keras.layers import LSTM
import pickle
from collections import OrderedDict
import json
import re


def replace_polish_characters(text):
    replacements = {
        'ą': 'a', 'ć': 'c', 'ę': 'e', 'ł': 'l', 'ń': 'n', 'ó': 'o', 'ś': 's', 'ź': 'z', 'ż': 'z',
        'Ą': 'A', 'Ć': 'C', 'Ę': 'E', 'Ł': 'L', 'Ń': 'N', 'Ó': 'O', 'Ś': 'S', 'Ź': 'Z', 'Ż': 'Z'
    }
    for polish_char, replacement in replacements.items():
        text = tf.strings.regex_replace(text, polish_char, replacement)
    return text


def clean_text(text):
    text = replace_polish_characters(text)
    text = tf.strings.regex_replace(text, r'[^a-zA-Z0-9\s]', '')
    return text

# Load dataset
ds = tf.data.TextLineDataset("Dataset2.csv")

ds_size = sum(1 for _ in ds) # dataset size

ds = ds.shuffle(buffer_size=ds_size, reshuffle_each_iteration=False)

train_size = int(0.90 * ds_size)

ds_train = ds.take(train_size)
ds_test = ds.skip(train_size)

print("Liczność zbioru treningowego:", sum(1 for _ in ds_train))
print("Liczność zbioru testowego:", sum(1 for _ in ds_test))

tokenizer = tfds.deprecated.text.Tokenizer()

def build_vocabulary(ds_train, treshhold=2):
    frequencies = {}
    vocabulary = set()
    vocabulary.update(["sostoken"])
    vocabulary.update(["eostoken"])
    
    for line in ds_train.skip(1):
        split_line = tf.strings.split(line, ";", maxsplit=2)
        article = split_line[2]
        clean_article = clean_text(tf.strings.lower(article))
        tokenized_text = tokenizer.tokenize(tf.get_static_value(clean_article))
        
        for word in tokenized_text:
            if word not in frequencies:
                frequencies[word] = 1
            else:
                frequencies[word] += 1
            
            if frequencies[word] == treshhold:
                vocabulary.update(tokenized_text)
        
    return vocabulary

# Create and save vocabulary
#vocabulary = build_vocabulary(ds_train)

#with open("vocabulary.json", "w") as vocab_file:
#    json.dump(list(vocabulary), vocab_file)
    
with open("vocabulary.json", "r") as vocab_file:
    vocabulary = json.load(vocab_file)

encoder = tfds.deprecated.text.TokenTextEncoder(
    list(vocabulary), oov_token="<UNK>", lowercase=True, tokenizer=tokenizer,
)

def encode_map_fn(line):
    split_line = tf.strings.split(line, ";", maxsplit=2)
    label_str = split_line[1] # AI, Human
    article = "sostoken " + split_line[2] + " eostoken"
    clean_article = clean_text(tf.strings.lower(article))
    label = tf.cond(tf.equal(label_str, "ai"), lambda: tf.constant(1), lambda: tf.constant(0))
    
    encoded_text, label = tf.py_function(
        my_encoder, 
        inp=[clean_article, label], 
        Tout=(tf.int64, tf.int32)
    )
    
    encoded_text.set_shape([None])
    label.set_shape([])
    
    return encoded_text, label

def my_encoder(text_tensor, label):
    encoded_text = encoder.encode(text_tensor.numpy().decode('utf-8'))
    return encoded_text, label

AUTOTUNE = tf.data.experimental.AUTOTUNE
ds_train = ds_train.map(encode_map_fn, num_parallel_calls=AUTOTUNE).cache()
ds_train = ds_train.shuffle(300)
ds_train = ds_train.padded_batch(32, padded_shapes=([None],()))

ds_test = ds_test.map(encode_map_fn)
ds_test = ds_test.padded_batch(32, padded_shapes=([None], ()))

model = keras.Sequential(
    [
        layers.Embedding(input_dim=len(vocabulary)+2, output_dim=32),
        layers.Bidirectional(LSTM(256, return_sequences=True)),
        layers.Dropout(0.5),
        layers.Bidirectional(LSTM(128, return_sequences=True)),
        #layers.Dropout(0.5),
        #layers.Bidirectional(LSTM(64)),
       # layers.Bidirectional(LSTM(32, return_sequences=True)),
        layers.Dropout(0.5),
        layers.GlobalAveragePooling1D(),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(32, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ]
)

model.compile(
    loss=keras.losses.BinaryCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=["accuracy"],
)

model.fit(ds_train, epochs=20, verbose=2) 
model.evaluate(ds_test)


Liczność zbioru treningowego: 1233
Liczność zbioru testowego: 138
Epoch 1/20
39/39 - 56s - 1s/step - accuracy: 0.4728 - loss: 0.6781
Epoch 2/20
39/39 - 43s - 1s/step - accuracy: 0.5069 - loss: 0.6780
Epoch 3/20
39/39 - 46s - 1s/step - accuracy: 0.5053 - loss: 0.6786
Epoch 4/20
39/39 - 47s - 1s/step - accuracy: 0.5142 - loss: 0.6774
Epoch 5/20
39/39 - 48s - 1s/step - accuracy: 0.5215 - loss: 0.6751
Epoch 6/20
39/39 - 46s - 1s/step - accuracy: 0.5580 - loss: 0.6711
Epoch 7/20
39/39 - 44s - 1s/step - accuracy: 0.5839 - loss: 0.6321
Epoch 8/20
39/39 - 45s - 1s/step - accuracy: 0.6699 - loss: 0.5873
Epoch 9/20
39/39 - 46s - 1s/step - accuracy: 0.7802 - loss: 0.5494
Epoch 10/20
39/39 - 45s - 1s/step - accuracy: 0.6732 - loss: 0.6099
Epoch 11/20
39/39 - 47s - 1s/step - accuracy: 0.5069 - loss: 0.6817
Epoch 12/20
39/39 - 49s - 1s/step - accuracy: 0.4964 - loss: 0.6771
Epoch 13/20
39/39 - 48s - 1s/step - accuracy: 0.5028 - loss: 0.6692
Epoch 14/20
39/39 - 49s - 1s/step - accuracy: 0.5296 - loss

[0.5091013312339783, 0.7898550629615784]

In [36]:
tf.keras.models.save_model(model, 'GPDT(94).keras') #zapis

In [28]:
# Load the saved model
model = tf.keras.models.load_model('model_testowy(90%).keras')

# Iterate through the layers and print their configuration
for layer in model.layers:
    print(layer.name, layer.get_config())


embedding {'name': 'embedding', 'trainable': True, 'dtype': 'float32', 'input_dim': 20675, 'output_dim': 32, 'embeddings_initializer': {'module': 'keras.initializers', 'class_name': 'RandomUniform', 'config': {'minval': -0.05, 'maxval': 0.05, 'seed': None}, 'registered_name': None}, 'embeddings_regularizer': None, 'activity_regularizer': None, 'embeddings_constraint': None, 'mask_zero': False}
lstm {'name': 'lstm', 'trainable': True, 'dtype': 'float32', 'return_sequences': True, 'return_state': False, 'go_backwards': False, 'stateful': False, 'unroll': False, 'zero_output_for_mask': False, 'units': 256, 'activation': 'tanh', 'recurrent_activation': 'sigmoid', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'recurrent_initializer': {'module': 'keras.initializers', 'class_name': 'OrthogonalInitializer', 'config': {'gain': 1.0, 'seed': None}, 'registered_name': None}, 'bias_initiali