In [1]:
# https://pieriantraining.com/tensorflow-lstm-example-a-beginners-guide/
# https://www.tensorflow.org/guide/keras/working_with_rnns

In [2]:
import numpy as np
import pandas as pd
import matplotlib as plt
import tensorflow as tf
import keras
from keras.models import load_model
from keras import layers
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import joblib

In [3]:
dataset = pd.read_csv('../data/preprocessed_data.csv',encoding='ISO-8859-1')
dataset = dataset.drop(['Unnamed: 0'], axis=1)
dataset = dataset.dropna(subset=['Text'])
dataset

Unnamed: 0,Text,Source,Human
0,Federal law supersedes state law cannabis medi...,Bloom-7B,0
1,Miles feels restless working day decides go pa...,Bloom-7B,0
2,first danish means follow american politics mu...,Bloom-7B,0
3,paper present novel rulebased approach Runtime...,Bloom-7B,0
4,social progressives love democracy relatively ...,Bloom-7B,0
...,...,...,...
788917,vast expanse time echoes aeons mingle whispers...,YI-34B,0
788918,phenomenon brain drain particularly STEM field...,YI-34B,0
788919,Influence Climate Change Marine Ecosystems Cli...,YI-34B,0
788920,Title Case Limiting Car Usage Navigating Towar...,YI-34B,0


In [4]:
# Initialize the tokenizer
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(dataset['Text'])

In [5]:
# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(dataset['Text'])

# Pad sequences to ensure uniform input length
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')

In [6]:
# unitializing label encoder
label_encoder = LabelEncoder()
label_encoder.fit(dataset['Human'])

# labels to numerical format conversion
encoded_labels = label_encoder.transform(dataset['Human'])

In [7]:
# LSTM instantiation
lstm = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=100),
    LSTM(64, return_sequences=False),
    Dense(32, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')
])



In [8]:
lstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [9]:
# training phase
lstm.fit(padded_sequences, encoded_labels, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
[1m19723/19723[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m289s[0m 15ms/step - accuracy: 0.7562 - loss: 0.4516 - val_accuracy: 0.7762 - val_loss: 0.4933
Epoch 2/10
[1m19723/19723[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m246s[0m 12ms/step - accuracy: 0.8873 - loss: 0.2438 - val_accuracy: 0.8356 - val_loss: 0.4156
Epoch 3/10
[1m19723/19723[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 13ms/step - accuracy: 0.9309 - loss: 0.1631 - val_accuracy: 0.8722 - val_loss: 0.3716
Epoch 4/10
[1m19723/19723[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 13ms/step - accuracy: 0.9524 - loss: 0.1188 - val_accuracy: 0.8859 - val_loss: 0.3461
Epoch 5/10
[1m19723/19723[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 13ms/step - accuracy: 0.9630 - loss: 0.0970 - val_accuracy: 0.8858 - val_loss: 0.3835
Epoch 6/10
[1m19723/19723[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m253s[0m 13ms/step - accuracy: 0.9703 - loss: 0.0798 - val_accuracy: 0.8643 - val

<keras.src.callbacks.history.History at 0x26c9775c670>

In [11]:
lstm.save('../models/98_long_short_term_memory.keras')