In [None]:
import re    # for regular expressions
import nltk  # for text manipulation
from nltk.corpus import stopwords
import string # for text manipulation
import warnings
import pandas as pd # for data manipulation
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

# Let pandas show up to 200 characters per column so long text values are not cut off early.
pd.set_option("display.max_colwidth", 200)
# Silence all Python warnings raised from here on.
warnings.filterwarnings("ignore")

import os
# Walk ./sample_data recursively and print the path of every file found.
for folder, _, names in os.walk('./sample_data'):
    for name in names:
        print(os.path.join(folder, name))

%matplotlib inline

In [None]:
# Load the preprocessed dataset (comma-separated, which is read_csv's default)
# and preview the first rows.
data = pd.read_csv("./sample_data/processed_full.csv")
data.head()

In [None]:
# Keep only the first 20000 rows of the dataset.
data = data.head(20000)

In [None]:
# Column names used by the rest of the notebook: later cells read "target" as
# the labels and "text" as the input documents.
# NOTE(review): assumes the CSV has exactly two columns in this order — confirm.
DATASET_COLUMNS = ["target", "text"]
data.columns = DATASET_COLUMNS
data.head()

In [None]:
# Total number of missing cells across all columns.
nan_count = data.isna().to_numpy().sum()
nan_count

In [None]:
# Discard rows whose text is missing; other columns are not checked here.
data = data.dropna(subset=['text'])

In [None]:
def tokenize_texts(texts_list):
    """Convert whitespace-separated texts into lists of integer word ids.

    Args:
        texts_list: Iterable of texts. Every item is passed through ``str()``
            before splitting, so non-string values (numbers, NaN) do not raise.

    Returns:
        tuple: ``(tokenized_texts, word_to_index)``. ``tokenized_texts`` holds
        one list of ids per input text, in input order. ``word_to_index`` maps
        each distinct word to its id.

    Ids start at 1 so that 0 never denotes a real word and stays free for
    padding. They are assigned in sorted word order, which makes the mapping
    identical on every run (plain set iteration order is not).
    """
    # Split every text exactly once: this supports one-shot iterables and
    # guarantees the vocabulary and the encoding pass see the same words.
    split_texts = [str(text).split() for text in texts_list]

    # Assign an integer id to every distinct word.
    vocabulary = sorted({word for words in split_texts for word in words})
    word_to_index = {word: index for index, word in enumerate(vocabulary, start=1)}

    # Encode each text as its sequence of word ids.
    tokenized_texts = [
        [word_to_index[word] for word in words] for words in split_texts
    ]

    return tokenized_texts, word_to_index

# Separate the labels from the input texts, then encode the texts as id sequences.
labels = data['target']
texts = data['text']
tokenized_texts, word_to_index = tokenize_texts(texts)

In [None]:
def pad_tokenized_texts(tokenized_texts, max_length=None):
    """Return copies of the sequences, all brought to the same length.

    Args:
        tokenized_texts: Iterable of id sequences.
        max_length: Target length. When omitted (or falsy), the length of the
            longest sequence is used.

    Returns:
        list: New lists of exactly ``max_length`` items each. Shorter
        sequences are right-padded with 0; longer ones are cut at
        ``max_length`` so the result is never ragged. Empty input gives ``[]``.

    The input sequences are left unmodified.
    """
    # Copy up front so the caller's lists are never extended in place and
    # one-shot iterables can be measured and then padded.
    sequences = [list(sequence) for sequence in tokenized_texts]

    if not max_length:
        # default=0 keeps an empty input from raising ValueError.
        max_length = max((len(sequence) for sequence in sequences), default=0)

    padded_texts = []
    for sequence in sequences:
        clipped = sequence[:max_length]
        padded_texts.append(clipped + [0] * (max_length - len(clipped)))

    return padded_texts

# No max_length is passed, so every sequence is zero-padded to the length of the longest one.
padded_texts = pad_tokenized_texts(tokenized_texts)

In [None]:
vocab_size = len(word_to_index) + 1  # +1 added because the value 0 is used for padding.
# All padded sequences share one length, so the first one gives the model's input length.
max_length = len(padded_texts[0])

In [None]:
# Fully connected classifier: a vocab_size-wide ReLU layer, a 10-unit ReLU
# layer and a 5-way softmax output.
# NOTE(review): `model` is reassigned by the Embedding-based network in the
# following cell, so this network is never trained as the notebook stands.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(units=vocab_size, activation='relu'))
model.add(tf.keras.layers.Dense(units=10, activation='relu'))
# There are 5 classes, hence 5 output neurons with a softmax activation.
model.add(tf.keras.layers.Dense(units=5, activation='softmax'))

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Embedding -> average over the sequence axis -> small dense classifier.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 16, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(units=24, activation='relu'),
    # There are 5 classes, hence 5 output neurons with a softmax activation.
    tf.keras.layers.Dense(units=5, activation='softmax'),
])

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Convert the label column to a NumPy array; it is passed to model.fit as the targets.
labels_array = np.array(labels)

In [None]:
# Train for 5 epochs, holding out the last 20% of the samples for validation.
model.fit(
    x=np.array(padded_texts),
    y=labels_array,
    validation_split=0.2,
    epochs=5,
)

In [None]:
!pip install tf2onnx
!pip install onnx

In [None]:
import tf2onnx
import onnx

In [None]:
# Export the trained Keras model to ONNX (opset 13) with one input named 'x'
# of shape (batch, max_length).
# NOTE(review): the signature declares float32 although the model is fit on
# integer token ids — confirm this is what ONNX consumers expect.
input_signature = [
    tf.TensorSpec(shape=[None, max_length], dtype=tf.float32, name='x'),
]
onnx_model, _ = tf2onnx.convert.from_keras(
    model, input_signature=input_signature, opset=13
)
onnx.save(onnx_model, "./sample_data/model.onnx")

In [None]:
from onnx import numpy_helper
# Reload the exported model from disk and collect its graph initializers
# (the stored tensors) as NumPy arrays keyed by initializer name.
onnx_model = onnx.load("./sample_data/model.onnx")
INTIALIZERS = onnx_model.graph.initializer
onnx_weights = {
    tensor.name: numpy_helper.to_array(tensor) for tensor in INTIALIZERS
}

In [None]:
# Display the initializer-name -> NumPy array mapping extracted from the ONNX graph.
onnx_weights