# INITIALIZE PREDICTION MODEL

In [1]:
# Import Library
import io
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization  # tokenization|
import pickle

# Load the trained Keras toxicity classifier from its HDF5 file.
# NOTE(review): the output recorded for this cell is a "Weight count mismatch"
# ValueError for the `bidirectional` layer (9 weights expected, 6 saved), so
# this load fails as written. Presumably the file was saved with a different
# TensorFlow/Keras version than the one running here — TODO confirm, then pin
# the version or re-export the model.
model = tf.keras.models.load_model('../load_model_integration/toxic-v1.h5')

# Prediction labels; `predict` pairs them positionally with the columns of
# the model output.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Meaning of each label:
# toxic         -> toxic
# severe_toxic  -> severely toxic
# obscene       -> obscene / lewd
# threat        -> threat
# insult        -> offensive / insulting
# identity_hate -> personal hatred

# Load the pickled TextVectorization config and vocabulary. Both paths are
# relative to the current working directory.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# files that come from a trusted source.
with open('vectorizer_config.pkl', 'rb') as f:
    vectorizer_config = pickle.load(f)
with open('vectorizer_vocab.pkl', 'rb') as f:
    vectorizer_vocab = pickle.load(f)

# Rebuild the TextVectorization layer from the saved config, then restore its
# vocabulary.
vectorizer = TextVectorization.from_config(vectorizer_config)
vectorizer.set_vocabulary(vectorizer_vocab)


# Predict toxicity labels (threat, hate, ...) for one or more texts.
def predict(teks, *, threshold=0.5, maxlen=1800):
    """Classify texts with the toxicity model and return a plain-text report.

    Each text is vectorized with the module-level ``vectorizer``, padded to
    ``maxlen`` and scored by the module-level ``model``. For every text, each
    label in ``labels`` whose score is above ``threshold`` is reported.

    Args:
        teks: A single string, or an iterable of strings, to classify.
        threshold: Score above which a label counts as detected.
        maxlen: Sequence length passed to ``pad_sequences``.

    Returns:
        A string containing, per input text, a ``Text:`` / ``Prediction:``
        line pair for each detected label followed by a blank line, and then
        for each label its total number of detections and the indices of the
        texts in which it was detected.
    """
    if isinstance(teks, str):
        # A bare string would otherwise be iterated character by character.
        texts = [teks]
    else:
        # Materialize once: the input is walked twice (vectorizing, then
        # reporting), which would silently drop every row for a one-shot
        # iterator such as a generator.
        texts = list(teks)

    # For each label, the indices of the texts in which it was detected.
    label_index = {label: [] for label in labels}
    report = []

    # Skip the model entirely for empty input instead of handing it an
    # empty batch; the summary below then reports zero for every label.
    if texts:
        vectorized_texts = [vectorizer(text) for text in texts]
        # Pad all sequences to the same length.
        padded_texts = tf.keras.preprocessing.sequence.pad_sequences(
            vectorized_texts, maxlen=maxlen
        )
        predictions = model.predict(padded_texts)

        for i, (scores, text) in enumerate(zip(predictions, texts)):
            for label, score in zip(labels, scores):
                if score > threshold:
                    report.append(f"Text: {text}\n")
                    report.append(f"Prediction: {label} Value: {score}\n")
                    label_index[label].append(i)
            report.append("\n")

    for label in labels:
        detected = label_index[label]
        # Each detection appended exactly one index, so the list length is
        # the per-label total.
        report.append(f"Total {label} predictions: {len(detected)}\n")
        report.append(f"Index Kalimat yang terdeteksi {label}: {detected}\n")
        report.append("\n")

    return "".join(report)

ValueError: Weight count mismatch for layer #1 (named bidirectional in the current model, bidirectional in the save file). Layer expects 9 weight(s). Received 6 saved weight(s)

# INITIALIZE SPEECH API

In [None]:
# Import Library
from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech as cloud_speech_types
from google.oauth2 import service_account


def transkrip_dan_predict(project_id, api_key, audio_file):
    """Transcribe an audio file with Speech-to-Text v2 and classify the text.

    The file is streamed to the API in fixed-size byte chunks. Every
    transcript that comes back is passed to ``predict`` and the resulting
    report is printed.

    Args:
        project_id: Google Cloud project ID used to build the recognizer path.
        api_key: Path to a service-account JSON key file. Despite the name it
            is handed to ``Credentials.from_service_account_file``, so it is
            a file path rather than an API key string.
        audio_file: Path of the audio file to transcribe. The decoding is
            left to the API via ``AutoDetectDecodingConfig``.

    Returns:
        The list of transcripts, in the order they were received.
    """
    # Instantiate a client authenticated with the service-account file.
    credentials = service_account.Credentials.from_service_account_file(api_key)
    client = SpeechClient(credentials=credentials)

    # Read the whole file as bytes.
    with open(audio_file, "rb") as f:
        content = f.read()

    # Size of each streamed chunk, in bytes of the encoded file. The slices
    # are taken from the raw file content, so this is a byte count and not a
    # duration in samples.
    # NOTE(review): the streaming API limits the audio size per request;
    # confirm the current limit before raising this value.
    chunk_size = 16000

    recognition_config = cloud_speech_types.RecognitionConfig(
        auto_decoding_config=cloud_speech_types.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        model="long",
    )
    streaming_config = cloud_speech_types.StreamingRecognitionConfig(
        config=recognition_config
    )
    config_request = cloud_speech_types.StreamingRecognizeRequest(
        recognizer=f"projects/{project_id}/locations/global/recognizers/_",
        streaming_config=streaming_config,
    )

    def request_stream():
        # The configuration request is sent first; the audio chunks follow.
        yield config_request
        for start in range(0, len(content), chunk_size):
            yield cloud_speech_types.StreamingRecognizeRequest(
                audio=content[start: start + chunk_size]
            )

    # Transcribe the audio into text.
    responses_iterator = client.streaming_recognize(requests=request_stream())

    list_transkrip = []
    for response in responses_iterator:
        for result in response.results:
            # Guard the [0] lookup against a result without alternatives.
            if not result.alternatives:
                continue
            transcript = str(result.alternatives[0].transcript)
            list_transkrip.append(transcript)
            # predict() only returns its report; print it so the
            # classification is actually shown.
            print(predict([transcript]))

    return list_transkrip

# MAIN PROGRAM

In [None]:
# Transcribe the sample recording and run the toxicity prediction on it.
transkrip_dan_predict(
    project_id="data-science-programming-ti24",
    api_key="./gcloud_apikey.json",
    audio_file="./Donald Trump.mp3",
)