In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords

In [5]:
nepali_stopwords = set(stopwords.words('nepali'))
english_stopwords = set(stopwords.words('english'))

In [6]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove specified characters
    text = re.sub('[#\\/।(),०-९<<?!,—–’‘:\u200d]', '', text)
    # Strip double quotes
    text = text.strip('"')

    # Tokenize the text
    words=word_tokenize(text)
    # Remove stop words for both Nepali and English
    filtered_words = [word for word in words if word.lower() not in nepali_stopwords and word.lower() not in english_stopwords]
    # Join the filtered words to form the processed text
    processed_text = ' '.join(filtered_words)
    return processed_text


In [8]:


# Load the preprocessed data
df_final = pd.read_excel("dataset/updated_text_data.xlsx")
df_final['गुनासो'] = df_final['गुनासो'].apply(preprocess_text)

# Tokenization and Padding
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(df_final['गुनासो'])
X_seq = tokenizer.texts_to_sequences(df_final['गुनासो'])
X_padded = pad_sequences(X_seq, maxlen=max_len)

# Encoding Labels
le = LabelEncoder()
y = le.fit_transform(df_final['गुनासो वर्ग'])

# Define the LSTM model
embedding_dim = 50
lstm_units = 100
num_classes = len(np.unique(y))

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(units=lstm_units, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Split the data and train the model
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42, stratify=y)
epochs = 10
batch_size = 32
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Save the model
model.save("lstm_model.h5")

# Save the Tokenizer and Label Encoder
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    tokenizer_file.write(tokenizer.to_json())

with open('label_encoder.pkl', 'wb') as encoder_file:
    encoder_file.write(le.to_json())

# Make predictions on new data
new_text = 'मेरो घर अगाडी सडक दुर्घटना भएको ले अबरुधा छ | बाटो कहिले बन्छ ?'
new_text_seq = tokenizer.texts_to_sequences([new_text])
new_text_padded = pad_sequences(new_text_seq, maxlen=max_len)
predicted_class = np.argmax(model.predict(new_text_padded), axis=-1)
decoded_data = le.inverse_transform(predicted_class)
print(decoded_data)


Epoch 1/10


2023-12-08 21:52:29.585706: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fa1ac00d290 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-12-08 21:52:29.585740: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce GTX 1650, Compute Capability 7.5
2023-12-08 21:52:29.598313: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-12-08 21:52:29.670481: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8902
2023-12-08 21:52:29.766541: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 61.20%


  saving_api.save_model(


TypeError: a bytes-like object is required, not 'str'