In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

In [3]:
db=pd.read_csv('Cleaned_Nepali_dataset.csv')
db.head()

Unnamed: 0,Text,Target
0,गुठी विधेक ल्याएर ठमेल राज गुठि जग्गा छाया सेन...,0
1,दले देश सकेछन बेचे खान सुरू गरेछन दले लखेटनु पछ ।,1
2,नेपाल ससकृती ध्वस्त पार्ने योजना !,1
3,मठ मन्दिर गुम्बा जग्गा हरु भुमाफिया नजर परे हु...,1
4,नेपाल कल कर्खाना नदि नाला बेची सके मठ मन्दीर ब...,1


In [4]:

db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2639 entries, 0 to 2638
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    2639 non-null   object
 1   Target  2639 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 41.4+ KB


In [5]:
db.describe()

Unnamed: 0,Target
count,2639.0
mean,1.275862
std,1.379088
min,0.0
25%,0.0
50%,1.0
75%,2.0
max,5.0


In [6]:
db.isnull().sum()

Text      0
Target    0
dtype: int64

# Preprocessing

In [7]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [8]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\shres\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shres\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shres\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
stop_word = nltk.corpus.stopwords.words('nepali')
print('Number of stop words', len(stop_word))

Number of stop words 255


In [10]:
import re

def tokenize(text):
    if not isinstance(text, str):
        text = str(text)

    # Lowercase
    text = text.lower()

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Strip whitespace
    text = text.strip()

    # Sentence tokenization
    sentences = nltk.sent_tokenize(text)

    # Word tokenization and remove stopwords
    stop_words = nltk.corpus.stopwords.words('nepali')
    words = [
        word for sentence in sentences
        for word in nltk.word_tokenize(sentence)
        if word.lower() not in stop_words
    ]

    return words


In [11]:

db['text_tokenize']=db['Text'].apply(tokenize)
db.head()

Unnamed: 0,Text,Target,text_tokenize
0,गुठी विधेक ल्याएर ठमेल राज गुठि जग्गा छाया सेन...,0,"[गठ, वधक, लयएर, ठमल, रज, गठ, जगग, छय, सनटर, जस..."
1,दले देश सकेछन बेचे खान सुरू गरेछन दले लखेटनु पछ ।,1,"[दल, दश, सकछन, बच, खन, सर, गरछन, दल, लखटन, पछ]"
2,नेपाल ससकृती ध्वस्त पार्ने योजना !,1,"[नपल, ससकत, धवसत, परन, यजन]"
3,मठ मन्दिर गुम्बा जग्गा हरु भुमाफिया नजर परे हु...,1,"[मठ, मनदर, गमब, जगग, हर, भमफय, नजर, पर, हन, वधक]"
4,नेपाल कल कर्खाना नदि नाला बेची सके मठ मन्दीर ब...,1,"[नपल, कल, करखन, नद, नल, बच, सक, मठ, मनदर, बच, ..."


# vectorization

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [13]:
db["text_tokenize"] = db["text_tokenize"].apply(lambda tokens: " ".join(tokens))

In [14]:
vectorizer = TfidfVectorizer()
text_idf = vectorizer.fit_transform(db["text_tokenize"])

In [15]:

db["text_idf"] = list(text_idf.toarray())
db.head(3)

Unnamed: 0,Text,Target,text_tokenize,text_idf
0,गुठी विधेक ल्याएर ठमेल राज गुठि जग्गा छाया सेन...,0,गठ वधक लयएर ठमल रज गठ जगग छय सनटर जसत जगउन लयउ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,दले देश सकेछन बेचे खान सुरू गरेछन दले लखेटनु पछ ।,1,दल दश सकछन बच खन सर गरछन दल लखटन पछ,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,नेपाल ससकृती ध्वस्त पार्ने योजना !,1,नपल ससकत धवसत परन यजन,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [16]:
idf_df = db[[ "text_idf", "Target"]].copy()

In [17]:
idf_df.head()

Unnamed: 0,text_idf,Target
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1


In [18]:
idf_df["text_idf"] = idf_df["text_idf"].apply(lambda x: np.array(x).flatten())

# Creating model for hate speech analysis

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
x= idf_df[[ "text_idf"]]
y= idf_df["Target"]

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print('Training set shape :',x_train.shape,y_train.shape)
print('Testing set shape :',x_test.shape,y_test.shape)

Training set shape : (2111, 1) (2111,)
Testing set shape : (528, 1) (528,)


In [22]:
import tensorflow as tf
from keras import Sequential
from keras.layers import Embedding ,LSTM ,Dense,Dropout
from keras.preprocessing.sequence import pad_sequences

In [27]:
max_sequence_length = x_train['text_idf'].apply(len).max()
# Flatten the lists on the Dataframe columnsabs
x_train_text_padded = pad_sequences(np.array(x_train['text_idf'].tolist()), maxlen=max_sequence_length)
x_test_text_padded = pad_sequences(np.array(x_test['text_idf'].tolist()), maxlen=max_sequence_length)


In [31]:
#Define the LSTM model
model=Sequential()

model.add(Dense(128,activation='relu',input_shape=(x_train_text_padded.shape[1],)))
model.add(Dropout(0.2)) # Add dropout for regularization
model.add(tf.keras.layers.Reshape((1,128))) # Reshape to (batch_size,time_steps)

# NOw add the LSTM layer
model.add(LSTM(64)) #We can adjust the number of LSTM units
model.add(Dropout(0.2))
model.add(Dense(1,activation='sigmoid')) #Output layer for binary classification

# Compile the model
model.compile(optimizer='adam',loss="binary_crossentropy",metrics=["accuracy"])

# Train the model 
history=model.fit(x_train_text_padded,y_train,epochs=10,batch_size=32,validation_split=0.2)

#Evaluate the model
loss,accuracy=model.evaluate(x_test_text_padded,y_test)
print(f"Test loss : {loss}")
print(f"Test Accuracy : {accuracy}")

Epoch 1/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.3681 - loss: 0.6434 - val_accuracy: 0.3735 - val_loss: 0.2269
Epoch 2/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.3857 - loss: -0.1355 - val_accuracy: 0.3735 - val_loss: -1.6452
Epoch 3/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.3744 - loss: -1.8728 - val_accuracy: 0.3735 - val_loss: -4.0561
Epoch 4/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.3603 - loss: -4.3921 - val_accuracy: 0.3735 - val_loss: -5.9243
Epoch 5/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.3826 - loss: -5.4195 - val_accuracy: 0.3735 - val_loss: -7.1613
Epoch 6/10
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.3752 - loss: -6.8081 - val_accuracy: 0.3735 - val_loss: -8.1528
Epoch 7/10
[1m53/53[0m