In [1]:
from google.colab import drive
# Attach Google Drive to the Colab runtime; the corpus files read in later
# cells live under this mount point.
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle

In [4]:
# Load the corpus.
# Both files are read as UTF-8 with undecodable bytes dropped
# (errors='ignore'); every line is stripped of surrounding whitespace.
# Line i of the text file is paired with line i of the label file later on.
with open("/content/drive/MyDrive/HateSpeechNLP/hate speech data.txt", "r", encoding='utf-8', errors='ignore') as file:
    text_lines = list(map(str.strip, file))

with open("/content/drive/MyDrive/HateSpeechNLP/hate speech label.txt", "r", encoding='utf-8', errors='ignore') as file:
    labels_lines = list(map(str.strip, file))


In [5]:
# Build one two-column frame pairing each corpus line with its label.
# The two files are matched purely by line position, so the lists must have
# the same length: pd.concat(axis=1) would otherwise pad the shorter column
# with NaN and the problem would only surface later, during vectorising or
# training. Fail early with a clear message instead.
if len(text_lines) != len(labels_lines):
    raise ValueError(
        f"Corpus/label line counts differ: {len(text_lines)} texts vs "
        f"{len(labels_lines)} labels"
    )

# Create DataFrames from the lists
df_text = pd.DataFrame({'Text': text_lines})
df_labels = pd.DataFrame({'Label': labels_lines})

# Combine the two DataFrames side by side (columns: Text, Label)
df_combined = pd.concat([df_text, df_labels], axis=1)

In [7]:
# Preview the first rows of the combined frame as plain text.
preview_rows = 3000
print(df_combined.head(preview_rows))

                                                   Text Label
0     አስቀድሜ ጥያቄዬ በጨዋነት በውስጥ መስመር እንዲደርስዎ አድርጌ ፍትህን ለ...  Free
1     እነዚህን ወሳኝ ጉዳዮችን የሚያስፈፅም አካል እንዲቋቋምና ክትትል እንዲደረ...  Free
2     የአማራ ህዝብ በአእምሮ ክንፉ ያልበረረበት ጥበብና ፍልስፍና ያልከፈተው የ...  Free
3     ከአማራ ህዝብ የሀገሪቱ ዘርፈ ብዙ እውቀት መንጭቶ የሞላበትከሙላቱም በመል...  Free
4     ዛሬ በየትኛውም መለኪያ ይሁን መመዘኛ ኢትዮጵያዊነት የሚንፀባረቀው በአማራ...  Hate
...                                                 ...   ...
2995         በአሉ የሁሉም ኢትዮጵያዊ ስላልሆነ በኦሮምኛው ቢለፋደድ ምን አገባን  Hate
2996             ተባረክ አብቹ ፈር ቀዳጅ ስለሆንህ መጋረጃው መቀደድ ስለጀመረ  Free
2997  እስከ አሁን አንተ ብቻ ነው በ   መፅሀፍ ያልቻልከው  አንተም ታሪክ እን...  Free
2998  ህገወጥት ጠቅላይ ሚንስትር ፅቤት የተፈቀደ ሆኖ ህዝብን እንዴት ህግ አክብ...  Hate
2999  ደነዙ ጠቅላይ ሚንስትር ፅቤት ህገመንግስት ሳይሻሻል በህግ የተወሰነዉን የ...  Hate

[3000 rows x 2 columns]


In [8]:
# Train/test split: hold out 20% of the rows for evaluation.
# The fixed random_state keeps the partition identical across reruns.
train_data, test_data, train_labels, test_labels = train_test_split(
    df_combined['Text'],
    df_combined['Label'],
    test_size=0.2,
    random_state=42,
)

In [9]:
# Feature extraction: TF-IDF with default settings.
# The vocabulary and IDF weights are learned from the training texts only;
# both splits are then projected with that same fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer().fit(train_data)
X_train, X_test = (
    tfidf_vectorizer.transform(split) for split in (train_data, test_data)
)

In [10]:
# Model Training
# Linear SVM on the TF-IDF features. class_weight='balanced' reweights the
# classes inversely to their frequency in train_labels.
# random_state is pinned (same seed as the train/test split) because the
# solver's data shuffling is otherwise unseeded, which makes the fitted model
# and every metric below vary between reruns.
model = LinearSVC(class_weight='balanced', random_state=42)
model.fit(X_train, train_labels)

In [12]:
# Model Evaluation
# Predict labels for the held-out split and report the overall accuracy
# (fraction of test rows whose predicted label matches the true label).
predictions = model.predict(X_test)
accuracy = accuracy_score(y_true=test_labels, y_pred=predictions)

print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.66


In [13]:

# Per-class precision / recall / F1 on the held-out split, printed under a
# heading.
print(
    "\nClassification Report:",
    classification_report(test_labels, predictions),
    sep="\n",
)


Classification Report:
              precision    recall  f1-score   support

        Free       0.66      0.66      0.66      2987
        Hate       0.66      0.67      0.66      3013

    accuracy                           0.66      6000
   macro avg       0.66      0.66      0.66      6000
weighted avg       0.66      0.66      0.66      6000



In [14]:

# Confusion matrix for the held-out split (rows: true label, columns:
# predicted label), printed under a heading.
print(
    "\nConfusion Matrix:",
    confusion_matrix(test_labels, predictions),
    sep="\n",
)


Confusion Matrix:
[[1959 1028]
 [1001 2012]]


In [20]:
# Single-sentence prediction demo.
# NOTE(review): this sentence also appears in the corpus preview printed
# earlier (row 2996), so it is not an unseen example.
new_text = "ተባረክ አብቹ ፈር ቀዳጅ ስለሆንህ መጋረጃው መቀደድ ስለጀመረ"

# The text must go through the same fitted vectorizer used for training so
# the feature columns line up with what the model expects.
new_text_vectorized = tfidf_vectorizer.transform([new_text])

prediction = model.predict(new_text_vectorized)
print(prediction)

['Free']


In [21]:
# Persist the trained classifier to the current working directory.
# The context manager closes the file handle deterministically; passing a bare
# open(...) to pickle.dump leaves the handle open until garbage collection.
# NOTE(review): only the classifier is saved. Inference also needs the fitted
# tfidf_vectorizer (see the prediction cell) -- consider persisting it too.
with open('hate_speech_detection_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)