# Logistic Regression for Hate Speech Detection

This notebook demonstrates preprocessing, training, and evaluation of a Logistic Regression model for hate speech detection.

In [1]:

# Import libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


## Step 1: Load and Preprocess Data

In [2]:

# Read the tweet dataset into a DataFrame (point dataset_path at your own copy
# of the CSV if it lives somewhere else)
dataset_path = 'Oversampled_Tweet_Dataset.csv'
data = pd.read_csv(dataset_path)

# Function to clean text data
def preprocess_text(text):
    """Normalize a raw tweet into lowercase, letters-and-whitespace-only text.

    Steps, in order: remove URLs (``http`` followed by any run of
    non-whitespace characters), remove every character that is not an ASCII
    letter or whitespace, lowercase the result, and strip leading/trailing
    whitespace. Interior whitespace is left as-is.

    Args:
        text: The raw tweet. Non-string values (e.g. ``None`` or a float
            ``NaN`` standing in for a missing value) are treated as empty
            text instead of raising ``TypeError`` inside ``re.sub``.

    Returns:
        str: The cleaned text, or ``''`` when ``text`` is not a string.
    """
    # Missing/non-text cells cannot be cleaned; map them to an empty document
    # so a single bad row does not abort the whole .apply() pass.
    if not isinstance(text, str):
        return ''
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove special characters and numbers
    return text.lower().strip()  # Lowercase, then trim surrounding whitespace

# Apply preprocessing
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Map labels to binary values ('no-hate' -> 0, 'hate' -> 1).
# Series.map turns every value that is not a key of the dict into NaN, so check
# that the mapping was total before overwriting the column; otherwise NaN
# labels would flow silently into the cells that build y and split the data.
label_map = {'no-hate': 0, 'hate': 1}
mapped_labels = data['label'].map(label_map)
unmapped_mask = mapped_labels.isna()
if unmapped_mask.any():
    unexpected_values = data.loc[unmapped_mask, 'label'].unique().tolist()
    raise ValueError(
        f"Unexpected label values (expected 'no-hate' or 'hate'): {unexpected_values}"
    )
data['label'] = mapped_labels


## Step 2: TF-IDF Vectorization

In [None]:

# Targets: the binary label column (0 = 'no-hate', 1 = 'hate')
y = data['label']

# Turn the cleaned tweets into a TF-IDF matrix, capping the vocabulary at
# 5000 terms.
# NOTE(review): X is fit on the full dataset, before the train/test split in
# Step 3, so test rows contribute to X's vocabulary and IDF weights.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(data['cleaned_text'])


## Step 3: Split Data into Training and Testing Sets

In [None]:

# Split the cleaned text (rather than a TF-IDF matrix fitted on every row) into
# training and testing sets, then fit TF-IDF on the training portion only.
# Fitting the vectorizer on the full dataset lets test-set documents shape the
# vocabulary and IDF weights, which leaks information into the evaluation.
# NOTE(review): the file name 'Oversampled_Tweet_Dataset.csv' suggests rows were
# duplicated before this split; if so, copies of the same tweet can land in both
# sets and inflate the scores -- confirm how the dataset was built.
text_train, text_test, y_train, y_test = train_test_split(
    data['cleaned_text'], y, test_size=0.2, random_state=42, stratify=y
)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Same cap as in Step 2
X_train = tfidf_vectorizer.fit_transform(text_train)  # Learn vocabulary/IDF from training text only
X_test = tfidf_vectorizer.transform(text_test)  # Project test text onto the training vocabulary


## Step 4: Train Logistic Regression Model

In [None]:

# Build the Logistic Regression classifier (up to 1000 solver iterations,
# seed fixed at 42) and fit it on the training split in one step; fit()
# returns the fitted estimator itself.
model_lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)


## Step 5: Evaluate the Model

In [None]:

# Display names for the two classes, in label order (0 -> 'no-hate', 1 -> 'hate')
class_names = ['no-hate', 'hate']

# Run the trained model on the held-out test features
y_pred = model_lr.predict(X_test)

# Overall accuracy, shown as a percentage with two decimals
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class classification report
print("\nClassification Report:")
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)

# Confusion-matrix counts for the test predictions
print("\nConfusion Matrix:")
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

# (Optional) Draw the confusion matrix as an annotated heatmap, using an
# explicit figure/axes pair instead of the implicit pyplot state
tick_labels = ['no-hate', 'hate']
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix for Logistic Regression')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()
