In [1]:
import pandas as pd

# Load the dataset (modify the filename if needed)
df = pd.read_csv("train.csv")

# Display the first few rows
df.head()


Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [2]:
# Keeping only relevant columns: 'class' and 'tweet'
df = df[['class', 'tweet']]

# Rename columns for clarity
df.rename(columns={'class': 'label', 'tweet': 'text'}, inplace=True)

df.head()


Unnamed: 0,label,text
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [3]:
import re

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df['clean_text'] = df['text'].apply(clean_text)

df[['text', 'clean_text']].head()


Unnamed: 0,text,clean_text
0,!!! RT @mayasolovely: As a woman you shouldn't...,rt as a woman you shouldnt complain about clea...
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,rt boy dats coldtyga dwn bad for cuffin dat ho...
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,rt dawg rt you ever fuck a bitch and she start...
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,rt she look like a tranny
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,rt the shit you hear about me might be true or...


In [4]:
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def fast_text_processing(text):
    text = text.lower().translate(str.maketrans('', '', string.punctuation))  # Lowercase & remove punctuation
    words = text.split()  # Tokenize using split
    words = [word for word in words if word not in ENGLISH_STOP_WORDS]  # Remove stopwords
    return ' '.join(words)

df['processed_text'] = df['clean_text'].apply(fast_text_processing)

df[['clean_text', 'processed_text']].head()


Unnamed: 0,clean_text,processed_text
0,rt as a woman you shouldnt complain about clea...,rt woman shouldnt complain cleaning house amp ...
1,rt boy dats coldtyga dwn bad for cuffin dat ho...,rt boy dats coldtyga dwn bad cuffin dat hoe st...
2,rt dawg rt you ever fuck a bitch and she start...,rt dawg rt fuck bitch start confused shit
3,rt she look like a tranny,rt look like tranny
4,rt the shit you hear about me might be true or...,rt shit hear true faker bitch told ya


In [5]:
import nltk
from nltk.stem import WordNetLemmatizer

# Download WordNet if not available
nltk.download('wordnet')

# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_text_nltk(text):
    words = text.split()  # Simple tokenization
    words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatization
    return ' '.join(words)

df['lemmatized_text'] = df['processed_text'].apply(lemmatize_text_nltk)

df[['processed_text', 'lemmatized_text']].head()


[nltk_data] Downloading package wordnet to C:\Users\Admin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,processed_text,lemmatized_text
0,rt woman shouldnt complain cleaning house amp ...,rt woman shouldnt complain cleaning house amp ...
1,rt boy dats coldtyga dwn bad cuffin dat hoe st...,rt boy dat coldtyga dwn bad cuffin dat hoe st ...
2,rt dawg rt fuck bitch start confused shit,rt dawg rt fuck bitch start confused shit
3,rt look like tranny,rt look like tranny
4,rt shit hear true faker bitch told ya,rt shit hear true faker bitch told ya


In [6]:
df.to_csv("preprocessed_hate_speech.csv", index=False)
print("Preprocessed data saved successfully!")

Preprocessed data saved successfully!


In [7]:
import pandas as pd

# Check unique values in 'label' column
print(df['label'].unique())

# Convert integer labels into separate binary columns
df['hate_speech'] = (df['label'] == 0).astype(int)
df['offensive'] = (df['label'] == 1).astype(int)
df['neutral'] = (df['label'] == 2).astype(int)

# Create the final multi-label target dataframe
y_df = df[['hate_speech', 'offensive', 'neutral']]

# Display first few rows
print(y_df.head(10))


[2 1 0]
   hate_speech  offensive  neutral
0            0          0        1
1            0          1        0
2            0          1        0
3            0          1        0
4            0          1        0
5            0          1        0
6            0          1        0
7            0          1        0
8            0          1        0
9            0          1        0


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Reduce feature size to avoid memory errors
tfidf = TfidfVectorizer(max_features=1000)  # Reduced from 2000 → 1000

# Convert processed text to TF-IDF representation (Keep Sparse Format)
X_tfidf = tfidf.fit_transform(df['processed_text'])

# Convert to DataFrame (Sparse Format)
X_df = pd.DataFrame.sparse.from_spmatrix(X_tfidf, columns=tfidf.get_feature_names_out())

# Display shape of the TF-IDF matrix
print("TF-IDF Matrix Shape:", X_df.shape)


TF-IDF Matrix Shape: (24783, 1000)


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np

# Convert label column to multi-hot encoding (One-Hot Encode Labels)
y_df = pd.get_dummies(df['label'])

# Convert to float32 to reduce memory usage
X_df = X_df.astype(np.float32)
y_df = y_df.astype(np.float32)

# Split Data into Train & Test Sets
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)

# Initialize Multi-Label Classifier (One-vs-Rest with Logistic Regression)
model = OneVsRestClassifier(LogisticRegression())

# Train the model
model.fit(X_train, y_train)

print("✅ Model training complete!")


✅ Model training complete!


In [10]:
# Make predictions on test data
y_pred = model.predict(X_test)

# Convert predictions to a DataFrame
y_pred_df = pd.DataFrame(y_pred, columns=y_test.columns)

# Display first 5 predictions
y_pred_df.head()


Unnamed: 0,0,1,2
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0


In [11]:
from sklearn.metrics import classification_report

# Generate classification report
print(classification_report(y_test, y_pred_df))


              precision    recall  f1-score   support

           0       0.50      0.14      0.22       290
           1       0.92      0.95      0.93      3832
           2       0.85      0.74      0.79       835

   micro avg       0.90      0.86      0.88      4957
   macro avg       0.76      0.61      0.65      4957
weighted avg       0.88      0.86      0.87      4957
 samples avg       0.86      0.86      0.86      4957



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
def predict_text(text):
    text_tfidf = tfidf.transform([text])
    text_df = pd.DataFrame(text_tfidf.toarray(), columns=tfidf.get_feature_names_out())

    # Fill missing columns
    missing_cols = set(X_df.columns) - set(text_df.columns)
    for col in missing_cols:
        text_df[col] = 0
    text_df = text_df[X_df.columns]

    prediction = model.predict(text_df)
    label_index = list(y_test.columns).index(0)  # Index for hate_speech

    if prediction[0][label_index] == 1:
        return "🔥 Hate Speech Detected"
    else:
        return "✅ Not Hate Speech"

# Take input from user (like Scanner in Java)
user_input = input("Enter a sentence to analyze: ")
result = predict_text(user_input)
print(result)


✅ Not Hate Speech


In [14]:
print(df['label'].value_counts())

label
1    19190
2     4163
0     1430
Name: count, dtype: int64
