In [1]:
import joblib
import re
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from Score import score  
import pandas as pd

with open("vulgarity_model.pkl", "rb") as model_file:
    model = joblib.load(model_file)


with open("Slang_data/en.txt", "r") as cens_lang:
    cens_word = {line.strip().lower() for line in cens_lang}


vectorizer = joblib.load("tfidf_vectorizer.pkl")


def censor_text(text, threshold):
    
    
    
    text_list = re.findall(r'\b[A-Za-z0-9]+\b', text.lower())

    
    X_tfidf = vectorizer.transform([" ".join(text_list)])
    
    
    explicit_feature = [[1 if any(word in cens_word for word in text_list) else 0]]

    
    X_combined = hstack((X_tfidf, explicit_feature))

    
    prediction = model.predict(X_combined)[0]

    if prediction == 0:  
        return text  

    
    score_value = score(text, cens_word)

    if score_value >= threshold:
        return "[CENSORED]"  
    else:
        
        censored_text = " ".join("[CENSORED]" if word in cens_word else word for word in text_list)
        return censored_text





In [2]:
data_set = pd.read_csv('labeled_data.csv')
data_set

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


In [None]:
for index,key in enumerate(data_set['tweet']):
    data_set['test'][index]=censor_text(key,0.7)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  data_set['test'][index]=censor_text(key,0.7)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_set['test'][i

In [20]:
data_set

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,test
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,rt urkindofbrand dawg rt 80sbaby4life you ever...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,rt she look like a [CENSORED]
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,rt shenikaroberts the [CENSORED] you hear abou...
...,...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an...","you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...,young buck wanna eat dat nigguh like i aint [C...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies,youu got wild [CENSORED] tellin you lies


In [22]:
import re

# Initialize counters
correct_predictions = 0  # When hate speech is 1 and we detected it
false_negatives = 0      # When hate speech is 1 but we missed it

for index, label in enumerate(data_set['hate_speech']):
    if label == 1:  # Only check for actual hate speech texts
        text = data_set['test'][index]  # Access the text after censorship

        # Check if '[CENSORED]' appears in the text
        if re.search(r'\bCENSORED\b', text):
            correct_predictions += 1  # Our system successfully detected and censored
        else:
            false_negatives += 1  # Our system missed this hate speech

# Print results
print("Correctly Censored Hate Speech:", correct_predictions)
print("Missed Hate Speech (False Negatives):", false_negatives)

# Optional: Calculate detection accuracy
total_hate_speech = correct_predictions + false_negatives
if total_hate_speech > 0:
    accuracy = (correct_predictions / total_hate_speech) * 100
    print(f"Model Accuracy on Hate Speech: {accuracy:.2f}%")
else:
    print("No hate speech samples found in dataset.")

Correctly Censored Hate Speech: 2432
Missed Hate Speech (False Negatives): 987
Model Accuracy on Hate Speech: 71.13%
