In [None]:
!pip install datasets stanza



In [None]:
from datasets import load_dataset

# Load the Turkish toxic-language corpus by its repository id
# (presumably resolved against the Hugging Face Hub by `datasets`).
dataset = load_dataset(
    path="Overfit-GM/turkish-toxic-language",
)

# Keep the loaded object as the cell's last expression so its summary
# (splits, features, row counts) is rendered as the cell output.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'target', 'source', 'is_toxic'],
        num_rows: 77800
    })
})

In [2]:
# ignore unnecessary warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# numpy and pandas for process and analyze
import pandas as pd
import numpy as np

# stanza for NLP preprocessing
!pip install stanza
import stanza

# sklearn libraries for models and evaulation metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

# matpilotlib and seaborn libraries for visualize
import matplotlib.pyplot as plt
import seaborn as sns

# progress bar for data processing
from tqdm import tqdm

# Download Stanza's Turkish models.
stanza.download("tr", verbose=False)
print("Stanza Turkish model downloaded!")

# Build the pipeline exactly once, limited to the processors that the
# curse-word detection relies on: tokenization, multi-word-token expansion
# and lemmatization. Constructing a second, full default pipeline first would
# only load models that are thrown away as soon as `nlp` is reassigned.
# NOTE(review): use_gpu is left at Stanza's default here, matching the
# pipeline that was effectively in use; pass use_gpu=False to force CPU.
nlp = stanza.Pipeline(lang="tr", processors="tokenize,mwt,lemma")

# Printed only after the pipeline exists; the notebook uses Stanza, not NLTK.
print("Libraries and Stanza pipeline loaded!")

# Lemmas that preprocess_and_detect_curse_words treats as curse words.
curse_words = {
    "lan",
    "oç",
    "orospu",
    "siktir",
    "piç",
    "salak",
    "gerizekalı",
    "aptal",
    "mal",
    "göt",
    "götveren",
}

def preprocess_and_detect_curse_words(dataset):
    """Detect curse words in a text by matching lemmas from the Stanza pipeline.

    The input is processed with the module-level ``nlp`` pipeline, and every
    word whose lemma is a member of the module-level ``curse_words`` set is
    reported.

    Args:
        dataset: The text to analyse. The parameter name is kept for backward
            compatibility; the value is handed to ``nlp`` unchanged.

    Returns:
        A list of dicts in document order, one per detected word, with keys:
            "word": the word's surface text,
            "lemma": the matched lemma,
            "sentence": the words of the containing sentence joined by
                single spaces, for context.
        The list is empty when nothing matches, or when ``dataset`` is
        ``None`` or a blank string.
    """
    # Nothing to analyse: skip the pipeline call entirely.
    if dataset is None or (isinstance(dataset, str) and not dataset.strip()):
        return []

    # Filled with one record per curse word found.
    detected_curse_words = []

    # Process the text using Stanza.
    doc = nlp(dataset)

    for sentence in doc.sentences:
        # Built lazily, at most once per sentence, and shared by all of the
        # matches found in that sentence.
        sentence_text = None
        for word in sentence.words:
            lemma = word.lemma  # lemmatized form of the word
            if lemma not in curse_words:
                continue
            if sentence_text is None:
                # Join with spaces so the context stays readable; joining
                # with an empty string would glue all words together.
                sentence_text = " ".join(w.text for w in sentence.words)
            detected_curse_words.append({
                "word": word.text,
                "lemma": lemma,
                "sentence": sentence_text,
            })

    return detected_curse_words


Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.10.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.14.0 stanza-1.10.1
Stanza Turkish model downloaded!
Libraries and NLTK datasets loaded!
