# Classification of Inappropriate/Offensive Text

## Import the essentials

In [None]:
!pip install zeyrek
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import zeyrek
import re
import contextlib
import io
from tqdm import tqdm

# Text vectorizing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from radient import text_vectorizer

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier

# Bernoulli for bag-of-words, Multinomial for Tf-idf and Gaussian for sentence transformers
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

# MLP Classifier for sentence transformers
from sklearn.neural_network import MLPClassifier

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image

@contextlib.contextmanager
def suppress_output():
    """Temporarily capture everything written to stdout and stderr.

    Yields:
        A ``(stdout, stderr)`` tuple of ``io.StringIO`` buffers that receive
        the text written to each stream while the context is active.
    """
    out_buffer = io.StringIO()
    err_buffer = io.StringIO()
    with contextlib.ExitStack() as redirects:
        # Both redirections are undone, in reverse order, when the stack exits
        redirects.enter_context(contextlib.redirect_stdout(out_buffer))
        redirects.enter_context(contextlib.redirect_stderr(err_buffer))
        yield (out_buffer, err_buffer)

# Fetch the NLTK resources used below: the stop-word lists and the Punkt
# tokenizer models behind word_tokenize. Recent NLTK releases load the
# 'punkt_tab' tables instead of the pickled 'punkt' models, so both are
# downloaded to keep tokenization working across NLTK versions.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load the dataset
data = pd.read_csv('dataset.csv')
data.head()

## Data Preprocessing Part

In [None]:
# Print the column names, non-null counts and dtypes of the loaded dataset
data.info()

***Data Cleaning***

In [None]:

analyzer = zeyrek.MorphAnalyzer()

# Cache for the Turkish stop-word set so it is built once instead of once per
# cleaned entry.
_STOP_WORD_CACHE = {}


def _turkish_stop_words():
    """Return NLTK's Turkish stop words as a frozenset, built on first use."""
    if 'turkish' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['turkish'] = frozenset(stopwords.words('turkish'))
    return _STOP_WORD_CACHE['turkish']


# Function to normalise one entry: strip URLs and non-letters, drop stop words,
# and lemmatize the remaining words
def clean_text(text):
    """Clean a single text entry and return it as space-separated lemmas.

    Steps, in order: remove URLs, remove every character that is not an
    ASCII/Turkish letter or whitespace, tokenize, drop Turkish stop words
    (the comparison is case-sensitive), and replace each remaining word with
    the lemma of its first morphological analysis.

    Args:
        text: The raw entry as a string.

    Returns:
        The cleaned entry: lemmas joined by single spaces. Words for which
        the analyzer returns nothing are kept unchanged.
    """
    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove special characters, punctuation, and numbers
    text = re.sub(r'[^a-zA-ZçğıöşüÇĞİÖŞÜ\s]', '', text)

    # Tokenize the text and drop stop words
    stop_words = _turkish_stop_words()
    words = [word for word in word_tokenize(text) if word not in stop_words]

    # Lemmatize words
    lemmatized_words = []
    for word in words:
        # The analyzer is noisy, so its console output is captured and discarded
        with suppress_output():
            analyses = analyzer.analyze(word)
        # Guard both levels so an empty result falls back to the raw word
        # instead of raising IndexError.
        # NOTE(review): confirm what lemma zeyrek reports for words it cannot
        # analyse; a placeholder lemma would be written into the text here.
        if analyses and analyses[0]:
            lemmatized_words.append(analyses[0][0].lemma)
        else:
            lemmatized_words.append(word)

    return ' '.join(lemmatized_words)

# Apply the cleaning function to the 'text' column.
# The column is replaced in a single assignment: writing through
# data['text'].iloc[i] is chained assignment, which may modify a temporary
# copy and leave `data` unchanged (always so under pandas copy-on-write).
data['text'] = [clean_text(entry) for entry in tqdm(data['text'], total=len(data))]


## Exploratory Data Analysis Results
The dataset includes 53005 rows and is about offensive text. We can clearly see that our data has 2 columns, *text* and *label*, neither of which contains null values. So we don't need to drop or fill any null values.

In [None]:
# Summary statistics of the dataset, transposed so each described column becomes a row
data.describe().T

### Most used words

In [None]:
def find_most_common(data, number = 50):
    """Return the most frequent words in the 'text' column of *data*.

    All entries are joined into one string and tokenized with NLTK. Tokens
    are then discarded when they are Turkish stop words, appear in the
    hand-picked filter list below, or are a single character long.

    Args:
        data: DataFrame with a 'text' column of strings.
        number: How many of the most frequent words to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    # Symbols and very common Turkish function words to ignore. The original
    # list was missing a comma between '!' and ':', which silently merged
    # them into the single entry '!:'.
    words_filter = {'@', '.', 'USER', ',', '#', 've', 'bir', 'bu', 'Bu', 'de', 'ne',
                    '!', ':', 'https', '"', '...', '``'}
    # A set gives constant-time membership tests; the distinct name avoids
    # shadowing the imported `stopwords` module.
    turkish_stop_words = set(nltk.corpus.stopwords.words('turkish'))

    # Join all the strings in the column into a single string
    text = ' '.join(data['text'].tolist())

    # Tokenize the text into words and count the frequency of each word
    freq_dist = FreqDist(nltk.word_tokenize(text))

    filtered_freq_dist = {
        word: count
        for word, count in freq_dist.items()
        if len(word) > 1
        and word not in turkish_stop_words
        and word not in words_filter
    }

    # Get the most frequent words
    return FreqDist(filtered_freq_dist).most_common(number)

In [None]:
# Separate the entries by label: 1 is treated as offensive, 0 as not offensive
data_offensive = data.loc[data['label'].eq(1)]
data_not_offensive = data.loc[data['label'].eq(0)]

# Most frequent words within each group
most_frequent_words_in_offensive = find_most_common(data_offensive)
most_frequent_words_not_in_offensive = find_most_common(data_not_offensive)

In [None]:
# Unpack the (word, frequency) pairs of the offensive entries into two parallel tuples
words, frequencies = zip(*most_frequent_words_in_offensive)

# Draw the bar chart of the most frequent words through the Axes interface
fig, ax = plt.subplots(figsize=(15,6))
ax.bar(words, frequencies)
ax.set_xlabel("Words")
ax.set_ylabel("Frequency")
ax.set_title("Most Frequent Words in the Offensive Sentences")
# Slant the word labels so they stay readable
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

In [None]:
# Unpack the (word, frequency) pairs of the non-offensive entries into two parallel tuples
words, frequencies = zip(*most_frequent_words_not_in_offensive)

# Draw the bar chart of the most frequent words through the Axes interface
fig, ax = plt.subplots(figsize=(15,6))
ax.bar(words, frequencies)
ax.set_xlabel("Words")
ax.set_ylabel("Frequency")
ax.set_title("Most Frequent Words in the Non-offensive Sentences")
# Slant the word labels so they stay readable
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

## Training and Prediction

In [None]:
# Bag-of-words vectorization of the text; 'bow' stands for bag-of-words
data = pd.read_csv('cleaned_dataset.csv', encoding='utf-8')
vectorizer_bow = CountVectorizer(encoding='utf-8')

# Keep only word presence/absence: counts are cast to bool to reduce size,
# then expanded to a dense array.
# NOTE(review): the dense array holds rows x vocabulary entries; confirm it fits in memory.
X_bow = (
    vectorizer_bow
    .fit_transform(data['text'].values.astype(str))
    .astype(bool)
    .toarray()
)

In [None]:
# Tf-idf vectorization of the same text column; 'idf' stands for the Tf-idf vectorizer
texts_idf = data['text'].values.astype(str)
vectorizer_idf = TfidfVectorizer(encoding='utf-8')
X_idf = vectorizer_idf.fit_transform(texts_idf)

In [None]:
# Embed the raw (uncleaned) sentences with the 'BGE-M3' sentence-transformers model.
# The embeddings are expected to be 1024-dimensional; 'st' stands for sentence transformers.
vectorizer_st = text_vectorizer(method="sbert", model_name_or_path="BAAI/bge-m3")
data_raw = pd.read_csv('dataset.csv', encoding='utf-8')

# One embedding per row, in dataset order
X_st = [
    vectorizer_st.vectorize(str(row))
    for row in tqdm(data_raw['text'], total=data_raw.shape[0])
]

In [None]:
# Split the resulting datasets for training and testing.
# All three representations are split in ONE call with a fixed seed, so every
# model is trained and evaluated on the same rows and the split is
# reproducible. Three separate unseeded calls would give each representation
# a different random test set, making their scores incomparable.
# NOTE(review): X_st is built from 'dataset.csv' while the labels come from
# 'cleaned_dataset.csv'; this assumes both files hold the same rows in the
# same order. TODO confirm.
labels = data['label'].astype(bool)

(X_train_bow, X_test_bow,
 X_train_idf, X_test_idf,
 X_train_st, X_test_st,
 y_train_shared, y_test_shared) = train_test_split(
    X_bow, X_idf, X_st, labels, test_size=0.2, random_state=42
)

# The targets are identical for every representation; keep the original names
y_train_bow = y_train_idf = y_train_st = y_train_shared
y_test_bow = y_test_idf = y_test_st = y_test_shared