# Classification of Inappropriate/Offensive Text

## Import the essentials

In [None]:
!pip install zeyrek
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import zeyrek
import re
import contextlib
import io
from tqdm import tqdm

# Fetch the NLTK resources used below: the stop-word lists and the 'punkt'
# tokenizer models.
nltk.download('stopwords')
nltk.download('punkt')

# Load the dataset
data = pd.read_csv('dataset.csv')
# NOTE(review): this is not the last expression of the cell, so the notebook
# will not display the preview - confirm whether it is still wanted here.
data.head()

@contextlib.contextmanager
def suppress_output():
    """Silence stdout and stderr for the duration of the ``with`` block.

    Both streams are redirected into in-memory buffers. The buffers are
    yielded as a ``(stdout, stderr)`` tuple so the caller can inspect what
    was written while the redirection was active.
    """
    out_buffer = io.StringIO()
    err_buffer = io.StringIO()
    with contextlib.redirect_stdout(out_buffer):
        with contextlib.redirect_stderr(err_buffer):
            yield (out_buffer, err_buffer)




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Data Preprocessing Part

In [None]:
# Print column names, non-null counts and dtypes to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53005 entries, 0 to 53004
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    53005 non-null  object
 1   label   53005 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 828.3+ KB


***Data Cleaning***

In [None]:

analyzer = zeyrek.MorphAnalyzer()

# Built once at definition time: loading the stop-word list on every
# clean_text() call repeats the same work for each row of the dataset.
_TURKISH_STOP_WORDS = frozenset(stopwords.words('turkish'))


def clean_text(text):
    """Clean and lemmatize a single text entry.

    Steps, in order:
      1. strip URLs (anything starting with ``http``),
      2. drop every character that is not an ASCII/Turkish letter or
         whitespace (punctuation, digits, symbols),
      3. tokenize with NLTK,
      4. remove Turkish stop words,
      5. replace each remaining word by the lemma of its first zeyrek
         analysis, keeping the word unchanged when there is no analysis.

    Args:
        text: The raw entry as a string.

    Returns:
        The processed words joined by single spaces (may be an empty string).
    """
    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove special characters, punctuation, and numbers
    text = re.sub(r'[^a-zA-ZçğıöşüÇĞİÖŞÜ\s]', '', text)

    # Tokenize and drop stop words.
    # NOTE: matching is case-sensitive because the text is not lower-cased.
    words = [
        word for word in word_tokenize(text)
        if word not in _TURKISH_STOP_WORDS
    ]

    # Lemmatize words
    lemmatized_words = []
    for word in words:
        # The analyzer call is wrapped so anything it writes to
        # stdout/stderr does not flood the notebook output.
        with suppress_output():
            analyses = analyzer.analyze(word)
        # Guard both levels so an empty parse list falls back to the word
        # itself instead of raising IndexError.
        if analyses and analyses[0]:
            lemmatized_words.append(analyses[0][0].lemma)
        else:
            lemmatized_words.append(word)

    return ' '.join(lemmatized_words)

# Apply the cleaning function to the 'text' column.
# The column is assigned in one step: the chained form
# data['text'].iloc[i] = ... writes through an intermediate Series, which
# triggers SettingWithCopyWarning on every row and, with pandas
# copy-on-write enabled, leaves `data` unchanged.
data['text'] = [clean_text(text) for text in tqdm(data['text'])]


[1;30;43mGörüntülenen çıkış son 5000 satıra kısaltıldı.[0m
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'].iloc[i] = clean_text(data['text'].iloc[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'].iloc[i] = clean_text(data['text'].iloc[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'].iloc[i] = clean_text(data['text'].iloc[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stab

## Exploratory Data Analysis Results
The dataset contains 53005 rows of text labeled as offensive or not offensive. It has 2 columns, *text* and *label*, and neither contains null values, so we don't need to drop or fill any missing values.

In [None]:
# Summary statistics, transposed so that each described column is one row.
data.describe().T

### Most used words

In [None]:
def find_most_common(data, number=50):
    """Return the most frequent words in ``data['text']``.

    All entries are joined into one string and tokenized with NLTK. Turkish
    stop words, the symbols/fillers listed in ``words_filter`` and
    single-character tokens are ignored.

    Args:
        data: Object whose ``data['text']`` offers ``tolist()`` returning
            strings (here a pandas DataFrame).
        number: How many of the most frequent words to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    # Symbols and very common Turkish filler words to leave out.
    words_filter = {
        '@', '.', 'USER', ',', '#', 've', 'bir', 'bu', 'Bu', 'de', 'ne',
        '!', ':', 'https', '"', '...', '``',
    }
    # One set for O(1) membership tests; named so that it does not shadow
    # the `stopwords` module imported from nltk.corpus.
    excluded = words_filter | set(nltk.corpus.stopwords.words('turkish'))

    # Join all the strings in the list into a single string
    text = ' '.join(data['text'].tolist())

    # Tokenize the text into words and count the frequency of each word
    freq_dist = FreqDist(nltk.word_tokenize(text))

    filtered_freq_dist = {
        word: count
        for word, count in freq_dist.items()
        if len(word) > 1 and word not in excluded
    }

    # Get the most frequent words
    return FreqDist(filtered_freq_dist).most_common(number)

In [None]:
# Split the entries by label (1 -> offensive, 0 -> not offensive) and
# compute the most frequent words of each subset.
data_offensive = data.loc[data['label'].eq(1)]
data_not_offensive = data.loc[data['label'].eq(0)]

most_frequent_words_in_offensive = find_most_common(data_offensive)
most_frequent_words_not_in_offensive = find_most_common(data_not_offensive)

In [None]:
# Split the (word, count) pairs into two parallel sequences for plotting
words, frequencies = zip(*most_frequent_words_in_offensive)

# Draw the bar chart through the object-oriented Matplotlib interface
fig, ax = plt.subplots(figsize=(15, 6))
ax.bar(words, frequencies)
ax.set_xlabel("Words")
# Rotate the word labels so that they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_ylabel("Frequency")
ax.set_title("Most Frequent Words in the Offensive Sentences")
plt.show()

In [None]:
# Split the (word, count) pairs into two parallel sequences for plotting
words, frequencies = zip(*most_frequent_words_not_in_offensive)

# Draw the bar chart through the object-oriented Matplotlib interface
fig, ax = plt.subplots(figsize=(15, 6))
ax.bar(words, frequencies)
ax.set_xlabel("Words")
# Rotate the word labels so that they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_ylabel("Frequency")
ax.set_title("Most Frequent Words in the Non-offensive Sentences")
plt.show()

## Training and Prediction

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image

# NOTE(review): 'cleaned_dataset.csv' is presumably the cleaned `data` frame
# saved to disk - the export step is not visible here; confirm.
data2 = pd.read_csv('cleaned_dataset.csv')
vectorizer = CountVectorizer()

# An empty text field is read back from CSV as NaN, which CountVectorizer
# rejects; treat such rows as empty documents instead.
X = vectorizer.fit_transform(data2['text'].fillna(''))

feature_names = vectorizer.get_feature_names_out()

# Keep only presence/absence of each word and convert to a dense array.
X_array = X.astype(bool).toarray()

# Fixed seed so that the train/test partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_array, data2['label'].astype(bool), test_size=0.2, random_state=42
)






In [None]:
# Vocabulary size (number of features) and number of rows in the feature matrix.
print(len(feature_names), len(X_array))

In [None]:
# Train a small, shallow random forest and predict on the held-out split.
# random_state is fixed so that the fitted model and its predictions are
# reproducible between runs.
rf = RandomForestClassifier(
    n_estimators=10,
    verbose=2,
    n_jobs=2,
    min_samples_leaf=3,
    max_depth=6,
    random_state=42,
)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)