# **Import Libraries**

In [1]:
import pickle # To save model for later
import pandas as pd
import numpy as np

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer # Convert text to vectors
from sklearn.model_selection import train_test_split

In [3]:
import nltk
from nltk import word_tokenize # To optimize the text
from nltk.corpus import stopwords # To remove stopwords
import re # Regular Expression
import string # Punctuation

# **Upload the Dataset**

In [5]:
from google.colab import files
uploaded = files.upload()

Saving dataset.csv to dataset.csv


In [57]:
# Read the dataset
df = pd.read_csv('dataset.csv')

# **Data Exploration**

In [58]:
# Display first 5 rows
df.head()

Unnamed: 0,id,B,Body,Label
0,1702,1702,-Ayrıca redmi note 9 aldım almaz olsaydım.,1
1,2054,2054,-Cihazın kendisine geldiğimde ise,1
2,2059,2059,"-Genel olarak toparlamam gerekirse, cihazın er...",1
3,1701,1701,-Redmi note 8 sağlam telefon.,1
4,1823,1823,-Yanları. Telefon ağır ve kamera çıkıntısı büy...,1


In [59]:
# Get info for dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2391 entries, 0 to 2390
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      2391 non-null   int64 
 1   B       2391 non-null   int64 
 2   Body    2391 non-null   object
 3   Label   2391 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 74.8+ KB


In [60]:
# Sort the dataset
df.sort_values('Body', ascending=False)

Unnamed: 0,id,B,Body,Label
2390,1162,1162,şimdilik bir kapasitede kameralı olan dışında ...,0
2389,1270,1270,şekerimizliğindeki tozudurdum söz konusu değil...,0
2388,1265,1265,şaşırmıyoruz ilgisinden farklı ince küçük sank...,0
2387,1276,1276,şaşırdık\ngöründüğü değil büyük değil note yi ...,0
2386,2412,2412,şarj olayı maalesef kötü onun dışında harika b...,1
...,...,...,...,...
4,1823,1823,-Yanları. Telefon ağır ve kamera çıkıntısı büy...,1
3,1701,1701,-Redmi note 8 sağlam telefon.,1
2,2059,2059,"-Genel olarak toparlamam gerekirse, cihazın er...",1
1,2054,2054,-Cihazın kendisine geldiğimde ise,1


In [62]:
# Check Label's unique values
df['Label'].unique()

array([1, 0])

# **Data Cleaning**

In [61]:
# Drop duplicates
df= df.drop_duplicates(subset=['Body'], keep = False, inplace=True)

In [39]:
# Download ready to used stopwords and punctuation
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [63]:
# Function to optimize/clean the text
def optimization(text_series):

    # Ensure the input is a pandas Series
    if not isinstance(text_series, pd.Series):
        raise TypeError("Input must be a pandas Series")

    # Remove missing values
    cleaned_series = text_series.dropna()

    # Stop words list in Turkish
    stop_words = set(stopwords.words('turkish'))

    # Apply the cleaning steps to each string in the Series
    cleaned_series = cleaned_series.str.lower()
    cleaned_series = cleaned_series.str.translate(str.maketrans('', '', string.punctuation))
    cleaned_series = cleaned_series.apply(word_tokenize) # Apply word tokenization

    # Remove Stopwords
    cleaned_series = cleaned_series.apply(lambda tokens: [word for word in tokens if word not in stop_words])

    # Convert list of tokens back to a string
    cleaned_series = cleaned_series.apply(lambda tokens: ' '.join(tokens))

    return cleaned_series

In [64]:
# Apply the optimization function to the 'Body' column
df['Body_cleaned'] = optimization(df['Body'])

In [65]:
df.head()

Unnamed: 0,id,B,Body,Label,Body_cleaned
0,1702,1702,-Ayrıca redmi note 9 aldım almaz olsaydım.,1,ayrıca redmi note 9 aldım almaz olsaydım
1,2054,2054,-Cihazın kendisine geldiğimde ise,1,cihazın kendisine geldiğimde
2,2059,2059,"-Genel olarak toparlamam gerekirse, cihazın er...",1,genel olarak toparlamam gerekirse cihazın ergo...
3,1701,1701,-Redmi note 8 sağlam telefon.,1,redmi note 8 sağlam telefon
4,1823,1823,-Yanları. Telefon ağır ve kamera çıkıntısı büy...,1,yanları telefon ağır kamera çıkıntısı büyük iy...


In [68]:
# Drop cloumns "Body" and "B"
df = df.drop(columns="Body", axis=1)
df = df.drop(columns="B", axis=1 )

In [72]:
df.head()

Unnamed: 0,id,Label,Body_cleaned
0,1702,1,ayrıca redmi note 9 aldım almaz olsaydım
1,2054,1,cihazın kendisine geldiğimde
2,2059,1,genel olarak toparlamam gerekirse cihazın ergo...
3,1701,1,redmi note 8 sağlam telefon
4,1823,1,yanları telefon ağır kamera çıkıntısı büyük iy...


In [73]:
# Seperate label as machine/human
comm_machine = df[df['Label']==0]
comm_human = df[df['Label']==1]

# **Vectorization**

In [74]:
# Term Frequency - Inverse Document Frequency
# Convert text to numerical vector
tfIdf = TfidfVectorizer( binary = False,  ngram_range=(1,3))

In [75]:
# Convert machine/human text to vector seperately
comm_machine_vectors = tfIdf.fit_transform(comm_machine['Body_cleaned'].tolist())
comm_human_vectors = tfIdf.fit_transform(comm_human['Body_cleaned'].tolist())

In [77]:
# Determine X and y
X = df.loc[:,"Body_cleaned"]
y = df.loc[:,"Label"]

In [78]:
# Vectorize X
x_vec = tfIdf.fit_transform(X)

# **Machine Learning**

In [79]:
# Split train and test data
x_train_vec, x_test_vec, y_train, y_test = train_test_split(x_vec, y, test_size=0.2, random_state=0)

In [80]:
# Set model
logisticRegression = LogisticRegression()

In [82]:
logisticRegression.fit(x_train_vec, y_train)

In [88]:
# Save the models
pickle.dump(logisticRegression, open("Log_trained_model.pkl", "wb"))
print("Logistic Regression Model Saved")

Logistic Regression Model Saved


In [86]:
pickle.dump(tfIdf, open("Tfidf- Vectorizer", "wb"))
print("Tfidf Model Saved")

Tfidf Model Saved


# **Model Report**

| Metric        | Explanation                                                                     |
| ------------- | ------------------------------------------------------------------------------- |
| **Precision** | Of the samples predicted as this class, how many were actually correct?         |
| **Recall**    | Of the actual samples in this class, how many did the model correctly identify? |
| **F1-score**  | Harmonic mean of precision and recall – balances both.                          |
| **Support**   | Number of actual samples for this class.                                        |


In [87]:
# Check model's metrics
from sklearn.metrics import classification_report

y_pred = logisticRegression.predict(x_test_vec)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.94      0.78       243
           1       0.89      0.53      0.66       236

    accuracy                           0.74       479
   macro avg       0.78      0.73      0.72       479
weighted avg       0.78      0.74      0.73       479



🔍 Detailed Interpretation

🔹 Class 0 (AI texts):

    Precision: 0.67 → 67% of texts predicted as AI were correct.

    Recall: 0.94 → The model correctly identified 94% of all actual AI texts.

    F1-score: 0.78 → A good balance between precision and recall.

👉 This suggests the model is very good at catching AI texts, but sometimes misclassifies human texts as AI (hence lower precision).


🔹 Class 1 (Human texts):

    Precision: 0.89 → 89% of texts predicted as human were actually human.

    Recall: 0.53 → But it only managed to detect 53% of all real human texts.

👉 The model struggles to detect human-written texts and likely mislabels many of them as AI.