In [1]:
%pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-2.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m29.0 MB/s[0m  [33m0:00:00[0mm0:00:01[0m
[?25hDownloading numpy-2.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m74.2 MB/s[0m  [33m0:00:00[0m6m0:00:01[0m

In [2]:
from pathlib import Path
import pandas as pd

In [None]:
#Import BOW vectorizer for preprocessing

from sklearn.feature_extraction.text import CountVectorizer

In [None]:
#Import TF-IDF vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
from sklearn.metrics import classification_report

In [3]:
# path of notebook
path = Path.cwd()

data_path = path.parents[1] / "resources" / "data" / "raid" / "train_none.csv"

raw_df = pd.read_csv(data_path)

In [4]:
print(len(raw_df))

467985


We've loaded our data so now lets inspect it 

Lets have a look at the variables (column names) and print some examples of text from the data

In [5]:
print(raw_df.columns)

Index(['id', 'adv_source_id', 'source_id', 'model', 'decoding',
       'repetition_penalty', 'attack', 'domain', 'title', 'prompt',
       'generation'],
      dtype='object')


Here is an overview of what each column contains

1. `id`: A uuid4 that uniquely identifies the original content of the generation
2. `adv_source_id`: uuid4 of the source of the generation if adversarial
3. `source_id`: uuid4 of the human-written source text
4. `model`: The model that generated the text
   - Choices: `['chatgpt', 'gpt4', 'gpt3', 'gpt2', 'llama-chat', 'mistral', 'mistral-chat', 'mpt', 'mpt-chat', 'cohere', 'cohere-chat']`
5. `decoding`: The decoding strategy used 
    - Choices: `['greedy', 'sampling']`
6. `repetition_penalty`: Whether or not we use a repetition penalty of 1.2 when generating
    - Choices: `['yes', 'no']`
7. `attack`: The adversarial attack used
    - Choices: `['homoglyph', 'number', 'article_deletion', 'insert_paragraphs', 'perplexity_misspelling', 'upper_lower', 'whitespace', 'zero_width_space', 'synonym', 'paraphrase', 'alternative_spelling']`
8. `domain`: The genre from where the prompt/text was taken
    - Choices: `['abstracts', 'books', 'code', 'czech', 'german', 'news', 'poetry', 'recipes', 'reddit', 'reviews', 'wiki']`
9. `title`: The title of the article used in the prompt
10. `prompt`: The prompt used to generate the text
11. `generation`: The text of the generation

- from https://huggingface.co/datasets/liamdugan/raid#data-fields 

In [7]:
#Find unique values in model column

# Get the unique values of 'B' column
unique_values = raw_df['model'].unique()

# Print the unique values
print("\nUnique values in 'models' column:")
print(unique_values)


Unique values in 'models' column:
['human' 'llama-chat' 'mpt' 'mpt-chat' 'gpt2' 'mistral' 'mistral-chat'
 'gpt3' 'cohere' 'chatgpt' 'gpt4' 'cohere-chat']


In [13]:
#Before we print some example texts lets take a look at the samplesize for each model and for humans

raw_df.groupby("model").size()

model
chatgpt         26742
cohere          26742
cohere-chat     26742
gpt2            53484
gpt3            26742
gpt4            26742
human           13371
llama-chat      53484
mistral         53484
mistral-chat    53484
mpt             53484
mpt-chat        53484
dtype: int64

In [10]:
#Print raw texts

print(f"### Generation 77 by {raw_df["model"][77]} ###")
print(raw_df["generation"][77])

print(f"### Generation 6000 by {raw_df["model"][6000]} ###")
print(raw_df["generation"][6000])

print(f"### Generation 10000 by {raw_df["model"][10000]} ###")
print(raw_df["generation"][10000])

### Generation 77 by human ###
Computer vision tasks such as semantic segmentation perform very well in good
weather conditions, but if the weather turns bad, they have problems to achieve
this performance in these conditions. One possibility to obtain more robust and
reliable results in adverse weather conditions is to use video-segmentation
approaches instead of commonly used single-image segmentation methods.
Video-segmentation approaches capture temporal information of the previous
video-frames in addition to current image information, and hence, they are more
robust against disturbances, especially if they occur in only a few frames of
the video-sequence. However, video-segmentation approaches, which are often
based on recurrent neural networks, cannot be applied in real-time applications
anymore, since their recurrent structures in the network are computational
expensive. For instance, the inference time of the LSTM-ICNet, in which
recurrent units are placed at proper positions i

We want to do a binary classification of human versus the LLM cohere which we also call our classes in ML. 

Let’s subset the data to only include these, removing all other models.

In [11]:
#.isin is used to subset the data

df = raw_df[raw_df["model"].isin(["human", "cohere"])]

In [12]:
#lets take a look at the distribution of samples or texts for humans and the model cohere

df.groupby("model").size()

model
cohere    26742
human     13371
dtype: int64

Hmm, we seem to have double the amount of cohere generations as human texts. Do you know why that might be a problem?

My answer: I think the classifier uses the probability of a certain class, as well as the words within a document
to classify an unseen document. This means that if the training data is skewed so that one type of document is more common,
then the classifier will overestimate the probability of finding that typw of document, if the unseen data does not have the same distribution

Mina's answer: Classification models generally assume that all classes in a dataset have roughly the same number of examples. Unbalanced classes can cause the classifier to perform poorly on the under-represented class, also called the minority class (see Taskiran et al. (2025)). 

We want our classifier to predict human or cohere, but it doesn’t really understand these labels.

We’ll add the label column is_human and assign 1 if the row model is human and 0 if it anything else (such as cohere):

In [15]:
df["is_human"] = df["model"].apply(lambda x: 1 if x == "human" else 0)

#lets check if it worked

df.groupby("is_human").size()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["is_human"] = df["model"].apply(lambda x: 1 if x == "human" else 0)


is_human
0    26742
1    13371
dtype: int64

Create Training Splits!

When training a ML model, we need to consider three splits of the data:

Training data: to train models on

Validation data: to compare models on

Test data: to test model accuracy on

Common percentage splits for train, val, and test are:

60%, 20%, 20%

70%, 15%, 15%

Since raid has a seperate test set, we only need to split our data into train and val

to do this we will install scikit-learn
scikit-learn is a widely used Python library for machine learning, covering supervised and unsupervised learning, model evaluation, and preprocessing. We will use it here for preprocessing.

Firstly, let’s install scikit-learn:

In [16]:
%pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (62 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m31.1 MB/s[0m  [33m0:00:00[0mm0:00:01[0m
[?25hDownloading joblib-1.5.2-py3-none-any.whl (308 kB)
Downloading scipy-1.16.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (35.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [17]:
from sklearn.model_selection import train_test_split

Instead of showing you the code this time, I’ll ask you to check documentation and this guide to:

Use the function train_test_split() to split your df into train_df and val_df. The size of our validation set should be 20 % and remember to:

Set a random_state to ensure reproducibility!

Set stratify to ensure proper class shuffling!

Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

Guide: https://medium.com/@whyamit404/understanding-train-test-split-in-pandas-eb1116576c66

Here is how we do it and how the function works

test_size=0.2 → This means 20% of your data will be used for testing, while 80% will be used for training.
If you have 1000 rows, test_size=0.2 means 200 rows go to the test set, and 800 rows go to training.

random_state=42 → Sets a fixed seed for randomness.
Ever tried shuffling a deck of cards and getting a different order every time? That’s randomness. By setting random_state, we ensure the data split remains consistent every time you run the code.

stratify= df['is_human'] → IM NOT SURE WHAT THIS DOES SO ASK

train_test_split(df, test_size=0.2, random_state=42, stratify= df['is_human'])
This function randomly shuffles the data and splits it into train (80%) and test (20%) sets.

In [18]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify= df['is_human'])

# Vectorize Text with BOW (bag of words) and TF-IDF (term frequency inverse document frequency)

Instantiate a CountVectorizer object. We set lowercase = True as we did not do this preprocessing ourselves :). We also limit our amount of features to only the top 500 features to reduce the computational costs:

In [None]:
vectorizer = CountVectorizer(lowercase=True, max_features=500)

We select our text column "generation" from our training split. We fit the vectorizer and transform our text with .fit_transform:

In [21]:
X_train_bow = vectorizer.fit_transform(train_df["generation"])

For our validation split, we’ll use transform method (we only fit our vectorizer to training data!)!

In [22]:
X_val_bow = vectorizer.transform(val_df["generation"])

In [None]:
#Let’s print the first few features and the first vector

print("Features:", vectorizer.get_feature_names_out()[:30])

print("\nTraining set:")
print(X_train_bow.toarray()[0])

 #As we can see, there are many features that are aren’t present (the many 0’s).

Features: ['000' '10' '12' '15' '20' '30' 'able' 'about' 'action' 'add' 'added'
 'after' 'again' 'against' 'age' 'all' 'almost' 'along' 'also' 'always'
 'am' 'american' 'an' 'and' 'another' 'any' 'anything' 'approach' 'are'
 'around']

Training set:
[0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0 1 1
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 3 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 2 0 0 0 0 3 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 5 1 0 0 1 0 2 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 2 2 1 0 2 0 0 0 0 0 0 0 0 0 0 0
 0 1 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


Lets do the same process but with a different vectorizer the TF-IDF vectorize

In [None]:
vectorizer = TfidfVectorizer(lowercase=True, max_features=500)

In [26]:
X_train_TFIDF = TFIDFvectorizer.fit_transform(train_df["generation"])

X_val_TFIDF = TFIDFvectorizer.transform(val_df["generation"])

Define a function called vectorize that:

Takes X_train and X_val as text columns (e.g., X_train = train_df["generation"]).

Accepts vec_type, where you can choose either "bow" or "tf-idf" and it will select CountVectorizer and TfidfVectorizer respectively.

You can do this with an if and elifstatement

Includesmax_featuresparameter to control the vectorizer’smax_features (like we have done above).

Returns X_train_vectorized and X_val_vectorized

When you have your function, run it once to create X_train_bowand X_val_bow and a second time to create X_train_tfidf, X_val_tfidf

In [27]:
from typing import Literal 
# using Literal is not strictly necessary, 
# but is a way to define the options that you can use for the function!

def vectorize(X_train: pd.Series, X_val: pd.Series, vec_type:Literal["bow", "tf-idf"], max_features:int=500):
    """
    Function to vectorize train and val data! 
    """
    if vec_type == "bow":
        vectorizer = CountVectorizer(lowercase=True, max_features=max_features)
    elif vec_type== "tf-idf":
        vectorizer = TfidfVectorizer(lowercase=True, max_features=max_features)
    else: 
        # this is good code practice, but if your function has no 'else' statement, this is also fine!
        raise ValueError(f"Invalid vec_type: {vec_type}. Must be either 'bow' or 'tf-idf")
    
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_val_vectorized = vectorizer.transform(X_val)

    return X_train_vectorized, X_val_vectorized


# APPLY TO GET TF-IDF
X_train_tfidf, X_val_tfidf = vectorize(
                        X_train = train_df["generation"], 
                          X_val = val_df["generation"],
                          vec_type="tf-idf",
                          max_features=500
                          )

# 2.3 Classification
We are ready to do our binary classification (human versus cohere) using a simple logistic regression. Let’s import it from scikit-learn

(We could also technically use a Naive Bayes classifier, using GaussianNB for BOW and MultinomialNB for TF-IDF.)

Like with the Countervectorizer, we can instantiate LogisticRegression as our classifier clf_1:

In [28]:
from sklearn.linear_model import LogisticRegression

In [29]:
# clf is common naming convention
clf_1 = LogisticRegression(
    random_state=42,
    solver="liblinear",   # tested all solvers, and this one was fastest ... (should also be appropriate for our data)
    max_iter=1000,        # allow more iterations
    C=1.0,                # adjust if needed (smaller values = stronger regularization)
)

In [30]:
#Let’s extract our numerical labels as Y:

y_train = train_df["is_human"]

#Let’s fit our classifier on our vectorized text X_train_bow and our y_train:

clf_1.fit(X_train_bow, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


# 2.4 Evaluation
Now that we have fit our classifier, we should evaluate it’s performance on X_val_bow and y_val. Start by importing classification_report from scikit-learn (at the top of your notebook)

In [33]:
#Let’s extract predicted labels y:

y_pred_bow = clf_1.predict(X_val_bow)

#Now, we can compare to our actual labels y val:

y_val_bow = val_df["is_human"]
report = classification_report(y_val_bow, y_pred_bow) 
print(report)

              precision    recall  f1-score   support

           0       0.80      0.90      0.85      5349
           1       0.74      0.55      0.63      2674

    accuracy                           0.78      8023
   macro avg       0.77      0.73      0.74      8023
weighted avg       0.78      0.78      0.78      8023



Lets do the same thing with TF-IDF

In [34]:
# clf is common naming convention
clf_2 = LogisticRegression(
    random_state=42,
    solver="liblinear",   # tested all solvers, and this one was fastest ... (should also be appropriate for our data)
    max_iter=1000,        # allow more iterations
    C=1.0,                # adjust if needed (smaller values = stronger regularization)
)

In [35]:
#Let’s extract our numerical labels as Y:

y_train = train_df["is_human"]

#Let’s fit our classifier on our vectorized text X_train_bow and our y_train:

clf_2.fit(X_train_tfidf, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [38]:
#Let’s extract predicted labels y:

y_pred_tfidf = clf_2.predict(X_val_tfidf)

#Now, we can compare to our actual labels y val:

y_val_tfidf = val_df["is_human"]
report_tfidf = classification_report(y_val_tfidf, y_pred_tfidf)
print(report_tfidf, report)

              precision    recall  f1-score   support

           0       0.80      0.91      0.85      5349
           1       0.74      0.55      0.63      2674

    accuracy                           0.79      8023
   macro avg       0.77      0.73      0.74      8023
weighted avg       0.78      0.79      0.78      8023
               precision    recall  f1-score   support

           0       0.80      0.90      0.85      5349
           1       0.74      0.55      0.63      2674

    accuracy                           0.78      8023
   macro avg       0.77      0.73      0.74      8023
weighted avg       0.78      0.78      0.78      8023

