# Quickstart

In [1]:
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from textdefendr.data import load_dataset
from textdefendr.encoder import TextEncoder

In [2]:
# Prefer a CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
backend = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(backend)
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Load the dataset and keep a fixed 1000-row random subset (presumably to
# keep the encoding steps below fast); random_state pins which rows are drawn.
# NOTE(review): the saved output under this cell lists ordered indices
# (0-4, then 28908-28911), which does not look like a shuffled 1000-row
# sample -- re-run the notebook from a fresh kernel to refresh it.
df = load_dataset().sample(n=1000, random_state=42)
df

Unnamed: 0,text,perturbed,attack_name
0,"Magnifique épopée, une belle histoire, touchan...",False,clean
1,Je n'ai pas aimé mais pourtant je lui mets 2 é...,False,clean
2,Un dessin animé qui brille par sa féerie et se...,False,clean
3,"Si c'est là le renouveau du cinéma français, c...",False,clean
4,Et pourtant on s’en Doutait !Second volet très...,False,clean
...,...,...,...
28908,"Disons-le tout net, Quelques heures la printem...",True,textfooler
28909,"A la sortie de ce film, un seul feeling m'a en...",True,textfooler
28910,"""Quelque chose ne vas pas chez Esther"". Effect...",True,textfooler
28911,Un gros carton des années 70's . Quand Annie G...,True,textfooler


In [5]:
# Features are the raw texts; the target is the boolean "perturbed" flag
# (per the table preview: False for clean rows, True for attacked ones).
X, y = df["text"], df["perturbed"]

In [6]:
# Hold out 20% of the sampled rows for evaluation; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [7]:
# Build the text encoder with all three feature groups switched on and run it
# on the device selected earlier. Flag meanings below are inferred from the
# encoder's log messages -- confirm against the TextEncoder documentation.
encoder = TextEncoder(
    enable_tp=True,  # presumably the "text properties" stage
    enable_lm_perplexity=True,  # presumably the language-model perplexity stage
    enable_lm_proba=True,  # presumably the "proba and rank" stage
    device=device,
)

In [8]:
# Fit the encoder on the training texts only and convert them into the
# feature matrix the classifier is trained on.
X_train_encoded = encoder.fit_transform(X_train)

  0%|          | 0/3 [00:00<?, ?it/s]

Encoding text properties with sentence-transformers/bert-base-nli-mean-tokens...
Encoding perplexity with gpt2...
Encoding proba and rank with roberta-base...


In [9]:
# Train a logistic-regression detector on the encoded training texts.
# `fit` returns the estimator itself, so construction and fitting can be
# chained; the trailing expression keeps the estimator as the cell output.
clf = LogisticRegression(random_state=42).fit(X_train_encoded, y_train)
clf

In [10]:
# Encode the held-out texts with the already-fitted encoder (transform, not
# fit_transform), then display the classifier's mean accuracy on them.
X_test_encoded = encoder.transform(X_test)
test_accuracy = clf.score(X_test_encoded, y_test)
test_accuracy

  0%|          | 0/3 [00:00<?, ?it/s]

Encoding text properties with sentence-transformers/bert-base-nli-mean-tokens...
Encoding perplexity with gpt2...
Encoding proba and rank with roberta-base...


0.745