<a href="https://colab.research.google.com/github/genaforvena/the_soft_delerizome_machine_a_thousand_guattaris/blob/master/the_soft_delerizome_machine_a_thousand_guattaris.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
# @title Install Necessary Libraries

# %%
!pip install transformers[torch] datasets gradio --upgrade



In [51]:
# @title Import Libraries

from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset
import requests
from bs4 import BeautifulSoup
import re
import string
import os
import gradio as gr

In [52]:
# @title Define Text Cleaning Function

# %%
from bs4 import BeautifulSoup

def clean_text(text):
  """Cleans text by removing HTML tags, everything except lowercase English words and punctuation."""
  soup = BeautifulSoup(text, "html.parser")  # Parse HTML
  text = soup.get_text()  # Extract text content
  text = text.lower()  # Convert to lowercase
  text = re.sub(r"[^a-z\s\.,!?;:'\"-]+", " ", text)  # Keep only English letters, spaces, and punctuation
  text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
  return text

In [53]:
# @title Define Web Scraping Function

# %%
def scrape_text_from_url(url):
  """Scrapes text from a given URL."""
  try:
    response = requests.get(url)
    if url.endswith(".txt"):  # Plain text files
      return clean_text(response.text)
    else:  # HTML files
      soup = BeautifulSoup(response.text, "html.parser")
      paragraphs = soup.find_all("p")
      text = " ".join(p.get_text() for p in paragraphs)
      return clean_text(text)
  except Exception as e:
    print(f"Error scraping {url}: {e}")
    return ""

In [54]:
# @title Define Text URLs and Output File

# %%
text_urls = [
    "https://archive.org/stream/anti-oedipus/anti-oedipus-fixed_djvu.txt", # Anti-Oedipus
    "https://archive.org/stream/AThousandPlateaus_20180115/A-Thousand-Plateaus_djvu.txt" # A Thousand Plateaus
]
output_file = "philosophy.txt"

In [55]:
# @title Scrape and Combine Texts

# %%
all_texts = []
for url in text_urls:
  print(f"Scraping {url}...")
  text = scrape_text_from_url(url)
  if text:
    all_texts.append(text)

with open(output_file, "w", encoding="utf-8") as outfile:
  for text in all_texts:
    outfile.write(text)
    outfile.write("\n\n")

print(f"All texts have been combined into {output_file}")

Scraping https://archive.org/stream/anti-oedipus/anti-oedipus-fixed_djvu.txt...
Scraping https://archive.org/stream/AThousandPlateaus_20180115/A-Thousand-Plateaus_djvu.txt...
All texts have been combined into philosophy.txt


In [56]:
# @title Load the Fine-tuned Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [57]:
# @title Create Dataset
import torch
from datasets import Dataset

def split_text_into_chunks(text, chunk_size=512, overlap=128):
  """Splits a long text into smaller chunks with optional overlap."""
  chunks = []
  for i in range(0, len(text) - chunk_size + 1, chunk_size - overlap):
    chunks.append(text[i : i + chunk_size])
  return chunks

# Load the text data
with open("philosophy.txt", "r", encoding="utf-8") as f:
  text = f.read()

# Split the text into chunks
chunks = split_text_into_chunks(text)

# Create a Dataset from the chunks
dataset = Dataset.from_dict({"text": chunks})

# Tokenize the dataset (similar to your existing code)
def tokenize_function(examples):
  return tokenizer(
      examples["text"],
      padding="max_length",
      truncation=True,
      max_length=512,  # Adjust as needed
      return_tensors="pt",
  )

tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
tokenized_datasets = tokenized_datasets.map(
    lambda examples: {"labels": examples["input_ids"]}, batched=True
)
tokenized_datasets.set_format("torch")

Map:   0%|          | 0/7927 [00:00<?, ? examples/s]

Map:   0%|          | 0/7927 [00:00<?, ? examples/s]

In [58]:
# @title Load and Fine-tune Model

# %%
model = GPT2LMHeadModel.from_pretrained("gpt2")

training_args = TrainingArguments(
    output_dir="./soft_delerizome_machine",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    learning_rate=2e-5,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

trainer.train()

Step,Training Loss
500,0.9907
1000,0.8851
1500,0.8738
2000,0.8564
2500,0.8258
3000,0.8265
3500,0.814
4000,0.8104
4500,0.7926
5000,0.7928


TrainOutput(global_step=9910, training_loss=0.8086238949618835, metrics={'train_runtime': 6969.6828, 'train_samples_per_second': 5.687, 'train_steps_per_second': 1.422, 'total_flos': 1.035630968832e+16, 'train_loss': 0.8086238949618835, 'epoch': 5.0})

In [59]:
#  @title Save Fine-tuned Model

# %%
model.save_pretrained("soft_delerizome_machine")
tokenizer.save_pretrained("soft_delerizome_machine")

('soft_delerizome_machine/tokenizer_config.json',
 'soft_delerizome_machine/special_tokens_map.json',
 'soft_delerizome_machine/vocab.json',
 'soft_delerizome_machine/merges.txt',
 'soft_delerizome_machine/added_tokens.json')

In [60]:
#  @title Load Fine-tuned Model

# %%
model = GPT2LMHeadModel.from_pretrained("soft_delerizome_machine")
tokenizer = GPT2Tokenizer.from_pretrained("soft_delerizome_machine")
tokenizer.pad_token = tokenizer.eos_token

In [61]:
# @title Define Text Generation Function

# %%
def generate_text(prompt):
  """Generates text using the fine-tuned model."""
  inputs = tokenizer(prompt, return_tensors="pt", padding=True)
  outputs = model.generate(
      inputs["input_ids"],
      attention_mask=inputs["attention_mask"],
      max_length=150,
      num_return_sequences=1,
      do_sample=True,
      temperature=0.8,
      top_k=50,
      pad_token_id=tokenizer.eos_token_id
  )
  return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [62]:
# @title Generate Text

# %%
prompt = "it is not a projection;"
print(generate_text(prompt))

it is not a projection; everything is a projection. this is the nature of the artifice. it is not an expression of a given power, but rather a mechanism of production, production of the organs, and production of the material, as such. the artifice, or production itself, is a projection of the production of the unconscious, by the repression of the production of this unconscious in conformity with the power of the despot, the repression of desire in conformity with the power of the state in conformity with the power of the social machine. the unconscious, instead, has the


In [63]:
#  @title Launch Gradio Interface

# %%
iface = gr.Interface(fn=generate_text, inputs="text", outputs="text")
iface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://737bc59337a7e750dd.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


