In [None]:
import requests
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_fixed

@retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
def fetch_page(url, timeout=30):
    """Download ``url`` and return the response body as text.

    The call is retried by tenacity: up to 5 attempts, waiting 2 seconds
    between attempts, whenever an exception escapes this function.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up on a single
            attempt. Without a timeout ``requests`` can block indefinitely on
            a stalled connection, which would also prevent the retry logic
            from ever kicking in.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        tenacity.RetryError: If all attempts fail (network error, timeout,
            or a 4xx/5xx status reported by ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    # Turn HTTP error statuses into exceptions so that they trigger a retry.
    response.raise_for_status()
    return response.text

def extract_hrefs_from_page(html):
    """Collect one link per ``<article>`` element found in a listing page.

    Only ``<article>`` tags that carry an ``id`` attribute are considered.
    For each of them the first ``<a>`` that actually has an ``href`` is used;
    articles without such a link are skipped instead of raising ``KeyError``.

    Args:
        html: Markup of the listing page.

    Returns:
        List of ``href`` values in document order (may contain duplicates).
    """
    soup = BeautifulSoup(html, 'html.parser')
    hrefs = []
    for article in soup.find_all('article', id=True):
        # href=True restricts the search to anchors that have the attribute,
        # so the subscript below cannot fail.
        link = article.find('a', href=True)
        if link is not None:
            hrefs.append(link['href'])
    return hrefs

def _nested_in_list_item(element, root):
    """Return True when ``element`` has an ``<li>`` ancestor located below ``root``."""
    for ancestor in element.parents:
        if ancestor is root:
            return False
        if ancestor.name == 'li':
            return True
    return False


def extract_blog_post_data(html):
    """Extract title, publication date and plain-text body from a post page.

    Args:
        html: Markup of a single blog post page.

    Returns:
        dict with three string values:
            "title":   text of the first ``<h1 class="entry-title">``, or "".
            "date":    the part of the ``datetime`` attribute before ``'T'``
                       taken from the first ``<time>`` tag that has both the
                       ``entry-date`` and ``published`` classes, or "".
            "content": one line per ``<p>`` / ``<li>`` inside
                       ``<div class="entry-content">``; list items are
                       prefixed with "- ". Empty string if nothing is found.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Extract title
    title_tag = soup.find('h1', class_='entry-title')
    title = title_tag.get_text(strip=True) if title_tag else ""

    # Extract date. A CSS selector matches the two classes regardless of their
    # order or of extra classes (e.g. class="entry-date published updated");
    # find(class_='entry-date published') would only match that exact string.
    time_tag = soup.select_one('time.entry-date.published')
    post_date = ""
    if time_tag is not None and time_tag.has_attr('datetime'):
        post_date = time_tag['datetime'].split('T')[0]

    # Extract content
    content_div = soup.find('div', class_='entry-content')
    content_lines = []

    if content_div:
        for element in content_div.descendants:
            tag_name = getattr(element, 'name', None)
            if tag_name not in ('p', 'li'):
                continue
            # The text of an <li> already includes everything nested in it
            # (paragraphs, sub-lists); emitting those again would duplicate it.
            if _nested_in_list_item(element, content_div):
                continue
            text = element.get_text(separator=" ", strip=True)
            if text:
                content_lines.append(f"- {text}" if tag_name == 'li' else text)

    return {
        "title": title,
        "date": post_date,
        "content": '\n'.join(content_lines),
    }

In [None]:
# Number of the last listing page to visit (pages are numbered from 1).
# Hard-coded; presumably the page count of the site when it was scraped,
# so adjust it if the site has grown.
LAST_PAGE = 191

all_hrefs = []

for page_num in range(1, LAST_PAGE + 1):
    url = f"https://nik.art/page/{page_num}"
    if page_num % 10 == 0:
        print(f'Current page: {page_num}')
    html = fetch_page(url)
    hrefs = extract_hrefs_from_page(html)
    all_hrefs.extend(hrefs)

# Collapse links that were collected more than once.
unique_hrefs = set(all_hrefs)

In [46]:
all_posts = []

# Iterate in sorted order: a set of strings has no stable iteration order
# between interpreter runs, which would make the order of posts sharing the
# same date (and therefore the saved file) differ from run to run.
for href in sorted(unique_hrefs):
    txt = fetch_page(href)
    post_data = extract_blog_post_data(txt)
    all_posts.append(post_data)

In [None]:
# Order posts chronologically, in place; "YYYY-MM-DD" strings sort correctly
# as plain text, and posts with an empty date end up first.
all_posts[:] = sorted(all_posts, key=lambda post: post['date'])

In [2]:
import json

In [None]:
# Persist the scraped posts as a single pretty-printed JSON array.
serialized_posts = json.dumps(all_posts, indent=4)
with open("all_posts.json", "w") as out_file:
    out_file.write(serialized_posts)

## Create JSONL training and validation files

In [None]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
from datetime import datetime

# Sequence length the model is loaded with (any value can be chosen;
# RoPE scaling is handled internally by the library).
max_seq_length = 4096
# None lets the dtype be auto-detected: float16 on Tesla T4 / V100,
# bfloat16 on Ampere and newer.
dtype = None
# Load 4-bit quantized weights to reduce memory usage; may be set to False.
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # only needed for gated models such as meta-llama/Llama-2-7b-hf
)

# End-of-sequence marker of the loaded tokenizer.
EOS_TOKEN = tokenizer.eos_token

raw = load_dataset("json", data_files="all_posts.json", split="train")

# Posts dated before SPLIT_DATE go to training, the rest to validation.
SPLIT_DATE = datetime(2025, 1, 1)
# Upper bound on the number of content tokens allowed per post.
MAX_LEN = 4000


def _post_datetime(value):
    """Coerce a post's ``date`` field to a naive ``datetime``, or ``None``.

    The posts are saved with "YYYY-MM-DD" strings (or "" when no date was
    found). Depending on how the JSON loader infers column types, the field
    may come back as such a string, as an already-parsed date/datetime, or as
    ``None``; comparing those directly against a ``datetime`` would either
    raise ``TypeError`` or only work by accident.

    Raises:
        ValueError: If a non-empty string does not start with a valid
            "YYYY-MM-DD" date.
    """
    if value is None or value == "":
        return None
    if isinstance(value, datetime):
        # Drop any timezone so the comparison with the naive SPLIT_DATE is valid.
        return value.replace(tzinfo=None)
    if hasattr(value, "year"):
        # Plain date object without a time component.
        return datetime(value.year, value.month, value.day)
    return datetime.strptime(str(value)[:10], "%Y-%m-%d")


def _is_before_split(ex):
    """True for dated posts published before SPLIT_DATE."""
    published = _post_datetime(ex["date"])
    return published is not None and published < SPLIT_DATE


def _is_on_or_after_split(ex):
    """True for dated posts published on or after SPLIT_DATE."""
    published = _post_datetime(ex["date"])
    return published is not None and published >= SPLIT_DATE


def _fits_token_budget(ex):
    """True when the post content tokenizes to at most MAX_LEN tokens."""
    return len(tokenizer.tokenize(ex["content"])) <= MAX_LEN


# Posts without a date cannot be assigned to either side of the temporal
# split and are therefore left out of both sets.

# Training data: before 2025
train_data = raw.filter(_is_before_split).filter(_fits_token_budget)

# Validation data: 2025 and onward
val_data = raw.filter(_is_on_or_after_split).filter(_fits_token_budget)

def join_title_body(ex):
    """Format one post as a single training string.

    Args:
        ex: Mapping with "title" and "content" entries.

    Returns:
        dict with one key, "text": a title section and a content section
        separated by a blank line, terminated by ``EOS_TOKEN``.
    """
    title_section = f"### Title: {ex['title']}"
    content_section = f"### Content: \n{ex['content']}"
    return {"text": title_section + "\n\n" + content_section + EOS_TOKEN}

# Training split: replace the original columns with the formatted "text"
# column, then write the result as JSON Lines.
train_columns = train_data.column_names
train_dataset = train_data.map(join_title_body, remove_columns=train_columns)
train_dataset.to_json("training_data_before_2025.jsonl", orient="records", lines=True)

# Validation split: same formatting and output layout.
val_columns = val_data.column_names
val_dataset = val_data.map(join_title_body, remove_columns=val_columns)
val_dataset.to_json("val_data_2025_onward.jsonl", orient="records", lines=True)