In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import Dataset
import torch
from huggingface_hub import login, HfApi
import os

In [2]:

# Authenticate with your Hugging Face account.
# The access token is read from the HF_TOKEN environment variable so it is
# never stored in the notebook. When the variable is unset, None is passed and
# login() falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

In [11]:
# Step 1: Choose the checkpoint, then load its tokenizer and weights.
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
# device_map="auto" is intended to place the model on a GPU when one is available.
model = T5ForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",
)

# Step 2: Define a function to generate synthetic data
def generate_synthetic_data(prompt, num_samples=10, max_length=512):
    """Generate (prompt, response, title) records with the module-level model.

    For every sample the model first answers ``prompt`` (the abstract) and is
    then asked to title its own answer. Uses the notebook-level ``tokenizer``
    and ``model``.

    Args:
        prompt: Instruction text sent to the model for every sample.
        num_samples: Number of records to attempt.
        max_length: Token limit used both for truncating inputs and as the
            generation length cap for the abstract.

    Returns:
        A list of dicts with the keys ``"prompt"``, ``"response"`` and
        ``"title"``. Samples that raise are reported on stdout and skipped, so
        the list may hold fewer than ``num_samples`` entries (possibly none).
    """
    synthetic_data = []
    for sample_idx in range(num_samples):
        try:
            # Generate the abstract (response).
            inputs = tokenizer(prompt, return_tensors="pt", max_length=max_length, truncation=True).to(model.device)
            # do_sample=True is required for temperature/top_p to take effect;
            # without it decoding is deterministic and every sample produced
            # from the same prompt is identical.
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                do_sample=True,
                temperature=1.2,
                top_p=0.9,
            )
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Generate a title for the abstract.
            title_prompt = f"Generate a concise title for the following abstract: {response}"
            title_inputs = tokenizer(title_prompt, return_tensors="pt", max_length=max_length, truncation=True).to(model.device)
            title_outputs = model.generate(
                **title_inputs,
                max_length=50,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
            )
            title = tokenizer.decode(title_outputs[0], skip_special_tokens=True)

            # Append the data with prompt, response, and title.
            synthetic_data.append({
                "prompt": prompt,
                "response": response,
                "title": title
            })
        except Exception as e:
            # Best effort: report the failure and keep going with the next sample.
            print(f"Error generating sample {sample_idx}: {e}")
    return synthetic_data

# Step 3: Generate synthetic data (e.g., research paper abstracts)
prompt = "Generate an arXiv abstract of an NLP research paper. Return just the abstract, no titles."
synthetic_data = generate_synthetic_data(prompt, num_samples=10)

# generate_synthetic_data() reports per-sample errors and skips them, so the
# list can come back empty. Stop here instead of uploading an empty dataset.
if not synthetic_data:
    raise RuntimeError("No synthetic samples were generated; nothing to push to the Hub.")

# Step 4: Convert the synthetic data to a Hugging Face Dataset
dataset = Dataset.from_list(synthetic_data)

# Step 5: Select the "torch" output format for the in-memory dataset.
# Note: this sets the format used when rows are read back; it does not attach
# descriptive metadata (a dataset card) to the dataset.
dataset = dataset.with_format("torch")

# Step 6: Push the dataset to the Hugging Face Hub
repo_name = "emredeveloper/synthetic-arxiv-abstracts-v1"  # Use the correct repository name
dataset.push_to_hub(repo_name, private=False)  # Set private=True if you want to keep it private

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/emredeveloper/synthetic-arxiv-abstracts-v1/commit/963581443d535083892ee87f10d1be3e6b218118', commit_message='Upload dataset', commit_description='', oid='963581443d535083892ee87f10d1be3e6b218118', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/emredeveloper/synthetic-arxiv-abstracts-v1', endpoint='https://huggingface.co', repo_type='dataset', repo_id='emredeveloper/synthetic-arxiv-abstracts-v1'), pr_revision=None, pr_num=None)

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
import torch
from huggingface_hub import login

# Authenticate and configure.
# SECURITY: never hardcode an access token in a notebook. The token is read
# from the HF_TOKEN environment variable; when it is unset, None is passed and
# login() falls back to its interactive prompt.
# NOTE(review): the token that used to be written on this line was committed
# with the notebook and should be revoked.
import os

login(token=os.environ.get("HF_TOKEN"))

# Checkpoint to load: tokenizer first, then the half-precision weights.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    pad_token="<|endoftext|>",  # explicit padding token, reused as pad_token_id during generation
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

def generate_synthetic_data(prompt, num_samples=5):  # sample count kept small for speed
    """Generate (prompt, response, title) records with the causal LM.

    Uses the notebook-level ``tokenizer`` and ``model``. A decoder-only
    ``generate()`` call returns the prompt tokens followed by the continuation,
    so only the tokens after the prompt are decoded. Otherwise the stored
    abstract and title would both start with an echo of their prompts.

    Args:
        prompt: Instruction text used for every abstract.
        num_samples: Number of records to attempt.

    Returns:
        A list of dicts with the keys ``"prompt"``, ``"response"`` and
        ``"title"``. Samples that raise are reported on stdout and skipped, so
        the list may hold fewer than ``num_samples`` entries (possibly none).
    """
    synthetic_data = []

    # The prompt is identical for every sample, so tokenize it once.
    base_inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=1024,  # shortened input length
        truncation=True
    ).to(model.device)
    prompt_token_count = base_inputs["input_ids"].shape[-1]

    # Generation parameters shared by the abstract and the title calls.
    gen_kwargs = {
        "temperature": 0.8,
        "top_p": 0.95,
        "do_sample": True,
        "pad_token_id": tokenizer.pad_token_id,
        "num_return_sequences": 1,
        "use_cache": True  # key/value cache enabled
    }

    for i in range(num_samples):
        try:
            # Abstract generation.
            abstract_outputs = model.generate(
                **base_inputs,
                max_new_tokens=256,  # reduced token budget
                min_new_tokens=150,
                **gen_kwargs
            )
            # Drop the echoed prompt: keep only the newly generated tokens.
            abstract = tokenizer.decode(
                abstract_outputs[0][prompt_token_count:],
                skip_special_tokens=True
            ).strip()

            # Title generation from the first 500 characters of the abstract.
            title_prompt = f"Title for: {abstract[:500]}"
            title_inputs = tokenizer(
                title_prompt,
                return_tensors="pt",
                max_length=256,
                truncation=True
            ).to(model.device)
            title_token_count = title_inputs["input_ids"].shape[-1]

            title_outputs = model.generate(
                **title_inputs,
                max_new_tokens=30,  # shorter titles
                min_new_tokens=10,
                **gen_kwargs
            )
            # Slicing off the prompt replaces the former split(":")[-1]
            # workaround, which also cut any title that itself contained a colon.
            title = tokenizer.decode(
                title_outputs[0][title_token_count:],
                skip_special_tokens=True
            ).strip()

            synthetic_data.append({
                "prompt": prompt,
                "response": abstract,
                "title": title
            })

        except Exception as e:
            # Best effort: report a truncated message and continue.
            print(f"Error generating sample {i}: {str(e)[:200]}")

    return synthetic_data

# Generate data with shorter prompt
prompt = """Generate NLP research abstract about ML text analysis. Include methods and results."""

synthetic_data = generate_synthetic_data(prompt, num_samples=5)  # fewer samples

# Save and push.
# generate_synthetic_data() skips samples that fail, so the list may be empty;
# in that case the upload is skipped instead of pushing an empty dataset.
if synthetic_data:
    dataset = Dataset.from_list(synthetic_data)
    dataset.push_to_hub("emredeveloper/synthetic-arxiv-abstracts-v1", private=False)
else:
    print("No samples were generated; skipping dataset upload.")

# Save locally (independent of whether any samples were produced)
save_path = "./deepseek-1.5B-local"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"Model saved locally at {save_path}")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/378 [00:00<?, ?B/s]

Model saved locally at ./deepseek-1.5B-local


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
import torch
import random
from huggingface_hub import login

# Hugging Face authentication (needed to push the dataset).
# SECURITY: never hardcode an access token in a notebook. The token is read
# from the HF_TOKEN environment variable; when it is unset, None is passed and
# login() falls back to its interactive prompt.
# NOTE(review): the token that used to be written on this line was committed
# with the notebook and should be revoked.
import os

login(token=os.environ.get("HF_TOKEN"))

# Directory holding the locally saved model
local_model_path = "./deepseek-1.5B-local"

# Load the tokenizer and the half-precision model from that directory
tokenizer = AutoTokenizer.from_pretrained(
    local_model_path,
    trust_remote_code=True,
    pad_token="<|endoftext|>",
)

model = AutoModelForCausalLM.from_pretrained(
    local_model_path,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

def generate_natural_queries(num_queries=5):
    """Build natural-looking user queries from fixed templates and topics.

    Each query is a template with a randomly chosen topic filled in. Up to
    ``len(templates)`` queries each use a different template. When more are
    requested, the extra ones reuse templates picked at random rather than
    raising.

    Args:
        num_queries: Number of queries to return.

    Returns:
        A list of ``num_queries`` query strings.

    Raises:
        ValueError: If ``num_queries`` is negative (raised by ``random.sample``).
    """
    templates = [
        "How to analyze {} using machine learning? Need methods and results.",
        "Comparing {} techniques in NLP: which works better?",
        "Best approach for {} analysis? Include practical examples.",
        "Can you explain {} methods in simple terms? Need academic references.",
        "What's the latest research on {}? Focus on real-world applications."
    ]

    topics = [
        "text classification",
        "topic modeling",
        "sentiment analysis",
        "fake news detection",
        "document clustering"
    ]

    if num_queries <= len(templates):
        # Same draw as before for requests that fit: distinct templates.
        chosen = random.sample(templates, num_queries)
    else:
        # random.sample cannot return more items than the population holds, so
        # use every template once (shuffled) and top up with repeats.
        extra = num_queries - len(templates)
        chosen = random.sample(templates, len(templates)) + random.choices(templates, k=extra)

    return [t.format(random.choice(topics)) for t in chosen]

def generate_abstract(prompt):
    """Generate a single abstract for ``prompt`` with the module-level model.

    A decoder-only ``generate()`` call returns the prompt tokens followed by
    the continuation. Only the continuation is decoded, so the returned text
    does not begin with an echo of the prompt.

    Args:
        prompt: The user query to answer.

    Returns:
        The generated text, with special tokens removed and surrounding
        whitespace stripped.
    """
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True
    ).to(model.device)
    prompt_token_count = inputs["input_ids"].shape[-1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=300,
        min_new_tokens=150,
        temperature=0.85,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id
    )

    # Keep only the newly generated tokens.
    new_tokens = outputs[0][prompt_token_count:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

def create_dataset(num_samples=5):
    """Build a Dataset of (query, abstract, title) rows.

    Queries come from ``generate_natural_queries``, abstracts from
    ``generate_abstract``, and titles from a second call to the module-level
    ``model``. A query whose generation raises is reported on stdout and
    skipped.

    Args:
        num_samples: Number of queries to generate rows for.

    Returns:
        A ``Dataset`` with the columns ``user_query``, ``generated_abstract``
        and ``suggested_title``. It has zero rows (but the same columns) when
        no query succeeded, instead of failing on an empty result list.
    """
    columns = {
        "user_query": [],
        "generated_abstract": [],
        "suggested_title": []
    }
    queries = generate_natural_queries(num_samples)

    for query in queries:
        try:
            abstract = generate_abstract(query)

            # Title generation from the first 100 characters of the abstract.
            title_inputs = tokenizer(
                f"Generate a research title for: {abstract[:100]}",
                return_tensors="pt"
            ).to(model.device)
            title_prompt_len = title_inputs["input_ids"].shape[-1]

            title_response = model.generate(
                **title_inputs,
                max_new_tokens=40,
                do_sample=True,  # required for temperature to have any effect
                temperature=0.7,
                pad_token_id=tokenizer.pad_token_id
            )

            # generate() returns prompt + continuation; decode only the
            # continuation so the title does not contain the echoed prompt.
            title = tokenizer.decode(
                title_response[0][title_prompt_len:],
                skip_special_tokens=True
            )
            title = title.split("Title:")[-1].strip()

            # Append to all columns only after every step succeeded, so the
            # columns always stay the same length.
            columns["user_query"].append(query)
            columns["generated_abstract"].append(abstract)
            columns["suggested_title"].append(title)

        except Exception as e:
            # Best effort: report a truncated message and continue.
            print(f"Hata oluştu: {str(e)[:200]}")

    return Dataset.from_dict(columns)

# Build the dataset, then publish it to the Hub
dataset = create_dataset(num_samples=5)
dataset.push_to_hub(repo_id="emredeveloper/synthetic-arxiv-abstracts-v1")

# Show the first record as a sample of the output
print("Örnek çıktı:")
print(dataset[0])