In [1]:
from flask import Flask, Response, request, render_template_string
from transformers import TextStreamer, StoppingCriteria
from unsloth import FastLanguageModel
import torch
import queue
import threading

# Flask application object; the route decorators further down register on it.
app = Flask(__name__)

# Load the model and its tokenizer from the local checkpoint directory
# (4-bit loading, bfloat16 dtype), then run Unsloth's inference setup on it.
_load_options = dict(
    model_name="C:\\Users\\Mehmet\\Desktop\\Denizhan2\\model_egitim\\checkpoint-1500",
    max_seq_length=2048,
    dtype=torch.bfloat16,
    load_in_4bit=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**_load_options)
FastLanguageModel.for_inference(model)

# Globals used for thread management: the stop flag of the most recently
# started generation, and the lock that guards replacing it.
stop_lock = threading.Lock()
current_stop_event = None

class StopGenerationCriteria(StoppingCriteria):
    """Stopping criterion controlled by an externally owned event flag.

    Args:
        stop_event: Object exposing ``is_set()`` (a ``threading.Event`` in
            this file); generation is asked to stop once it is set.
    """

    def __init__(self, stop_event):
        super().__init__()
        # The flag is only read here; whoever created it decides when to set it.
        self.stop_event = stop_event

    def __call__(self, input_ids, scores, **kwargs):
        """Return the current state of the stop flag; all arguments are ignored."""
        should_stop = self.stop_event.is_set()
        return should_stop

class WebStreamer(TextStreamer):
    """Streamer that forwards finalized text to a queue.

    Args:
        tokenizer: Passed through to ``TextStreamer``.
        queue: Object with a ``put`` method that receives each text chunk.
        **kwargs: Passed through to ``TextStreamer``.
    """

    def __init__(self, tokenizer, queue, **kwargs):
        super().__init__(tokenizer, **kwargs)
        self.queue = queue

    def on_finalized_text(self, text: str, stream_end: bool = False):
        """Enqueue ``text``; when ``stream_end`` is true, also enqueue ``None``."""
        sink = self.queue
        sink.put(text)
        if not stream_end:
            return
        # ``None`` is the end-of-stream marker the queue consumer looks for.
        sink.put(None)

@app.route('/')
def index() -> str:
    """Serve the single-page chat UI.

    The page is an inline HTML document. Its script opens an ``EventSource``
    on ``/generate?prompt=...`` for each submitted prompt, appends every
    received message to the newest AI bubble, and closes the connection when
    a message whose data is ``DONE`` arrives.
    """
    return render_template_string('''
<!DOCTYPE html>
<html lang="tr">
<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>ChatGPT - Chatbot</title>
  <style>
    body {
      margin: 0;
      font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
      background-color: #f7f7f8;
      display: flex;
      flex-direction: column;
      height: 100vh;
      color: #333;
    }
    .container {
      display: flex;
      flex-direction: column;
      flex-grow: 1;
      max-width: 800px;
      margin: 0 auto;
      padding: 20px;
    }
    .header {
      text-align: center;
      margin-bottom: 10px;
      font-size: 24px;
      font-weight: bold;
    }
    .chat-area {
      flex-grow: 1;
      overflow-y: auto;
      padding: 20px;
      background: #fff;
      border-radius: 8px;
      box-shadow: 0 0 10px rgba(0,0,0,0.05);
    }
    .message {
      display: block;
      padding: 10px 15px;
      margin: 10px 0;
      border-radius: 8px;
      max-width: 75%;
      line-height: 1.5;
      white-space: pre-wrap;
      word-wrap: break-word;
    }
    .message.user {
      background: #dcf8c6;
      align-self: flex-end;
    }
    .message.ai {
      background: #e1e1e1;
      align-self: flex-start;
    }
    .input-container {
      display: flex;
      margin-top: 10px;
      background: #fff;
      border-radius: 8px;
      box-shadow: 0 0 10px rgba(0,0,0,0.05);
      padding: 10px;
    }
    .input-container textarea {
      flex-grow: 1;
      border: none;
      resize: none;
      font-size: 16px;
      padding: 10px;
      outline: none;
    }
    .input-container button {
      background: #007bff;
      border: none;
      color: white;
      padding: 10px 20px;
      border-radius: 8px;
      margin-left: 10px;
      cursor: pointer;
      font-size: 16px;
    }
    .input-container button:hover {
      background: #0056b3;
    }
  </style>
</head>
<body>
  <div class="container">
    <div class="header">ChatGPT</div>
    <div id="chatArea" class="chat-area"></div>
    <div class="input-container">
      <textarea id="prompt" rows="2" placeholder="Sorunuzu buraya yazın..."></textarea>
      <button onclick="sendPrompt()">Gönder</button>
    </div>
  </div>

  <script>
    let currentEventSource = null;

    function sendPrompt() {
      const prompt = document.getElementById('prompt').value.trim();
      if (!prompt) return;

      const chatArea = document.getElementById('chatArea');

      // Kullanıcının mesajını ekle
      const userMessage = document.createElement('div');
      userMessage.classList.add('message', 'user');
      userMessage.textContent = prompt;
      chatArea.appendChild(userMessage);
      chatArea.scrollTop = chatArea.scrollHeight;

      // Giriş alanını temizle
      document.getElementById('prompt').value = '';

      // Önceki EventSource varsa kapat
      if (currentEventSource) {
        currentEventSource.close();
      }

      // Yapay zeka mesajı için bir konteyner oluştur
      const aiMessage = document.createElement('div');
      aiMessage.classList.add('message', 'ai');
      chatArea.appendChild(aiMessage);
      chatArea.scrollTop = chatArea.scrollHeight;

      // Yeni EventSource oluştur
      currentEventSource = new EventSource(`/generate?prompt=${encodeURIComponent(prompt)}`);
      currentEventSource.onmessage = function(e) {
        if (e.data === 'DONE') {
          currentEventSource.close();
          return;
        }
        aiMessage.textContent += e.data;
        chatArea.scrollTop = chatArea.scrollHeight;
      };
    }
  </script>
</body>
</html>
''')

@app.route('/generate')
def generate():
    """Stream a completion for the ``prompt`` query parameter as Server-Sent Events.

    Starting a new request sets the stop flag of the previously started
    generation, so at most the newest request keeps producing tokens.
    Generation runs in a background thread that feeds a queue; the response
    body drains that queue and ends with a ``DONE`` message.

    Returns:
        A ``text/event-stream`` response. Each text chunk is one SSE event;
        the final event carries the data ``DONE``.
    """
    global current_stop_event
    with stop_lock:
        # Cancel the generation started by the previous request, then publish
        # a fresh flag for this one. The local copy is taken under the lock so
        # it is guaranteed to be the flag created here.
        if current_stop_event:
            current_stop_event.set()
        current_stop_event = threading.Event()
        stop_event = current_stop_event

    prompt = request.args.get('prompt', '')
    response_queue = queue.Queue()

    streamer = WebStreamer(
        tokenizer=tokenizer,
        queue=response_queue,
        skip_prompt=True,
        skip_special_tokens=True
    )

    def generation_task():
        """Tokenize the prompt and run generation, feeding ``response_queue``."""
        try:
            inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
            model.generate(
                **inputs,
                streamer=streamer,
                max_new_tokens=1024,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                stopping_criteria=[StopGenerationCriteria(stop_event)],
                repetition_penalty=1.2,  # repetition penalty
                # Sampling must be enabled explicitly; temperature and top_p
                # are sampling parameters and are not applied otherwise.
                do_sample=True,
                temperature=0.7,         # randomness level
                top_p=0.9,               # nucleus sampling
            )
        finally:
            # Always terminate the stream, even if generation raised.
            response_queue.put(None)

    threading.Thread(target=generation_task).start()

    def event_stream():
        """Yield queued chunks as SSE events until the ``None`` sentinel."""
        try:
            while True:
                chunk = response_queue.get()
                if chunk is None:
                    yield "data: DONE\n\n"
                    break
                # Emit one "data:" field per line. A raw newline inside the
                # chunk would otherwise end the event early, and the line
                # break would never reach the browser.
                lines = chunk.replace("\r\n", "\n").replace("\r", "\n").split("\n")
                yield "".join(f"data: {line}\n" for line in lines) + "\n"
        finally:
            # Also runs when the generator is closed before the sentinel
            # arrives (e.g. the client went away): stop generating then.
            stop_event.set()

    return Response(event_stream(), mimetype="text/event-stream")

if __name__ == "__main__":
    # Start Flask's built-in server on all interfaces, port 5000, with
    # threaded request handling enabled.
    server_options = {"host": "0.0.0.0", "port": 5000, "threaded": True}
    app.run(**server_options)

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.15: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA RTX A5000. Max memory: 23.988 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.
  self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False)
Loading checkpoint shards: 100%|██████████| 12/12 [00:30<00:00,  2.54s/it]
Unsloth 2025.2.15 patched 48 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.33.183:5000
Press CTRL+C to quit
192.168.33.183 - - [10/Mar/2025 10:07:23] "GET / HTTP/1.1" 200 -
192.168.33.183 - - [10/Mar/2025 10:07:23] "GET /favicon.ico HTTP/1.1" 404 -
192.168.33.183 - - [10/Mar/2025 10:07:34] "GET /generate?prompt=Mehmet%20AKINOL%20Kimdir%20?%20hakkında%20bilgi%20ver. HTTP/1.1" 200 -


In [1]:
#gerçek çalışan
from flask import Flask, Response, request, render_template_string
from transformers import TextStreamer, StoppingCriteria, BitsAndBytesConfig
from unsloth import FastLanguageModel
from peft import PeftModel
import torch
import queue
import threading

# Flask application object; the route decorators further down register on it.
app = Flask(__name__)

# The same directory is used for the model load and for the PEFT adapter load.
CHECKPOINT_DIR = "C:\\Users\\Mehmet\\Desktop\\Denizhan2\\model_egitim\\checkpoint-1500"

# Quantization settings: 4-bit NF4 with double quantization, bfloat16 compute.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load the model and the tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=CHECKPOINT_DIR,
    max_seq_length=2048,
    dtype=torch.bfloat16,
    load_in_4bit=True,
    quantization_config=quantization_config,
    device_map="auto",
)

# Load the PEFT adapter on top of the loaded model, then prepare for inference.
model = PeftModel.from_pretrained(model, CHECKPOINT_DIR)
FastLanguageModel.for_inference(model)
model.eval()

# Thread management: the stop flag of the most recently started generation,
# and the lock that guards replacing it.
stop_lock = threading.Lock()
current_stop_event = None

class StopGenerationCriteria(StoppingCriteria):
    """Stopping criterion controlled by an externally owned event flag.

    Args:
        stop_event: Object exposing ``is_set()`` (a ``threading.Event`` in
            this file); generation is asked to stop once it is set.
    """

    def __init__(self, stop_event):
        super().__init__()
        # The flag is only read here; whoever created it decides when to set it.
        self.stop_event = stop_event

    def __call__(self, input_ids, scores, **kwargs):
        """Return the current state of the stop flag; all arguments are ignored."""
        should_stop = self.stop_event.is_set()
        return should_stop

class WebStreamer(TextStreamer):
    """Streamer that forwards finalized text to a queue.

    Args:
        tokenizer: Passed through to ``TextStreamer``.
        queue: Object with a ``put`` method that receives each text chunk.
        **kwargs: Passed through to ``TextStreamer``.
    """

    def __init__(self, tokenizer, queue, **kwargs):
        super().__init__(tokenizer, **kwargs)
        self.queue = queue

    def on_finalized_text(self, text: str, stream_end: bool = False):
        """Enqueue ``text``; when ``stream_end`` is true, also enqueue ``None``."""
        sink = self.queue
        sink.put(text)
        if not stream_end:
            return
        # ``None`` is the end-of-stream marker the queue consumer looks for.
        sink.put(None)

def format_alpaca_prompt(instruction):
    """Wrap ``instruction`` in the Turkish Alpaca-style prompt template.

    The result is a fixed preamble, a ``### Talimat:`` section holding the
    instruction, and an empty ``### Yanıt:`` section for the model to fill.
    """
    template = (
        "Aşağıda bir görevi açıklayan bir talimat bulunmaktadır. İsteği uygun şekilde tamamlayan bir yanıt yazın.\n"
        "\n"
        "### Talimat:\n"
        "{instruction}\n"
        "\n"
        "### Yanıt:\n"
    )
    return template.format(instruction=instruction)

@app.route('/')
def index() -> str:
    """Serve the single-page chat UI.

    The page is an inline HTML document. Its script opens an ``EventSource``
    on ``/generate?prompt=...`` for each submitted prompt, appends every
    received message to the newest AI bubble, and on a message whose data is
    ``DONE`` closes the connection and adds a completion note.
    """
    return render_template_string('''
<!DOCTYPE html>
<html lang="tr">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Alpaca Chat</title>
  <style>
    /* Stil tanımları aynı kalıyor */
    .container { max-width: 800px; margin: 0 auto; padding: 20px; height: 100vh; display: flex; flex-direction: column; }
    .chat-area { flex-grow: 1; overflow-y: auto; background: #fff; border-radius: 8px; padding: 20px; margin: 10px 0; }
    .message { max-width: 75%; padding: 10px; margin: 5px 0; border-radius: 8px; }
    .user { background: #dcf8c6; margin-left: auto; }
    .ai { background: #f0f0f0; }
    .input-container { display: flex; gap: 10px; }
    textarea { flex-grow: 1; padding: 10px; border-radius: 8px; border: 1px solid #ddd; }
    button { background: #007bff; color: white; border: none; padding: 10px 20px; border-radius: 8px; cursor: pointer; }
  </style>
</head>
<body>
  <div class="container">
    <h1>Alpaca Chat</h1>
    <div id="chatArea" class="chat-area"></div>
    <div class="input-container">
      <textarea id="prompt" rows="3" placeholder="Alpaca modeli için sorunuzu girin..."></textarea>
      <button onclick="sendPrompt()">Gönder</button>
    </div>
  </div>

  <script>
    let currentEventSource = null;

    function sendPrompt() {
      const prompt = document.getElementById('prompt').value.trim();
      if (!prompt) return;

      const chatArea = document.getElementById('chatArea');
      
      // Kullanıcı mesajını ekle
      const userDiv = document.createElement('div');
      userDiv.className = 'message user';
      userDiv.textContent = prompt;
      chatArea.appendChild(userDiv);

      // AI mesaj konteyneri
      const aiDiv = document.createElement('div');
      aiDiv.className = 'message ai';
      chatArea.appendChild(aiDiv);
      
      // Önceki bağlantıyı kapat
      if(currentEventSource) currentEventSource.close();

      // Yeni istek başlat
      currentEventSource = new EventSource(`/generate?prompt=${encodeURIComponent(prompt)}`);
      
      currentEventSource.onmessage = (e) => {
        if(e.data === 'DONE') {
          currentEventSource.close();
          aiDiv.innerHTML += '<div style="color: #666; font-size: 0.8em">▼ Cevap Tamamlandı</div>';
          return;
        }
        aiDiv.textContent += e.data;
        chatArea.scrollTop = chatArea.scrollHeight;
      };
      
      document.getElementById('prompt').value = '';
      chatArea.scrollTop = chatArea.scrollHeight;
    }
  </script>
</body>
</html>
''')

@app.route('/generate')
def generate():
    """Stream a completion for the ``prompt`` query parameter as Server-Sent Events.

    The prompt is wrapped in the Alpaca template and tokenized in the request
    thread. Generation then runs in a background thread that feeds a queue;
    the response body drains that queue and ends with a ``DONE`` message.
    Starting a new request sets the stop flag of the previously started
    generation.

    Returns:
        A ``text/event-stream`` response. Each text chunk is one SSE event;
        the final event carries the data ``DONE``.
    """
    global current_stop_event

    with stop_lock:
        # Cancel the generation started by the previous request, then publish
        # a fresh flag for this one.
        if current_stop_event:
            current_stop_event.set()
        current_stop_event = threading.Event()
        # Take the local copy while still holding the lock. Reading the global
        # after release could pick up a flag installed by a concurrent
        # request, leaving this generation impossible to cancel.
        stop_event = current_stop_event

    prompt = request.args.get('prompt', '')
    response_queue = queue.Queue()

    # Convert to the Alpaca format.
    formatted_prompt = format_alpaca_prompt(prompt)

    # Tokenizer settings.
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=2048,
    ).to("cuda")

    # Streamer settings.
    streamer = WebStreamer(
        tokenizer=tokenizer,
        queue=response_queue,
        skip_prompt=True,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    def generation_task():
        """Run sampling-based generation, feeding ``response_queue``."""
        try:
            model.generate(
                **inputs,
                streamer=streamer,
                max_new_tokens=2048,
                temperature=0.6,
                top_p=0.9,
                top_k=40,
                do_sample=True,
                repetition_penalty=1.15,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
                stopping_criteria=[StopGenerationCriteria(stop_event)],
            )
        finally:
            # Always terminate the stream, even if generation raised.
            response_queue.put(None)

    threading.Thread(target=generation_task).start()

    def stream():
        """Yield queued chunks as SSE events until the ``None`` sentinel."""
        try:
            while True:
                chunk = response_queue.get()
                if chunk is None:
                    yield "data: DONE\n\n"
                    break
                # Emit one "data:" field per line. A raw newline inside the
                # chunk would otherwise end the event early, and the line
                # break would never reach the browser.
                lines = chunk.replace("\r\n", "\n").replace("\r", "\n").split("\n")
                yield "".join(f"data: {line}\n" for line in lines) + "\n"
        finally:
            # Also runs when the generator is closed before the sentinel
            # arrives (e.g. the client went away): stop generating then.
            stop_event.set()

    return Response(stream(), mimetype="text/event-stream")

if __name__ == "__main__":
    # Start Flask's built-in server on all interfaces, port 5000, with
    # threaded request handling enabled.
    server_options = {"host": "0.0.0.0", "port": 5000, "threaded": True}
    app.run(**server_options)

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.15: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA RTX A5000. Max memory: 23.988 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.
  self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False)
Loading checkpoint shards: 100%|██████████| 12/12 [00:25<00:00,  2.13s/it]
Unsloth 2025.2.15 patched 48 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.33.183:5000
Press CTRL+C to quit
127.0.0.1 - - [08/Mar/2025 13:12:39] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [08/Mar/2025 13:12:39] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [08/Mar/2025 13:12:47] "GET /generate?prompt=Merhaba HTTP/1.1" 200 -
127.0.0.1 - - [08/Mar/2025 13:16:07] "GET /generate?prompt=Nasılsın? HTTP/1.1" 200 -
127.0.0.1 - - [08/Mar/2025 13:16:18] "GET /generate?prompt=Merhaba HTTP/1.1" 200 -
127.0.0.1 - - [08/Mar/2025 13:17:17] "GET /generate?prompt=Bu%20hafta%20Fenerbahçe%20maçları HTTP/1.1" 200 -
127.0.0.1 - - [08/Mar/2025 13:18:05] "GET /generate?prompt=Sağlık%20üzerine%20haber%20yaz HTTP/1.1" 200 -


In [2]:
from flask import Flask, Response, request, render_template_string
from transformers import TextStreamer, StoppingCriteria, BitsAndBytesConfig
from unsloth import FastLanguageModel
from peft import PeftModel
import torch
import queue
import threading

# Flask application object; the route decorators further down register on it.
app = Flask(__name__)

# The same directory is used for the model load and for the PEFT adapter load.
CHECKPOINT_DIR = "C:\\Users\\Mehmet\\Desktop\\Denizhan2\\model_egitim\\checkpoint-1500"

# Quantization settings: 4-bit NF4 with double quantization, bfloat16 compute.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load the model and the tokenizer.
# NOTE(review): the output recorded for this cell ends in a ValueError about
# modules dispatched on the CPU or disk; presumably device_map="auto"
# offloaded layers because free GPU memory was short at the time -- confirm.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=CHECKPOINT_DIR,
    max_seq_length=2048,
    dtype=torch.bfloat16,
    load_in_4bit=True,
    quantization_config=quantization_config,
    device_map="auto",
)

# Load the PEFT adapter on top of the loaded model, then prepare for inference.
model = PeftModel.from_pretrained(model, CHECKPOINT_DIR)
FastLanguageModel.for_inference(model)
model.eval()

# Thread management: the stop flag of the most recently started generation,
# and the lock that guards replacing it.
stop_lock = threading.Lock()
current_stop_event = None

class StopGenerationCriteria(StoppingCriteria):
    """Stopping criterion controlled by an externally owned event flag.

    Args:
        stop_event: Object exposing ``is_set()`` (a ``threading.Event`` in
            this file); generation is asked to stop once it is set.
    """

    def __init__(self, stop_event):
        super().__init__()
        # The flag is only read here; whoever created it decides when to set it.
        self.stop_event = stop_event

    def __call__(self, input_ids, scores, **kwargs):
        """Return the current state of the stop flag; all arguments are ignored."""
        should_stop = self.stop_event.is_set()
        return should_stop

class WebStreamer(TextStreamer):
    """Streamer that forwards finalized text to a queue.

    Args:
        tokenizer: Passed through to ``TextStreamer``.
        queue: Object with a ``put`` method that receives each text chunk.
        **kwargs: Passed through to ``TextStreamer``.
    """

    def __init__(self, tokenizer, queue, **kwargs):
        super().__init__(tokenizer, **kwargs)
        self.queue = queue

    def on_finalized_text(self, text: str, stream_end: bool = False):
        """Enqueue ``text``; when ``stream_end`` is true, also enqueue ``None``."""
        sink = self.queue
        sink.put(text)
        if not stream_end:
            return
        # ``None`` is the end-of-stream marker the queue consumer looks for.
        sink.put(None)

def format_alpaca_prompt(instruction):
    """Wrap ``instruction`` in the Turkish Alpaca-style prompt template.

    The result is a fixed preamble, a ``### Talimat:`` section holding the
    instruction, and an empty ``### Yanıt:`` section for the model to fill.
    """
    template = (
        "Aşağıda bir görevi açıklayan bir talimat bulunmaktadır. İsteği uygun şekilde tamamlayan bir yanıt yazın.\n"
        "\n"
        "### Talimat:\n"
        "{instruction}\n"
        "\n"
        "### Yanıt:\n"
    )
    return template.format(instruction=instruction)

@app.route('/')
def index() -> str:
    """Serve the single-page chat UI.

    The page is an inline HTML document. Its script opens an ``EventSource``
    on ``/generate?prompt=...`` for each submitted prompt, appends every
    received message to the newest AI bubble, and on a message whose data is
    ``DONE`` closes the connection and adds a completion note.
    """
    return render_template_string('''
<!DOCTYPE html>
<html lang="tr">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Alpaca Chat</title>
  <style>
    /* Stil tanımları aynı kalıyor */
    .container { max-width: 800px; margin: 0 auto; padding: 20px; height: 100vh; display: flex; flex-direction: column; }
    .chat-area { flex-grow: 1; overflow-y: auto; background: #fff; border-radius: 8px; padding: 20px; margin: 10px 0; }
    .message { max-width: 75%; padding: 10px; margin: 5px 0; border-radius: 8px; }
    .user { background: #dcf8c6; margin-left: auto; }
    .ai { background: #f0f0f0; }
    .input-container { display: flex; gap: 10px; }
    textarea { flex-grow: 1; padding: 10px; border-radius: 8px; border: 1px solid #ddd; }
    button { background: #007bff; color: white; border: none; padding: 10px 20px; border-radius: 8px; cursor: pointer; }
  </style>
</head>
<body>
  <div class="container">
    <h1>Alpaca Chat</h1>
    <div id="chatArea" class="chat-area"></div>
    <div class="input-container">
      <textarea id="prompt" rows="3" placeholder="Alpaca modeli için sorunuzu girin..."></textarea>
      <button onclick="sendPrompt()">Gönder</button>
    </div>
  </div>

  <script>
    let currentEventSource = null;

    function sendPrompt() {
      const prompt = document.getElementById('prompt').value.trim();
      if (!prompt) return;

      const chatArea = document.getElementById('chatArea');
      
      // Kullanıcı mesajını ekle
      const userDiv = document.createElement('div');
      userDiv.className = 'message user';
      userDiv.textContent = prompt;
      chatArea.appendChild(userDiv);

      // AI mesaj konteyneri
      const aiDiv = document.createElement('div');
      aiDiv.className = 'message ai';
      chatArea.appendChild(aiDiv);
      
      // Önceki bağlantıyı kapat
      if(currentEventSource) currentEventSource.close();

      // Yeni istek başlat
      currentEventSource = new EventSource(`/generate?prompt=${encodeURIComponent(prompt)}`);
      
      currentEventSource.onmessage = (e) => {
        if(e.data === 'DONE') {
          currentEventSource.close();
          aiDiv.innerHTML += '<div style="color: #666; font-size: 0.8em">▼ Cevap Tamamlandı</div>';
          return;
        }
        aiDiv.textContent += e.data;
        chatArea.scrollTop = chatArea.scrollHeight;
      };
      
      document.getElementById('prompt').value = '';
      chatArea.scrollTop = chatArea.scrollHeight;
    }
  </script>
</body>
</html>
''')

@app.route('/generate')
def generate():
    """Stream a completion for the ``prompt`` query parameter as Server-Sent Events.

    The prompt is wrapped in the Alpaca template and tokenized in the request
    thread. Generation then runs in a background thread that feeds a queue;
    the response body drains that queue and ends with a ``DONE`` message.
    Starting a new request sets the stop flag of the previously started
    generation.

    Returns:
        A ``text/event-stream`` response. Each text chunk is one SSE event;
        the final event carries the data ``DONE``.
    """
    global current_stop_event

    with stop_lock:
        # Cancel the generation started by the previous request, then publish
        # a fresh flag for this one.
        if current_stop_event:
            current_stop_event.set()
        current_stop_event = threading.Event()
        # Take the local copy while still holding the lock. Reading the global
        # after release could pick up a flag installed by a concurrent
        # request, leaving this generation impossible to cancel.
        stop_event = current_stop_event

    prompt = request.args.get('prompt', '')
    response_queue = queue.Queue()

    # Convert to the Alpaca format.
    formatted_prompt = format_alpaca_prompt(prompt)

    # Tokenizer settings.
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=2048,
    ).to("cuda")

    # Streamer settings.
    streamer = WebStreamer(
        tokenizer=tokenizer,
        queue=response_queue,
        skip_prompt=True,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    def generation_task():
        """Run sampling-based generation, feeding ``response_queue``."""
        try:
            model.generate(
                **inputs,
                streamer=streamer,
                max_new_tokens=2048,
                temperature=0.6,
                top_p=0.9,
                top_k=40,
                do_sample=True,
                repetition_penalty=1.15,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
                stopping_criteria=[StopGenerationCriteria(stop_event)],
            )
        finally:
            # Always terminate the stream, even if generation raised.
            response_queue.put(None)

    threading.Thread(target=generation_task).start()

    def stream():
        """Yield queued chunks as SSE events until the ``None`` sentinel."""
        try:
            while True:
                chunk = response_queue.get()
                if chunk is None:
                    yield "data: DONE\n\n"
                    break
                # Emit one "data:" field per line. A raw newline inside the
                # chunk would otherwise end the event early, and the line
                # break would never reach the browser.
                lines = chunk.replace("\r\n", "\n").replace("\r", "\n").split("\n")
                yield "".join(f"data: {line}\n" for line in lines) + "\n"
        finally:
            # Also runs when the generator is closed before the sentinel
            # arrives (e.g. the client went away): stop generating then.
            stop_event.set()

    return Response(stream(), mimetype="text/event-stream")

if __name__ == "__main__":
    # Start Flask's built-in server on all interfaces, port 5000, with
    # threaded request handling enabled.
    server_options = {"host": "0.0.0.0", "port": 5000, "threaded": True}
    app.run(**server_options)

==((====))==  Unsloth 2025.2.15: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA RTX A5000. Max memory: 23.988 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [1]:

from flask import Flask, Response, request, render_template_string
from transformers import TextStreamer, StoppingCriteria, BitsAndBytesConfig
from unsloth import FastLanguageModel
from peft import PeftModel
import torch
import queue
import threading

# Flask application object; the route decorators further down register on it.
app = Flask(__name__)

# The same directory is used for the model load and for the PEFT adapter load.
CHECKPOINT_DIR = "C:\\Users\\Mehmet\\Desktop\\Denizhan2\\model_egitim\\checkpoint-1500"

# Quantization settings: 4-bit NF4 with double quantization, bfloat16 compute.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load the model and the tokenizer.
# NOTE(review): the output recorded for this cell ends in a ValueError about
# modules dispatched on the CPU or disk; presumably device_map="auto"
# offloaded layers because free GPU memory was short at the time -- confirm.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=CHECKPOINT_DIR,
    max_seq_length=2048,
    dtype=torch.bfloat16,
    load_in_4bit=True,
    quantization_config=quantization_config,
    device_map="auto",
)

# Load the PEFT adapter on top of the loaded model, then prepare for inference.
model = PeftModel.from_pretrained(model, CHECKPOINT_DIR)
FastLanguageModel.for_inference(model)
model.eval()

# Thread management: the stop flag of the most recently started generation,
# and the lock that guards replacing it.
stop_lock = threading.Lock()
current_stop_event = None

class StopGenerationCriteria(StoppingCriteria):
    """Stopping criterion controlled by an externally owned event flag.

    Args:
        stop_event: Object exposing ``is_set()`` (a ``threading.Event`` in
            this file); generation is asked to stop once it is set.
    """

    def __init__(self, stop_event):
        super().__init__()
        # The flag is only read here; whoever created it decides when to set it.
        self.stop_event = stop_event

    def __call__(self, input_ids, scores, **kwargs):
        """Return the current state of the stop flag; all arguments are ignored."""
        should_stop = self.stop_event.is_set()
        return should_stop

class WebStreamer(TextStreamer):
    """Streamer that forwards finalized text to a queue.

    Args:
        tokenizer: Passed through to ``TextStreamer``.
        queue: Object with a ``put`` method that receives each text chunk.
        **kwargs: Passed through to ``TextStreamer``.
    """

    def __init__(self, tokenizer, queue, **kwargs):
        super().__init__(tokenizer, **kwargs)
        self.queue = queue

    def on_finalized_text(self, text: str, stream_end: bool = False):
        """Enqueue ``text``; when ``stream_end`` is true, also enqueue ``None``."""
        sink = self.queue
        sink.put(text)
        if not stream_end:
            return
        # ``None`` is the end-of-stream marker the queue consumer looks for.
        sink.put(None)



def format_alpaca_prompt(instruction):
    """Wrap ``instruction`` in the Turkish Alpaca-style prompt template.

    The result is a fixed preamble, a ``### Talimat:`` section holding the
    instruction, and an empty ``### Yanıt:`` section for the model to fill.
    """
    template = (
        "Aşağıda bir görevi açıklayan bir talimat bulunmaktadır. İsteği uygun şekilde tamamlayan bir yanıt yazın.\n"
        "\n"
        "### Talimat:\n"
        "{instruction}\n"
        "\n"
        "### Yanıt:\n"
    )
    return template.format(instruction=instruction)

@app.route('/')
def index() -> str:
    """Serve the single-page chat UI.

    The page is an inline HTML document. Its script opens an ``EventSource``
    on ``/generate?prompt=...`` for each submitted prompt, appends every
    received message to the newest AI bubble, and on a message whose data is
    ``DONE`` closes the connection and adds a completion note.
    """
    return render_template_string('''
<!DOCTYPE html>
<html lang="tr">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Alpaca Chat</title>
  <style>
    /* Stil tanımları aynı kalıyor */
    .container { max-width: 800px; margin: 0 auto; padding: 20px; height: 100vh; display: flex; flex-direction: column; }
    .chat-area { flex-grow: 1; overflow-y: auto; background: #fff; border-radius: 8px; padding: 20px; margin: 10px 0; }
    .message { max-width: 75%; padding: 10px; margin: 5px 0; border-radius: 8px; }
    .user { background: #dcf8c6; margin-left: auto; }
    .ai { background: #f0f0f0; }
    .input-container { display: flex; gap: 10px; }
    textarea { flex-grow: 1; padding: 10px; border-radius: 8px; border: 1px solid #ddd; }
    button { background: #007bff; color: white; border: none; padding: 10px 20px; border-radius: 8px; cursor: pointer; }
  </style>
</head>
<body>
  <div class="container">
    <h1>Alpaca Chat</h1>
    <div id="chatArea" class="chat-area"></div>
    <div class="input-container">
      <textarea id="prompt" rows="3" placeholder="Alpaca modeli için sorunuzu girin..."></textarea>
      <button onclick="sendPrompt()">Gönder</button>
    </div>
  </div>

  <script>
    let currentEventSource = null;

    function sendPrompt() {
      const prompt = document.getElementById('prompt').value.trim();
      if (!prompt) return;

      const chatArea = document.getElementById('chatArea');
      
      // Kullanıcı mesajını ekle
      const userDiv = document.createElement('div');
      userDiv.className = 'message user';
      userDiv.textContent = prompt;
      chatArea.appendChild(userDiv);

      // AI mesaj konteyneri
      const aiDiv = document.createElement('div');
      aiDiv.className = 'message ai';
      chatArea.appendChild(aiDiv);
      
      // Önceki bağlantıyı kapat
      if(currentEventSource) currentEventSource.close();

      // Yeni istek başlat
      currentEventSource = new EventSource(`/generate?prompt=${encodeURIComponent(prompt)}`);
      
      currentEventSource.onmessage = (e) => {
        if(e.data === 'DONE') {
          currentEventSource.close();
          aiDiv.innerHTML += '<div style="color: #666; font-size: 0.8em">▼ Cevap Tamamlandı</div>';
          return;
        }
        aiDiv.textContent += e.data;
        chatArea.scrollTop = chatArea.scrollHeight;
      };
      
      document.getElementById('prompt').value = '';
      chatArea.scrollTop = chatArea.scrollHeight;
    }
  </script>
</body>
</html>
''')

@app.route('/generate')
def generate():
    """Stream a completion for the ``prompt`` query parameter as Server-Sent Events.

    The prompt is wrapped in the Alpaca template and tokenized in the request
    thread. Generation then runs in a background thread that feeds a queue;
    the response body drains that queue and ends with a ``DONE`` message.
    Starting a new request sets the stop flag of the previously started
    generation.

    Returns:
        A ``text/event-stream`` response. Each text chunk is one SSE event;
        the final event carries the data ``DONE``.
    """
    global current_stop_event

    with stop_lock:
        # Cancel the generation started by the previous request, then publish
        # a fresh flag for this one.
        if current_stop_event:
            current_stop_event.set()
        current_stop_event = threading.Event()
        # Take the local copy while still holding the lock. Reading the global
        # after release could pick up a flag installed by a concurrent
        # request, leaving this generation impossible to cancel.
        stop_event = current_stop_event

    prompt = request.args.get('prompt', '')
    response_queue = queue.Queue()

    # Convert to the Alpaca format.
    formatted_prompt = format_alpaca_prompt(prompt)

    # Tokenizer settings.
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=2048,
    ).to("cuda")

    # Streamer settings.
    streamer = WebStreamer(
        tokenizer=tokenizer,
        queue=response_queue,
        skip_prompt=True,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    def generation_task():
        """Run sampling-based generation, feeding ``response_queue``."""
        try:
            model.generate(
                **inputs,
                streamer=streamer,
                max_new_tokens=2048,
                temperature=0.6,
                top_p=0.9,
                top_k=40,
                do_sample=True,
                repetition_penalty=1.15,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
                stopping_criteria=[StopGenerationCriteria(stop_event)],
            )
        finally:
            # Always terminate the stream, even if generation raised.
            response_queue.put(None)

    threading.Thread(target=generation_task).start()

    def stream():
        """Yield queued chunks as SSE events until the ``None`` sentinel."""
        try:
            while True:
                chunk = response_queue.get()
                if chunk is None:
                    yield "data: DONE\n\n"
                    break
                # Emit one "data:" field per line. A raw newline inside the
                # chunk would otherwise end the event early, and the line
                # break would never reach the browser.
                lines = chunk.replace("\r\n", "\n").replace("\r", "\n").split("\n")
                yield "".join(f"data: {line}\n" for line in lines) + "\n"
        finally:
            # Also runs when the generator is closed before the sentinel
            # arrives (e.g. the client went away): stop generating then.
            stop_event.set()

    return Response(stream(), mimetype="text/event-stream")

if __name__ == "__main__":
    # Start Flask's built-in server on all interfaces, port 5000, with
    # threaded request handling enabled.
    server_options = {"host": "0.0.0.0", "port": 5000, "threaded": True}
    app.run(**server_options)

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.15: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA RTX A5000. Max memory: 23.988 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.
  self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False)


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 