In [None]:
# Run this cell first to set up everything
!pip install -q pyngrok streamlit transformers peft accelerate bitsandbytes

import subprocess
import threading
from pyngrok import ngrok, conf
import time
import requests
import os

# Configure ngrok with your authtoken.
# The token is NOT pasted into this cell: it is read from the Colab user
# secret named 'NGROK_TOKEN', so store your ngrok authtoken under that name
# before running this cell.
from google.colab import userdata
token = userdata.get('NGROK_TOKEN')
# Set the token on pyngrok's default configuration object.
conf.get_default().auth_token = token

# Function to run Streamlit in the background
def run_streamlit():
    """Write the Streamlit app to ``app.py`` and serve it on port 8501.

    The app source is embedded below as a string, written to ``app.py`` in the
    current working directory (overwriting any existing file), and then started
    with ``streamlit run``.

    This call blocks until the Streamlit process exits, because it uses
    ``subprocess.run``; callers that need to keep working (e.g. to open a
    tunnel) should invoke it from a separate thread.
    """
    # Write the Streamlit app to a file
    # NOTE: this is a regular (non-raw) string literal, so the doubled
    # backslashes in the embedded source (the split/join on newlines) are
    # written to app.py as single-backslash escapes, which is what the app needs.
    app_code = '''import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# Set page configuration
st.set_page_config(
    page_title="Llama-3.1 Code Generator",
    page_icon="🤖",
    layout="wide"
)

# App title and description
st.title("🤖 Llama-3.1 Code Generator")
st.markdown("Generate Python code using fine-tuned Llama-3.1 model with LoRA adapter")

# Sidebar for configuration
with st.sidebar:
    st.header("⚙️ Generation Parameters")

    # Temperature slider
    temperature = st.slider(
        "Temperature",
        min_value=0.1,
        max_value=2.0,
        value=0.7,
        step=0.1,
        help="Controls randomness: Lower = more deterministic, Higher = more creative"
    )

    # Max length slider
    max_length = st.slider(
        "Max Length",
        min_value=50,
        max_value=1000,
        value=200,
        step=50,
        help="Maximum number of tokens to generate"
    )

    # Top-k slider
    top_k = st.slider(
        "Top-k",
        min_value=1,
        max_value=100,
        value=50,
        step=1,
        help="Number of highest probability tokens to consider for sampling"
    )

    st.markdown("---")
    st.info("Adjust these parameters to control the generation behavior.")

# Initialize session state for model loading
if 'model_loaded' not in st.session_state:
    st.session_state.model_loaded = False
if 'model' not in st.session_state:
    st.session_state.model = None
if 'tokenizer' not in st.session_state:
    st.session_state.tokenizer = None
if 'prompt' not in st.session_state:
    st.session_state.prompt = ""

# Model loading section
def load_model():
    """Load the model and tokenizer"""
    try:
        with st.spinner("Loading base model and tokenizer..."):
            base_model_name = "unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit"
            adapter_path = "fatoma/Llama-3.1-8B-Python-Coder"  # Public adapter

            tokenizer = AutoTokenizer.from_pretrained(base_model_name)
            model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                device_map="auto",
                load_in_4bit=True
            )

            with st.spinner("Loading LoRA adapter..."):
                model = PeftModel.from_pretrained(model, adapter_path)
                model.eval()

            st.session_state.tokenizer = tokenizer
            st.session_state.model = model
            st.session_state.model_loaded = True

            st.success("Model loaded successfully!")

    except Exception as e:
        st.error(f"Error loading model: {str(e)}")

# Load model button
if not st.session_state.model_loaded:
    st.info("Click the button below to load the model (this may take a few minutes)")
    if st.button("🚀 Load Model"):
        load_model()
else:
    st.success("✅ Model is ready!")

# Generation function
def generate_response(prompt, max_length, temperature, top_k):
    """Generate a response from the model"""
    try:
        inputs = st.session_state.tokenizer(prompt, return_tensors="pt")
        inputs = {k: v.to(st.session_state.model.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = st.session_state.model.generate(
                **inputs,
                max_new_tokens=max_length,
                temperature=temperature,
                top_k=top_k,
                do_sample=True,
                pad_token_id=st.session_state.tokenizer.eos_token_id
            )

        response = st.session_state.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response
    except Exception as e:
        return f"Error during generation: {str(e)}"

# Main input area
st.subheader("💬 Enter your coding prompt")

# Text input with examples - now using session state
prompt = st.text_area(
    "Prompt:",
    height=100,
    value=st.session_state.prompt,
    placeholder="e.g., Create a function to calculate the sum of a sequence of integers...",
    help="Enter a description of the code you want to generate"
)

# Example prompts with fixed session state handling
col1, col2, col3 = st.columns(3)
with col1:
    if st.button("Example 1"):
        st.session_state.prompt = "Write a function to reverse a string"
        st.rerun()
with col2:
    if st.button("Example 2"):
        st.session_state.prompt = "Create a function to calculate the sum of a sequence of integers."
        st.rerun()
with col3:
    if st.button("Example 3"):
        st.session_state.prompt = "Create a Python program to sort and print out the elements of an array of integers"
        st.rerun()

# Generate button
if st.button("✨ Generate Code", type="primary", disabled=not st.session_state.model_loaded):
    if not prompt.strip():
        st.warning("Please enter a prompt first!")
    else:
        with st.spinner("Generating code..."):
            response = generate_response(prompt, max_length, temperature, top_k)

        # Display results
        st.subheader("📋 Generated Code")

        # Create tabs for different views
        tab1, tab2 = st.tabs(["Formatted", "Raw Output"])

        with tab1:
            # Try to extract just the code part (simplified approach)
            code_lines = []
            in_code_block = False

            for line in response.split('\\n'):
                if line.strip().startswith('```'):
                    in_code_block = not in_code_block
                    continue
                if in_code_block or (line.strip() and not line.strip().startswith('#')):
                    code_lines.append(line)

            code_output = '\\n'.join(code_lines)

            if code_output.strip():
                st.code(code_output, language='python')
            else:
                st.code(response, language='text')

        with tab2:
            st.text_area("Raw output", response, height=300, disabled=True)

        # Display generation info
        st.markdown("---")
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Temperature", f"{temperature:.1f}")
        with col2:
            st.metric("Max Length", max_length)
        with col3:
            st.metric("Top-k", top_k)

# Add some information about the model
with st.expander("ℹ️ About this Model"):
    st.markdown("""
    **Model Details:**
    - Base Model: `unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit`
    - Fine-tuned with LoRA adapter for Python code generation
    - Optimized for coding tasks and Python development

    **Generation Parameters:**
    - **Temperature**: Controls randomness (0.1-2.0)
    - **Max Length**: Maximum tokens to generate
    - **Top-k**: Number of tokens to consider during sampling

    **Note**: First generation might be slower as the model warms up.
    """)

# Footer
st.markdown("---")
st.caption("Powered by Transformers, PEFT, and Streamlit")'''

    # The app source contains non-ASCII characters (emoji in titles and
    # buttons), so the encoding is pinned to UTF-8 instead of depending on the
    # platform's locale default, which may be unable to encode them.
    with open('app.py', 'w', encoding='utf-8') as f:
        f.write(app_code)

    # Run Streamlit (blocks until the server process exits)
    subprocess.run(['streamlit', 'run', 'app.py', '--server.port', '8501', '--server.address', '0.0.0.0'])

# Start Streamlit in a separate thread: run_streamlit() blocks until the
# server process exits, so it cannot run on the main thread.
thread = threading.Thread(target=run_streamlit)
thread.start()

# Wait a bit for the server to start
time.sleep(5)

# Set up ngrok tunnel. ngrok.connect() returns an NgrokTunnel object, not a
# string; it stays bound to `public_url` so existing uses of that name keep
# working.
public_url = ngrok.connect(8501, bind_tls=True)

# The plain URL lives on the tunnel's `public_url` attribute. Formatting the
# tunnel object itself yields 'NgrokTunnel: "https://..." -> "http://..."',
# which is noisy when printed and produces a broken href (the embedded quotes
# terminate the attribute early).
app_url = public_url.public_url
print("Your Streamlit app is now available at:")
print(f"👉 {app_url}")

# Display clickable link (import `display` explicitly rather than relying on
# the notebook injecting it as a builtin)
from IPython.display import HTML, display
display(HTML(f'<a href="{app_url}" target="_blank">Open Llama-3.1 Code Generator</a>'))

# Keep the tunnel open until the cell is interrupted
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    print("Shutting down...")
    ngrok.kill()

Your Streamlit app is now available at:
👉 NgrokTunnel: "https://acd91d475c25.ngrok-free.app" -> "http://localhost:8501"


Shutting down...
