In [None]:
# Check your GPU type.
# nvidia-smi reports the attached NVIDIA GPU model(s), the driver/CUDA versions
# and current memory usage; run it first to confirm a GPU runtime is attached.
!nvidia-smi

In [None]:
!pip install torch==2.3.0+cu121 --index-url https://download.pytorch.org/whl/cu121
!pip install "transformers>=4.41.2" "accelerate>=0.27.2"

In [None]:
!pip install vllm
#IMPORTANT - Restart the Colab runtime.

In [None]:
# Key Differences from transformers
# Model Loading: You use vllm.LLM() instead of AutoModelForCausalLM.

# Generation Parameters: Instead of passing temperature, top_p, etc., to the .generate() method, you bundle them into a SamplingParams object.

# Speed: You'll notice that once the model is loaded, the generation is significantly faster, especially if you were to pass multiple prompts at once (llm.generate([prompt1, prompt2, ...], sampling_params)).


In [None]:
import torch
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# --- 1. Model selection ---
# Hugging Face Hub repository ID of the checkpoint to serve.
model_id = "Dushyant4342/ft-llama3-8b-credit-analyst"

# --- 2. Engine start-up ---
# vLLM's LLM class loads the checkpoint itself; tensor_parallel_size=1 keeps
# the whole model on a single GPU.
# NOTE(review): trust_remote_code=True permits Python code shipped in the Hub
# repository to run while loading. Confirm this checkpoint actually needs it.
engine_options = {"tensor_parallel_size": 1, "trust_remote_code": True}

print(f"Loading model with vLLM: {model_id}")
llm = LLM(model=model_id, **engine_options)
print("Model loaded successfully.")

# A Hugging Face tokenizer is loaded separately; later cells use it only to
# render chat messages through the model's chat template.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:

# --- 3. Prepare Your Prompt ---
# Two chat turns are assembled here: a system instruction, and a user turn made
# of the task sentence followed by the raw credit report text.
system_prompt = (
    "You are an expert credit analyst providing a concise, balanced summary "
    "of a customer's credit profile changes."
)

# Sample report, kept verbatim (including blank lines) because the model sees
# this text exactly as written.
customer_data = """
--- Credit Profile Report for Customer: 3494583211374151124 ---

## Key Metric Summary
-  Risk Score : 737.0 (was 729.0)
-  Overall Utilization : 76.11% (was 83.31%)
-  Total Active Accounts : 1.0 (was 1.0)

## Current Credit Mix
-  Total Accounts : 1 (1.0 active)
-  Active Secured Products : 1
-  Active Unsecured Products : 0
-  Active Lender Distribution : Public(0), Private(0), NBFC(1), Corporate(0), Foreign(0)

## Payment Behavior History
-  Past Delinquencies : User has been delinquent on 0 accounts in their history.

## Account Details Breakdown

-  Account : TVS CREDIT -  Two-wheeler Loan (#3442)
  -  Status : Active
  -  DPD : 0.0 days (was 0.0 days)
  -  Utilization : 76.11% (was 83.31%)

"""

# The user turn is "<command>", a blank line, a DATA marker line, then the report.
user_command = "Summarize the key takeaways in one bullet point."
user_content = "\n\n--- DATA ---\n".join((user_command, customer_data))

# Chat messages in the role/content shape that apply_chat_template expects.
msgs = [
    {"role": speaker, "content": text}
    for speaker, text in (("system", system_prompt), ("user", user_content))
]


In [None]:
import time


In [None]:
%%time

# Apply the chat template to create the final prompt string
prompt = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

# --- 4. Generate the Response ---
# Define sampling parameters
sampling_params = SamplingParams(
    temperature=0.9,
    top_p=0.8,     # tells the model to consider the most likely next words until their combined probability adds up to 90%.
    max_tokens=228  # Controls the maximum length of the output
)

# Generate output in batches (here, a batch of one)
outputs = llm.generate([prompt], sampling_params)

# --- 5. Print the Result ---
# The output is a list, so we access the first result.
response = outputs[0].outputs[0].text

print("\n--- vLLM Model Response ---")
print(response)

In [None]:
# Print the version of the `python` executable found on the shell PATH.
# NOTE(review): this is presumably the interpreter the kernel runs on, but the
# shell's `python` can differ from the kernel's; confirm with sys.version if it matters.
!python --version

In [None]:
%%time
customer_data = """
Credit score jumped to 660 (from 595) due to sharp drop in utilisation (0% from 52.56%) and closure of one account. However, user has history of 4 delinquencies, including a 900-day DPD with DHANI. New enquiry made with Home Credit on 20 Nov 2024.
"""


system_prompt = """You are a helpful financial advisor speaking directly to a customer about their credit report. Your task is to rewrite a summary of their credit report changes into a clear, direct message for them.

Always use the second person (e.g., "Your score," "you have," "your utilization") and avoid using the third person ("the user," "their score")."""



user_command = "Summarize the key takeaways in one bullet point."
user_content = f"{user_command}\n\n--- DATA ---\n{customer_data}"

msgs = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_content},
]

prompt = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

sampling_params = SamplingParams(
    temperature=0.9,
    top_p=0.8,
    max_tokens=228
)

outputs = llm.generate([prompt], sampling_params)
response = outputs[0].outputs[0].text

print("\n--- vLLM Model Response ---")
print(response)