In [None]:
from pydantic import BaseModel
import torch

In [None]:
# This part of code will skip all the un-necessary warnings which can occur during the execution of this project.
import warnings
warnings.filterwarnings("ignore", category=Warning)

In [None]:
# Build llama-cpp-python 0.2.69 with GPU support: CMAKE_ARGS passes
# -DLLAMA_CUDA=on to the build that pip runs for this package.
!CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python==0.2.69
# huggingface_hub provides hf_hub_download, used to fetch model files from the HF Hub.
# NOTE(review): this install is unpinned — consider pinning a version for reproducibility.
!pip install huggingface_hub

Collecting llama-cpp-python==0.2.69
  Downloading llama_cpp_python-0.2.69.tar.gz (42.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python==0.2.69)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.69-cp311-cp311-linux_x86_64.whl size=55715097 sha256=9a

In [None]:
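# Optional fallback, kept commented out: force a clean rebuild with the older cuBLAS flag.
# CMAKE_ARGS must be on the same line as pip, otherwise it is not applied to the build.
# !pip uninstall llama-cpp-python
# !CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install --force-reinstall llama-cpp-python --no-cache-dir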

In [None]:
import json

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

from tqdm import tqdm
from collections import Counter

In [None]:
# Download (or reuse the cached copy of) a GGUF model and load it with llama.cpp.
def load_model(
    model_name_or_path="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    model_basename="mistral-7b-instruct-v0.2.Q5_K_M.gguf",
    n_threads=2,
    n_batch=512,
    n_gpu_layers=43,
    n_ctx=4096,
):
    """Fetch a GGUF model file from the Hugging Face Hub and wrap it in ``Llama``.

    All arguments default to the values this notebook was originally written
    with, so ``load_model()`` behaves exactly as before.

    Args:
        model_name_or_path: Hub repository id that hosts the model file.
        model_basename: File name of the GGUF weights inside that repository.
        n_threads: Number of CPU threads handed to llama.cpp.
        n_batch: Batch size; should be between 1 and ``n_ctx``, sized to the
            available GPU memory.
        n_gpu_layers: Layer count passed to llama.cpp for GPU offload; tune it
            to the model and the GPU's VRAM.
        n_ctx: Context window size in tokens.

    Returns:
        The loaded ``Llama`` instance.
    """
    model_path = hf_hub_download(
        repo_id=model_name_or_path,
        filename=model_basename,
    )
    return Llama(
        model_path=model_path,
        n_threads=n_threads,
        n_batch=n_batch,
        n_gpu_layers=n_gpu_layers,
        n_ctx=n_ctx,
    )

model = load_model()

mistral-7b-instruct-v0.2.Q5_K_M.gguf:   0%|          | 0.00/5.13G [00:00<?, ?B/s]

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /root/.cache/huggingface/hub/models--TheBloke--Mistral-7B-Instruct-v0.2-GGUF/snapshots/3a6fbf4a41a1d52e415a4958cde6856d34b2db93/mistral-7b-instruct-v0.2.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loa

In [None]:
!pip install flask flask-ngrok
!pip install pyngrok

Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25
Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [None]:
from pydantic import BaseModel
from flask import Flask, jsonify, request
from flask_ngrok import run_with_ngrok  # Ensure you have flask-ngrok installed
from pyngrok import ngrok

import getpass
import os

app = Flask(__name__)
run_with_ngrok(app)  # Enables ngrok

# SECURITY: the ngrok authtoken must never be written into the notebook. It is read
# from the NGROK_AUTHTOKEN environment variable, with an interactive prompt as a
# fallback. Any token that was previously committed here should be revoked in the
# ngrok dashboard.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass.getpass("ngrok authtoken: ")
if not ngrok_authtoken:
    raise RuntimeError("An ngrok authtoken is required; set NGROK_AUTHTOKEN or enter it when prompted.")
# Registers the token with the ngrok agent managed by pyngrok.
ngrok.set_auth_token(ngrok_authtoken)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
class PredictionRequest(BaseModel):
    """Request schema: a user message plus an optional cap on generated tokens."""

    # Text supplied by the caller; required because it has no default.
    message: str
    # Upper bound on generated tokens; defaults to 200 when omitted.
    max_tokens: int = 200

In [None]:
# Outer instruction wrapper: `{prompt}` is filled in with str.format and placed
# between the [INST] ... [/INST] markers.
# NOTE(review): the template hard-codes a leading <s>; confirm the tokenizer does not
# also prepend its own BOS token, which would duplicate it.
mistral_prompt_template = """<s>[INST]{prompt}[/INST]"""

# Instruction body. Placeholders: `{context}` (background/system text),
# `{num_answers}` (how many answers to request) and `{question}` (the user query;
# note that a literal "." is appended right after it).
answers_template = """
Context:
{context}
===
Using the context above generate {num_answers} distinct answers to the following question:
Question:
{question}.

Arrange your answers in numbered bullet points.
Present only the answers in bullet points.
"""

In [None]:
def generateModelResponse(user_query, user_query_context, max_tokens=1024, num_answers=3):
    """Ask the loaded model for several answers to a question, given a context.

    The question and context are inserted into ``answers_template``, which is
    then wrapped in ``mistral_prompt_template`` before being sent to ``model``.

    Args:
        user_query: The question / user message to answer.
        user_query_context: Context text placed above the question.
        max_tokens: Upper bound on generated tokens (default 1024, the value
            that was previously hard-coded).
        num_answers: Number of distinct answers requested in the prompt
            (default 3, the value that was previously hard-coded).

    Returns:
        The text of the first completion choice with surrounding whitespace
        stripped.
    """
    answers_prompt = mistral_prompt_template.format(
        prompt=answers_template.format(
            context=user_query_context,
            question=user_query,
            num_answers=num_answers,
        )
    )

    response = model(
        prompt=answers_prompt,
        max_tokens=max_tokens,
        temperature=0,
        top_p=0.95,
        repeat_penalty=1.2,
        echo=False,  # do not return the prompt
    )
    return response["choices"][0]["text"].strip()

In [None]:
# Smoke test of the sentiment prompt. The result is labelled "User sentiment" to
# match the context passed in (it was previously mislabelled as predictive insights).
{"User sentiment": generateModelResponse("I'm very happy with the product", "You are an AI assistant that performs sentiment analysis.")}

In [None]:
@app.post("/api/v1/predictiveInsight")
def predictive_insights():
    """Return model-generated predictive insights for the posted message.

    Expects a JSON object body; its "message" field (default "") is used as
    the question. Responds with 400 when the body is not a JSON object.
    """
    # silent=True yields None for a missing/invalid JSON body instead of raising,
    # so every bad payload is rejected the same way below.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return {"error": "Request body must be a JSON object with a 'message' field."}, 400
    system_message = "You are an AI assistant that generates predictive insights."
    response = generateModelResponse(payload.get("message", ""), system_message)
    return {"Predictive insights": response}

In [None]:
@app.post("/api/v1/sentiment")
def sentiment_analysis():
    """Return a model-generated sentiment analysis of the posted message.

    Expects a JSON object body; its "message" field (default "") is used as
    the question. Responds with 400 when the body is not a JSON object.
    """
    # silent=True yields None for a missing/invalid JSON body instead of raising,
    # so every bad payload is rejected the same way below.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return {"error": "Request body must be a JSON object with a 'message' field."}, 400
    system_message = "You are an AI assistant that performs sentiment analysis."
    response = generateModelResponse(payload.get("message", ""), system_message)
    return {"User sentiment": response}

In [None]:
# strict_slashes=False lets the URL match with or without the trailing slash, so
# clients that omit the slash are no longer answered with a 308 redirect.
@app.post("/api/v1/personalizeRecommendation/", strict_slashes=False)
def personalized_recommendation():
    """Return model-generated personalized recommendations for the posted message.

    Expects a JSON object body; its "message" field (default "") is used as
    the question. Responds with 400 when the body is not a JSON object.
    """
    # silent=True yields None for a missing/invalid JSON body instead of raising,
    # so every bad payload is rejected the same way below.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return {"error": "Request body must be a JSON object with a 'message' field."}, 400
    system_message = "You are an AI assistant that provide personalized recommendations."
    response = generateModelResponse(payload.get("message", ""), system_message)
    return {"recommendation": response}

In [None]:
# strict_slashes=False lets the URL match with or without the trailing slash, so
# clients that omit the slash are not answered with a redirect.
@app.post("/api/v1/assessRisk/", strict_slashes=False)
def risk_assessment():
    """Return a model-generated risk assessment for the posted message.

    Expects a JSON object body; its "message" field (default "") is used as
    the question. Responds with 400 when the body is not a JSON object.
    """
    # silent=True yields None for a missing/invalid JSON body instead of raising,
    # so every bad payload is rejected the same way below.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return {"error": "Request body must be a JSON object with a 'message' field."}, 400
    system_message = "You are an AI assistant that performs risk assessments."
    response = generateModelResponse(payload.get("message", ""), system_message)
    return {"Risk_assessment": response}

In [None]:
@app.get("/")
def root():
    """Describe the service to clients that request the base URL."""
    service_banner = "AI-driven API for predictive insights, personalization, sentiment analysis, and risk assessment"
    return {"message": service_banner}

In [None]:
# Single source of truth for the port shared by the ngrok tunnel and the Flask
# server (5000 is Flask's default).
FLASK_PORT = 5000

# Open a tunnel on the port Flask is running on.
# NOTE(review): run_with_ngrok(app) is also applied to this app; confirm whether both
# tunnel mechanisms are needed.
public_url = ngrok.connect(FLASK_PORT)
print(" * Tunnel URL:", public_url)

if __name__ == "__main__":
    app.run(port=FLASK_PORT)

 * Tunnel URL: NgrokTunnel: "https://86f0-34-125-62-208.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


 * Running on http://86f0-34-125-62-208.ngrok-free.app
 * Traffic stats available on http://127.0.0.1:4040


Llama.generate: prefix-match hit

llama_print_timings:        load time =     534.48 ms
llama_print_timings:      sample time =      29.52 ms /    57 runs   (    0.52 ms per token,  1931.16 tokens per second)
llama_print_timings: prompt eval time =     227.00 ms /    33 tokens (    6.88 ms per token,   145.38 tokens per second)
llama_print_timings:        eval time =    1644.44 ms /    56 runs   (   29.37 ms per token,    34.05 tokens per second)
llama_print_timings:       total time =    2036.75 ms /    89 tokens
INFO:werkzeug:127.0.0.1 - - [26/Mar/2025 07:07:52] "POST /api/v1/sentiment HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [26/Mar/2025 07:08:10] "[32mPOST /api/v1/personalizeRecommendation HTTP/1.1[0m" 308 -
Llama.generate: prefix-match hit

llama_print_timings:        load time =     534.48 ms
llama_print_timings:      sample time =      63.44 ms /   118 runs   (    0.54 ms per token,  1860.11 tokens per second)
llama_print_timings: prompt eval time =     217.01 ms /    64 to