In [79]:
from pydantic import BaseModel
import torch

In [80]:
# Suppress warnings raised during the rest of the session so cell output stays readable.
# NOTE(review): category=Warning is the base class of every warning, so this also hides
# deprecation and runtime warnings that may matter — consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore", category=Warning)

In [81]:
# Install llama-cpp-python 0.2.69, passing -DLLAMA_CUDA=on through CMAKE_ARGS so the build targets the GPU.
!CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python==0.2.69
# For downloading the models from HF Hub
# NOTE(review): huggingface_hub is not pinned — consider pinning a version for reproducible runs.
!pip install huggingface_hub



In [82]:
# NOTE(review): unpinned install of a package the previous cell already installs as
# llama-cpp-python==0.2.69 with CUDA enabled — this looks redundant; confirm and remove.
!pip install llama-cpp-python



In [83]:
import json

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

from tqdm import tqdm
from collections import Counter

In [84]:
# Fetch the quantized Mistral 7B Instruct GGUF file from the Hugging Face Hub and load it with llama.cpp.
def load_model(
    repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    filename="mistral-7b-instruct-v0.2.Q5_K_M.gguf",
    n_threads=2,
    n_batch=512,
    n_gpu_layers=43,
    n_ctx=4096,
):
    """Download a GGUF model file and return a ready-to-use ``Llama`` instance.

    Calling ``load_model()`` with no arguments behaves exactly as before; the
    keyword arguments only expose values that used to be hard-coded.

    Args:
        repo_id: Hugging Face Hub repository that hosts the GGUF file.
        filename: Name of the GGUF file inside ``repo_id``.
        n_threads: Number of CPU cores to use.
        n_batch: Should be between 1 and ``n_ctx``; consider the amount of
            VRAM in your GPU.
        n_gpu_layers: Change this value based on your model and your GPU
            VRAM pool.
        n_ctx: Context window size.

    Returns:
        The ``Llama`` object built from the downloaded model file.
    """
    # hf_hub_download returns the local path of the requested file.
    model_path = hf_hub_download(repo_id=repo_id, filename=filename)

    return Llama(
        model_path=model_path,
        n_threads=n_threads,
        n_batch=n_batch,
        n_gpu_layers=n_gpu_layers,
        n_ctx=n_ctx,
    )

# Load the model once at notebook scope; generateModelResponse calls this global `model` directly.
model = load_model()

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /root/.cache/huggingface/hub/models--TheBloke--Mistral-7B-Instruct-v0.2-GGUF/snapshots/3a6fbf4a41a1d52e415a4958cde6856d34b2db93/mistral-7b-instruct-v0.2.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loa

In [85]:
# Web framework plus the two ngrok helpers imported by the API cells below.
# NOTE(review): none of these packages is pinned — consider pinning versions for reproducible runs.
!pip install flask flask-ngrok
!pip install pyngrok



In [206]:
import os
from getpass import getpass

from pydantic import BaseModel
from flask import Flask, jsonify, request
from flask_ngrok import run_with_ngrok  # Ensure you have flask-ngrok installed
from pyngrok import ngrok

app = Flask(__name__)
run_with_ngrok(app)  # Enables ngrok

# Never hardcode the ngrok authtoken in the notebook: read it from the
# NGROK_AUTHTOKEN environment variable, or fall back to a prompt that does not
# echo the value into the cell output.
# SECURITY: the token that used to be written literally in this cell has been
# exposed and should be revoked from the ngrok dashboard.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [207]:
class PredictionRequest(BaseModel):
  """Pydantic schema describing a prediction request body.

  NOTE(review): none of the route handlers visible in this notebook reference
  this model; they read the raw JSON body via request.get_json() — confirm
  whether it is still needed.
  """
  # Required text field; same key name the route handlers read from the JSON body.
  message: str
  # Optional, defaults to 200; not read by any handler visible in this notebook.
  max_tokens: int = 200

In [208]:
# Outer wrapper: places the fully rendered prompt between [INST] and [/INST].
# NOTE(review): the literal <s> may duplicate the BOS token if the tokenizer also
# inserts one — confirm against the llama-cpp-python version in use.
mistral_prompt_template = """<s>[INST]{prompt}[/INST]"""

# Inner prompt with three placeholders: {context}, {num_answers} and {question}.
# generateModelResponse fills {context} with its user_query_context argument and
# {question} with the user's query; note that a "." is appended after the question.
answers_template = """
Context:
{context}
===
Using the context above generate {num_answers} distinct answers to the following question:
Question:
{question}.

Arrange your answers in numbered bullet points.
Present only the answers in bullet points.
"""

In [209]:
def generateModelResponse(user_query, user_query_context, num_answers=3, max_tokens=1024):
  """Ask the loaded model for several answers to ``user_query``.

  The query and context are rendered into ``answers_template``, wrapped in
  ``mistral_prompt_template``, and sent to the module-level ``model``.

  Args:
      user_query: Text substituted for ``{question}`` in the prompt.
      user_query_context: Text substituted for ``{context}`` in the prompt.
      num_answers: How many distinct answers the prompt asks for. Defaults
          to 3, the value that was previously hard-coded.
      max_tokens: Upper bound on generated tokens passed to the model.
          Defaults to 1024, the value that was previously hard-coded.

  Returns:
      The text of the first completion choice with surrounding whitespace
      stripped.
  """
  answers_prompt = mistral_prompt_template.format(
      prompt=answers_template.format(
          context=user_query_context,
          question=user_query,
          num_answers=num_answers
      )
  )

  response = model(
    prompt=answers_prompt,
    max_tokens=max_tokens,
    temperature=0,
    top_p=0.95,
    repeat_penalty=1.2,
    echo=False # do not return the prompt
  )
  # Only the first choice is used.
  return response["choices"][0]["text"].strip()

In [210]:
# Manual smoke test: calls generateModelResponse once with a sentiment-analysis context.
# NOTE(review): the key is spelled "Predicive insights" and labels a sentiment-analysis
# call — confirm the intended label before relying on this cell.
{"Predicive insights": generateModelResponse("I'm very happy with the product", "You are an AI assistant that performs sentiment analysis.")}

Llama.generate: prefix-match hit

llama_print_timings:        load time =     383.23 ms
llama_print_timings:      sample time =      39.18 ms /    62 runs   (    0.63 ms per token,  1582.44 tokens per second)
llama_print_timings: prompt eval time =     218.62 ms /    59 tokens (    3.71 ms per token,   269.87 tokens per second)
llama_print_timings:        eval time =    1682.22 ms /    61 runs   (   27.58 ms per token,    36.26 tokens per second)
llama_print_timings:       total time =    2156.22 ms /   120 tokens


{'Predicive insights': '1. Positive feedback received: You have expressed satisfaction with the product.\n2. High level of happiness: Your sentiment towards the product is extremely positive.\n3. Product meets expectations: Based on your statement, it can be inferred that the product has met or exceeded your expectations.'}

In [211]:
@app.post("/predictive_insights/")
def predictive_insights():
    """Generate predictive insights for the ``message`` field of a JSON body.

    Returns:
        ``{"Predictive insights": <model text>}`` on success, or a JSON error
        payload with HTTP 400 when the body is not a JSON object.
    """
    # silent=True makes get_json() return None instead of raising when the
    # body is missing or is not valid JSON, so bad payloads are handled here.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        # A null, array or non-JSON body used to fail on .get() with a 500.
        return {"error": "Request body must be a JSON object."}, 400

    system_message = "You are an AI assistant that generates predictive insights."
    # A missing "message" key still falls back to an empty string, as before.
    response = generateModelResponse(payload.get("message", ""), system_message)
    return {"Predictive insights": response}

In [212]:
@app.post("/sentiment_analysis/")
def sentiment_analysis():
    """Run sentiment analysis on the ``message`` field of a JSON body.

    Returns:
        ``{"User sentiment": <model text>}`` on success, or a JSON error
        payload with HTTP 400 when the body is not a JSON object.
    """
    # silent=True makes get_json() return None instead of raising when the
    # body is missing or is not valid JSON, so bad payloads are handled here.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        # A null, array or non-JSON body used to fail on .get() with a 500.
        return {"error": "Request body must be a JSON object."}, 400

    system_message = "You are an AI assistant that performs sentiment analysis."
    # A missing "message" key still falls back to an empty string, as before.
    response = generateModelResponse(payload.get("message", ""), system_message)
    return {"User sentiment": response}

In [213]:
@app.post("/personalized_recommendation/")
def personalized_recommendation():
    """Produce personalized recommendations for the ``message`` field of a JSON body.

    Returns:
        ``{"recommendation": <model text>}`` on success, or a JSON error
        payload with HTTP 400 when the body is not a JSON object.
    """
    # silent=True makes get_json() return None instead of raising when the
    # body is missing or is not valid JSON, so bad payloads are handled here.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        # A null, array or non-JSON body used to fail on .get() with a 500.
        return {"error": "Request body must be a JSON object."}, 400

    system_message = "You are an AI assistant that provide personalized recommendations."
    # A missing "message" key still falls back to an empty string, as before.
    response = generateModelResponse(payload.get("message", ""), system_message)
    return {"recommendation": response}

In [214]:
@app.post("/risk_assessment/")
def risk_assessment():
    """Produce a risk assessment for the ``message`` field of a JSON body.

    Returns:
        ``{"Risk_assessment": <model text>}`` on success, or a JSON error
        payload with HTTP 400 when the body is not a JSON object.
    """
    # silent=True makes get_json() return None instead of raising when the
    # body is missing or is not valid JSON, so bad payloads are handled here.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        # A null, array or non-JSON body used to fail on .get() with a 500.
        return {"error": "Request body must be a JSON object."}, 400

    system_message = "You are an AI assistant that performs risk assessments."
    # A missing "message" key still falls back to an empty string, as before.
    response = generateModelResponse(payload.get("message", ""), system_message)
    return {"Risk_assessment": response}

In [215]:
@app.get("/")
def root():
    """Landing endpoint: return a one-line description of this API."""
    description = "AI-driven API for predictive insights, personalization, sentiment analysis, and risk assessment"
    return {"message": description}

In [None]:
# Open a tunnel on the port Flask is running on (default 5000)
# The tunnel is opened before app.run() so its public URL is printed ahead of the server logs.
public_url = ngrok.connect(5000)
print(" * Tunnel URL:", public_url)

# In a notebook kernel __name__ is "__main__", so this guard passes and the server starts here.
# NOTE(review): run_with_ngrok(app) was already applied to this app in an earlier cell, in
# addition to the explicit ngrok.connect above — confirm that both are needed.
if __name__ == "__main__":
    app.run()



 * Tunnel URL: NgrokTunnel: "https://ba0e-34-126-111-206.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


 * Running on http://ba0e-34-126-111-206.ngrok-free.app
 * Traffic stats available on http://127.0.0.1:4040


Llama.generate: prefix-match hit

llama_print_timings:        load time =     383.23 ms
llama_print_timings:      sample time =      71.90 ms /   126 runs   (    0.57 ms per token,  1752.48 tokens per second)
llama_print_timings: prompt eval time =     246.44 ms /    68 tokens (    3.62 ms per token,   275.93 tokens per second)
llama_print_timings:        eval time =    3440.26 ms /   125 runs   (   27.52 ms per token,    36.33 tokens per second)
llama_print_timings:       total time =    4082.77 ms /   193 tokens
INFO:werkzeug:127.0.0.1 - - [25/Mar/2025 20:18:52] "POST /predictive_insights/ HTTP/1.1" 200 -
Llama.generate: prefix-match hit

llama_print_timings:        load time =     383.23 ms
llama_print_timings:      sample time =      76.54 ms /   122 runs   (    0.63 ms per token,  1593.88 tokens per second)
llama_print_timings: prompt eval time =     232.23 ms /    52 tokens (    4.47 ms per token,   223.91 tokens per second)
llama_print_timings:        eval time =    3332.73 ms /