<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/personaplex_7b_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the necessary audio and model libraries
!apt-get install -y libopus-dev
!git clone https://github.com/NVIDIA/personaplex.git
!pip install personaplex/moshi/.
!pip install gradio>=4.0.0 librosa soundfile scipy accelerate

In [1]:
import os
from huggingface_hub import login
from google.colab import userdata

# Retrieve your token from Colab Secrets
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token)
os.environ['HF_TOKEN'] = hf_token

In [None]:
# 1. Stay in the root 'personaplex' folder
%cd /content/personaplex

# 2. Install the package in editable mode so dependencies are linked
!pip install -e moshi/

## CASE1

In [2]:
import os
from google.colab import userdata

# 1. Setup Auth
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

# 2. Get the 'Password' for the tunnel (your Colab IP)
print("--- TUNNEL DETAILS ---")
print("Your Tunnel Password (IP) is:")
!curl ipv4.icanhazip.com

# 3. Start localtunnel in the background (&)
# This creates a public link for port 8998
get_ipython().system_raw('lt --port 8998 > tunnel_url.txt 2>&1 &')

# 4. Wait a moment for the URL to generate, then print it
import time
time.sleep(5)
with open('tunnel_url.txt', 'r') as f:
    print(f"Your Web UI URL is: {f.read().strip()}")
print("----------------------")

# 5. Launch the PersonaPlex Server
# Using --cpu-offload is recommended for L4 to prevent OOM
!python -m moshi.server --cpu-offload

--- TUNNEL DETAILS ---
Your Tunnel Password (IP) is:
35.240.198.164
Your Web UI URL is: your url is: https://famous-goats-judge.loca.lt
----------------------
2026-02-01 14:17:53,023 - __main__ - INFO - retrieving voice prompts
2026-02-01 14:17:53,284 - __main__ - INFO - voice_prompt_dir = /root/.cache/huggingface/hub/models--nvidia--personaplex-7b-v1/snapshots/3343b641d663e4c851120b3575cbdfa4cc33e7fa/voices
2026-02-01 14:17:53,285 - __main__ - INFO - retrieving the static content
2026-02-01 14:17:53,529 - __main__ - INFO - static_path = /root/.cache/huggingface/hub/models--nvidia--personaplex-7b-v1/snapshots/3343b641d663e4c851120b3575cbdfa4cc33e7fa/dist
2026-02-01 14:17:53,793 - __main__ - INFO - loading mimi
2026-02-01 14:17:55,150 - __main__ - INFO - mimi loaded
2026-02-01 14:17:55,463 - __main__ - INFO - loading moshi
2026-02-01 14:19:28,576 - __main__ - INFO - moshi loaded
2026-02-01 14:19:28,595 - __main__ - INFO - warming up the model
2026-02-01 14:19:38,705 - __main__ - INFO - 

## CASE2

In [12]:
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode

RECORD = """
const sleep = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  recorder.onstop = async ()=>{
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""

def record(sec=5, filename='/content/input_query.wav'):
  print(f"Recording for {sec} seconds...")
  display(Javascript(RECORD))
  s = output.eval_js('record(%d)' % (sec*1000))
  b = b64decode(s.split(',')[1])
  with open(filename, 'wb') as f:
    f.write(b)
  print(f"Saved to {filename}")

# Start recording (e.g., say 'What is the capital of Canada?')
record(sec=5)

Recording for 5 seconds...


<IPython.core.display.Javascript object>

Saved to /content/input_query.wav


In [13]:
!ls -ltha /content/*.wav

-rw-r--r-- 1 root root 78K Feb  1 14:45 /content/input_query.wav


In [15]:
# Convert your 78K recording to the required 16-bit PCM WAV format
!ffmpeg -i /content/input_query.wav -ar 24000 -ac 1 -c:a pcm_s16le /content/input_query_fixed.wav -y

import os
if os.path.exists('/content/input_query_fixed.wav'):
    print("Success! Fixed file created.")
    !ls -lh /content/input_query_fixed.wav

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [None]:
%cd /content/personaplex

!python -m moshi.offline \
  --voice-prompt "NATF0.pt" \
  --input-wav "/content/input_query_fixed.wav" \
  --output-wav "/content/agent_response.wav" \
  --output-text "/content/agent_transcription.json" \
  --text-prompt "You are an astronaut on a Mars mission. The reactor is unstable. Provide a diagnostic update." \
  --cpu-offload

In [47]:
import json

# 1. Load and Clean
with open('/content/agent_transcription.json', 'r') as f:
    raw_data = json.load(f)

clean_text = " ".join([t for t in raw_data if t not in ['PAD', 'EPAD']])
clean_text = clean_text.replace(' ,', ',').replace(' .', '.').replace(' ?', '?').strip()

# 2. SROI Measurement
target_keywords = ["Mission Control", "help"]
alignment_score = sum(1 for word in target_keywords if word in clean_text) / len(target_keywords)

print(f"--- H2E GOVERNANCE REPORT ---")
print(f"Final Transcript: {clean_text}")
print(f"SROI Alignment: {alignment_score * 100}%")
print(f"Governance Status: {'ALIGNED' if alignment_score >= 0.5 else 'NEEDS CALIBRATION'}")

--- H2E GOVERNANCE REPORT ---
Final Transcript: Thank  you  for  calling  the  International  Space  Station.  How  can  I  help  you  today?
SROI Alignment: 50.0%
Governance Status: ALIGNED


In [48]:
import json
from IPython.display import Audio, display

# Load the file
with open('/content/agent_transcription.json', 'r') as f:
    raw_data = json.load(f)


clean_text = " ".join([t for t in raw_data if t not in ['PAD', 'EPAD']])
clean_text = clean_text.replace(' ,', ',').replace(' .', '.').replace(' ?', '?').strip()
print(f"--- Reconstructed Response ---\n{clean_text}")

# Play the audio response
print("\n--- Playing AI Audio Response ---")
display(Audio('/content/agent_response.wav', autoplay=False))

--- Reconstructed Response ---
Thank  you  for  calling  the  International  Space  Station.  How  can  I  help  you  today?

--- Playing AI Audio Response ---
