Skip to content

Commit

Permalink
whisper -> deepgram
Browse files Browse the repository at this point in the history
  • Loading branch information
ckiee committed Jan 21, 2023
1 parent b3e5665 commit c725691
Show file tree
Hide file tree
Showing 7 changed files with 155 additions and 105 deletions.
1 change: 1 addition & 0 deletions .envrc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
eval "$(lorri direnv)"
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,4 @@
go.work

*.pem
.dg-api-key
31 changes: 22 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,24 +1,37 @@
# mumble-whisper-go
OpenAI whisper transcription bot for Mumble.
# ~~mumble-whisper-go~~
# mumble-deepgram-go
~~OpenAI Whisper~~ [Deepgram](https://deepgram.com) transcription bot for Mumble.

## reqs

- [`whisper.cpp`](https://github.com/ggerganov/whisper.cpp/) (i'm on `ac521a566ea6a79ba968c30101140db9f65d187b "ggml : simplify the SIMD code (#324)"`)
- go (i'm on 1.19.3)
- an account on deepgram and an API key from it
- go (i'm on 1.19.4)
- openssl
- (see `go.mod`)

## get started

``` shellsession
# (NixOS: Whisper dependency config)
nix-shell -p
export NIX_LDFLAGS="${NIX_LDFLAGS}-L /home/ckie/git/whisper.cpp -rpath /home/ckie/git/whisper.cpp"
export NIX_CFLAGS_COMPILE="${NIX_CFLAGS_COMPILE}-I /home/ckie/git/whisper.cpp"
# (Generate cert)
$ openssl req -x509 -newkey rsa:4096 -keyout key.pem -out cert.pem -sha256 -days 365 -nodes
# (Run)
$ go mod download
$ go run main.go
$ go run main.go -server mumble.cyberia.club:64738 -certificate cert.pem -key key.pem -insecure -username $USER-bot -apikey $(cat .dg-api-key)
# (✨)
```

## nix
if you're using nix, you can enable the glorious direnv (perhaps in your home-manager config!)
along with lorri, run `direnv allow` and your direnv-compatible editor should pick up on the env
changes and use the gopls binary specified in `shell.nix` -- it also puts openssl and go in PATH.

## why didn't you like whisper

i did kinda like it, but there's a bunch of post-processing to do and whisper.cpp
runs CPU-only, on 4 threads or something.

i saw deepgram in a [zack freedman](https://www.youtube.com/@ZackFreedman/videos) youtube video
and the thing just fucking works, except for raw little-endian 16-bit PCM (`linear16`) being a weird choice.

i hope soon enough we'll have something that fits nicely on a raspi and can do all the things with like, 40% resource util.

4 changes: 4 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,15 @@ module github.com/ckiee/mumble-whisper-go
go 1.19

require (
github.com/Jeffail/gabs/v2 v2.6.1
github.com/deepgram-devs/go-sdk v0.4.0
github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20221224082228-ac521a566ea6
github.com/gorilla/websocket v1.5.0
layeh.com/gumble v0.0.0-20221205141517-d1df60a3cc14
)

require (
github.com/golang/protobuf v1.3.1 // indirect
github.com/google/go-querystring v1.1.0 // indirect
layeh.com/gopus v0.0.0-20161224163843-0ebf989153aa // indirect
)
10 changes: 10 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,14 +1,24 @@
github.com/Jeffail/gabs/v2 v2.6.1 h1:wwbE6nTQTwIMsMxzi6XFQQYRZ6wDc1mSdxoAN+9U4Gk=
github.com/Jeffail/gabs/v2 v2.6.1/go.mod h1:xCn81vdHKxFUuWWAaD5jCTQDNPBMh5pPs9IJ+NcziBI=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/dchote/go-openal v0.0.0-20171116030048-f4a9a141d372/go.mod h1:74z+CYu2/mx4N+mcIS/rsvfAxBPBV9uv8zRAnwyFkdI=
github.com/deepgram-devs/go-sdk v0.4.0 h1:mqiVQV98oVbFdiohRQKfmalBv33Ky4OPWKohwuifxuU=
github.com/deepgram-devs/go-sdk v0.4.0/go.mod h1:jbAJ5T1jDtTGWqOwJY8Grc2EzIgYerL+UHMCZrJniK4=
github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20221224082228-ac521a566ea6 h1:BqEVnbeJee5rKqK6vgUjlL7IPsyFa/6tYXu2yxdjRoM=
github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20221224082228-ac521a566ea6/go.mod h1:QIjZ9OktHFG7p+/m3sMvrAJKKdWrr1fZIK0rM6HZlyo=
github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA=
github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-querystring v1.1.0 h1:AnCroh3fv4ZBgVIf1Iwtovgjaw/GiKJo8M8yD/fhyJ8=
github.com/google/go-querystring v1.1.0/go.mod h1:Kcdr2DB4koayq7X8pmAG4sNG59So17icRSOU623lUBU=
github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc=
github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
layeh.com/gopus v0.0.0-20161224163843-0ebf989153aa h1:WNU4LYsgD2UHxgKgB36mL6iMAMOvr127alafSlgBbiA=
layeh.com/gopus v0.0.0-20161224163843-0ebf989153aa/go.mod h1:AOef7vHz0+v4sWwJnr0jSyHiX/1NgsMoaxl+rEPz/I0=
Expand Down
202 changes: 106 additions & 96 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,116 +4,28 @@ import (
"crypto/tls"
"flag"
"fmt"
whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
"layeh.com/gumble/gumble"
"layeh.com/gumble/gumbleutil"
_ "layeh.com/gumble/opus"
"log"
"net"
"os"
"strconv"
"time"
)

// WhisperAudioListener is the audio listener attached to the gumble config;
// it transcribes incoming voice using a whisper.cpp model.
type WhisperAudioListener struct {
	// modelFile is the path handed to whisper.New when a consumer starts.
	modelFile string
}
// "time"

// OnAudioStream is invoked by gumble when a user's audio stream starts. It
// spawns a goroutine that downsamples the incoming packets 3:1 (the
// mumble:whisper sample rate ratio), accumulates a little over three seconds
// of samples, and hands each frame to a per-stream whisper consumer goroutine.
//
// If handing a frame over takes longer than the frame's own duration, an
// underrun notice is sent to the channel and the following frame(s) are
// dropped so the stream can catch up.
func (al WhisperAudioListener) OnAudioStream(e *gumble.AudioStreamEvent) {
	const (
		ratio       = 3                // mumble:whisper sample rate ratio
		frameSize   = 3                // seconds of audio per whisper frame
		whisperRate = 16000            // samples per second after downsampling
		budgetMs    = frameSize * 1000 // time available to hand over one frame
	)

	fmt.Println("OnAudioStream")
	fmt.Println(*e.User)
	fmt.Print("packet:")

	go func() {
		var samples []float32
		whisperCh := make(chan []float32)
		go al.audioWhisperConsumer(e.Client, whisperCh, e.User.Name)
		last := time.Now()
		drop := 0

		// Block on the channel instead of spinning on select/default, which
		// pinned a CPU core per stream. Ranging also ends the goroutine
		// cleanly should the channel ever be closed.
		for pkt := range e.C {
			start := time.Now()
			// Average each group of `ratio` samples; a trailing partial group
			// is skipped rather than indexed out of bounds.
			for i := 0; i+ratio <= len(pkt.AudioBuffer); i += ratio {
				var s float32
				for j := 0; j < ratio; j++ {
					s += float32(pkt.AudioBuffer[i+j]) / 65535.0
				}
				s /= ratio
				samples = append(samples, s)
			}
			if len(samples) <= whisperRate*frameSize {
				continue
			}

			fmt.Printf("got %d000ms frame in %d ms\n", frameSize, time.Since(last).Milliseconds())
			if drop == 0 {
				whisperCh <- samples // blocks until the consumer is ready
			} else {
				drop--
			}
			// The consumer now owns the old slice; start a fresh one.
			samples = make([]float32, 0, 16000*10)
			last = time.Now()

			tookMs := time.Since(start).Milliseconds()
			if tookMs > budgetMs {
				s := fmt.Sprintf("!!!!!! UNDERRUN: copy took %dms, budget is %ds -- dropping..", tookMs, frameSize)
				e.Client.Self.Channel.Send(s, false)
				fmt.Println(s)
				// Number of whole frames we fell behind by (ms / ms).
				drop += int(tookMs / budgetMs)
			}
			fmt.Printf("copy took %d ms [%d backed up in whisperCh]\n", tookMs, len(whisperCh))
		}
	}()
}

// audioWhisperConsumer loads the whisper model from al.modelFile and then, for
// every frame received on c, runs whisper over it and posts each resulting
// segment to the bot's current channel, prefixed with timing information and
// username.
//
// It panics if the model cannot be loaded. Per-frame failures are reported on
// stderr and the frame is skipped.
func (al WhisperAudioListener) audioWhisperConsumer(client *gumble.Client, c chan []float32, username string) {
	model, err := whisper.New(al.modelFile)
	if err != nil {
		panic(err)
	}
	defer model.Close()

	for frame := range c {
		start := time.Now()
		context, err := model.NewContext()
		if err != nil {
			// Previously this error was overwritten unchecked and the
			// context was used regardless.
			fmt.Fprintf(os.Stderr, "model.NewContext: %s\n", err)
			continue
		}
		context.SetSpeedup(true)
		fmt.Println("context.Process")
		if err := context.Process(frame, nil); err != nil {
			fmt.Fprintf(os.Stderr, "!\ncontext.Process: %s\n", err)
			continue // nothing meaningful to read back after a failed run
		}
		whisperMs := time.Since(start).Milliseconds()
		fmt.Printf("context.Process: done [%d ms]\n", whisperMs)

		// Drain segments until NextSegment reports an error, which is also
		// how the end of the segment list is signalled.
		for {
			segment, err := context.NextSegment()
			if err != nil {
				fmt.Fprintf(os.Stderr, "context.NextSegment: %s\n", err)
				break
			}
			caption := fmt.Sprintf("[%6s->%6s %s; w%d ms] %s\n", segment.Start, segment.End, username, whisperMs, segment.Text)
			client.Self.Channel.Send(caption, false)
			fmt.Println(caption)
		}
	}
}
"github.com/Jeffail/gabs/v2"
"github.com/deepgram-devs/go-sdk/deepgram"
"github.com/gorilla/websocket"
)

// main aids in the creation of a basic command line gumble bot. It accepts the
// following flag arguments:
//
// --server
// --username
// --password
// --insecure
// --certificate
// --key
// --apikey
func main() {
server := flag.String("server", "localhost:64738", "Mumble server address")
username := flag.String("username", "gumble-bot", "client username")
password := flag.String("password", "", "client password")
insecure := flag.Bool("insecure", false, "skip server certificate verification")
certificateFile := flag.String("certificate", "", "user certificate file (PEM)")
keyFile := flag.String("key", "", "user certificate key file (PEM)")
modelFile := flag.String("model", "models/ggml-base.en.bin", "OpenAI whisper.cpp model path")
dgKey := flag.String("apikey", "", "deepgram API key")

if !flag.Parsed() {
flag.Parse()
Expand Down Expand Up @@ -177,7 +89,7 @@ func main() {

},
})
config.AttachAudio(WhisperAudioListener{modelFile: *modelFile})
config.AttachAudio(TranscriptAudioListener{dgApiKey: *dgKey})
_, err = gumble.DialWithDialer(new(net.Dialer), address, config, &tlsConfig)
if err != nil {
fmt.Fprintf(os.Stderr, "%s: %s\n", os.Args[0], err)
Expand All @@ -186,3 +98,101 @@ func main() {

<-keepAlive
}

// TranscriptAudioListener is the audio listener attached to the gumble config;
// it forwards incoming voice to Deepgram for transcription.
type TranscriptAudioListener struct {
	// dgApiKey is the Deepgram API key used to create the Deepgram client.
	dgApiKey string
}

// OnAudioStream is invoked by gumble when a user's audio stream starts. It
// spawns a goroutine that downsamples the user's packets 3:1 (the
// mumble:transcript sample rate ratio), encodes them as little-endian 16-bit
// PCM, and hands frames of just over frameSize bytes to a per-stream
// transcript consumer goroutine.
func (al TranscriptAudioListener) OnAudioStream(e *gumble.AudioStreamEvent) {
	// frameSize is the number of buffered bytes after which a frame is sent.
	// 3200 bytes = 1600 16-bit samples = 100ms at 16kHz. To be tweaked.
	const frameSize = 3200

	fmt.Println("OnAudioStream")
	fmt.Println(*e.User)
	fmt.Print("packet:")

	go func() {
		transcriptCh := make(chan []byte)
		go al.audioTranscriptConsumer(e.Client, transcriptCh, e.User.Name)

		samples := make([]byte, 0, 2*frameSize)
		// Block on the channel instead of spinning on select/default, which
		// pinned a CPU core per stream. Ranging also ends the goroutine
		// cleanly should the channel ever be closed.
		for pkt := range e.C {
			samples = appendLinear16(samples, pkt.AudioBuffer)
			if len(samples) <= frameSize {
				continue
			}
			// TODO: a /very, very/ nice to have would be some speaker detection, so we don't
			//       send near-0 samples all the time and use up a bunch of credits for
			//       each speaker that sent us a single sample even.
			transcriptCh <- samples
			// The consumer now owns the old slice; start a fresh one sized
			// for a single frame rather than ten seconds of audio.
			samples = make([]byte, 0, 2*frameSize)
		}
	}()
}

// appendLinear16 downsamples pcm by a factor of 3, averaging each group of
// three samples, and appends the result to dst as little-endian signed 16-bit
// PCM. A trailing group of fewer than three samples is ignored.
//
// The samples are already signed 16-bit, so they are averaged in integer
// arithmetic and written out directly. The previous float conversion treated
// them as unsigned, which shifted the signal by a large DC offset and halved
// its amplitude.
func appendLinear16(dst []byte, pcm []int16) []byte {
	const ratio = 3
	for i := 0; i+ratio <= len(pcm); i += ratio {
		sum := int32(pcm[i]) + int32(pcm[i+1]) + int32(pcm[i+2])
		s := int16(sum / ratio)
		dst = append(dst, byte(s), byte(s>>8) /*little endian*/)
	}
	return dst
}

// audioTranscriptConsumer opens a Deepgram live-transcription websocket and
// streams every frame received on c to it as a binary message. Frames are raw
// little-endian 16-bit mono PCM at 16kHz, matching the options below.
//
// A second goroutine reads Deepgram's JSON replies and posts each non-empty
// transcript to the bot's current channel, prefixed with speakerName.
//
// Failing to connect, or a read error on a live connection, terminates the
// program via log.Fatal. Unparseable replies and failed writes are logged and
// skipped.
func (al TranscriptAudioListener) audioTranscriptConsumer(client *gumble.Client, c chan []byte /*u8 pairs to make u16's*/, speakerName string) {
	dg := deepgram.NewClient(al.dgApiKey)
	options := deepgram.LiveTranscriptionOptions{
		Language:        "en-US",
		Punctuate:       true,
		Sample_rate:     16000,
		Channels:        1,
		Encoding:        "linear16",
		Interim_results: true, // TODO: ask reese
	}

	dgConn, _, err := dg.LiveTranscription(options)
	if err != nil {
		log.Fatal(err)
	}

	// done tells the reader goroutine that a read error is the result of us
	// closing the connection, not a failure worth aborting the program for.
	done := make(chan struct{})
	defer func() {
		close(done)
		dgConn.Close()
	}()

	go func() {
		for {
			_, message, err := dgConn.ReadMessage()
			if err != nil {
				select {
				case <-done:
					return
				default:
				}
				fmt.Println("ERROR reading message")
				log.Fatal(err)
			}

			fmt.Printf("recv [raw]: %s\n", string(message))
			jsonParsed, err := gabs.ParseJSON(message)
			if err != nil {
				// The old code logged the (nil) read error here and exited.
				log.Printf("recv: skipping unparseable message: %s", err)
				continue
			}
			// Data() yields the decoded string; String() would return the
			// JSON encoding (quoted, or "null" when the path is absent).
			transcript, ok := jsonParsed.Path("channel.alternatives.0.transcript").Data().(string)
			if !ok || transcript == "" {
				continue // metadata message or silence: nothing to post
			}
			log.Printf("recv [transcript]: %s\n", transcript)
			client.Self.Channel.Send(fmt.Sprintf("[%s] %s", speakerName, transcript), false)
		}
	}()

	for frame := range c {
		if err := dgConn.WriteMessage(websocket.BinaryMessage, frame); err != nil {
			log.Printf("sending audio for %s to deepgram: %s", speakerName, err)
		}
	}
}
11 changes: 11 additions & 0 deletions shell.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Development shell: provides the Go toolchain, the gopls language server and
# openssl (used to generate the bot's certificate).
{ pkgs ? import <nixpkgs> {} }:

pkgs.mkShell {
  buildInputs = [
    pkgs.gopls
    pkgs.go
    pkgs.openssl
  ];
}

0 comments on commit c725691

Please sign in to comment.