Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fencing a failed primary #49

Merged
merged 27 commits into from
Jan 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
cfd29e2
Zombie lock
davissp14 Jan 27, 2023
90f56be
Zombie logic
davissp14 Jan 28, 2023
1b04b52
Cleanup
davissp14 Jan 28, 2023
00a743d
Cleanup
davissp14 Jan 28, 2023
060c98e
This isn't used
davissp14 Jan 28, 2023
0509c2d
Bug fix, but need to clean this up
davissp14 Jan 28, 2023
bfc7f1f
CLeanup
davissp14 Jan 28, 2023
d088e4e
cleanup
davissp14 Jan 28, 2023
7e7f2a5
Bug fix and cleanup
davissp14 Jan 28, 2023
de06a36
More cleanup
davissp14 Jan 28, 2023
cb50b5a
zombie role should be a failing state
davissp14 Jan 28, 2023
0980fa8
Add some better logs for when we are unable to reconverge with a heal…
davissp14 Jan 28, 2023
cbaf83e
Cleaning up comments and logs
davissp14 Jan 28, 2023
3842491
Sleep for 2 minutes to give users opportunity to clear the zombie lock
davissp14 Jan 28, 2023
211db0d
Fix comment
davissp14 Jan 28, 2023
53201ca
Simplify diagnosis logic
davissp14 Jan 28, 2023
07b7441
Fix comments
davissp14 Jan 28, 2023
e082977
fix test reference
davissp14 Jan 28, 2023
59d2f97
Manual restart is no longer required.
davissp14 Jan 28, 2023
7c937f8
Updates based on feedback
davissp14 Jan 29, 2023
42bb779
rename
davissp14 Jan 29, 2023
93b3d7e
Fix log
davissp14 Jan 29, 2023
e355651
PG needs to boot even when a Zombie so manual intervention is possible.
davissp14 Jan 29, 2023
243e2cf
Minor cleanup
davissp14 Jan 29, 2023
b39d479
rounding out some edges
davissp14 Jan 30, 2023
104e878
Account of primary coming up in the middle of a failover
davissp14 Jan 30, 2023
fdc2d32
Fixed permission issue with pgbouncer and logging events to file for …
davissp14 Jan 30, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 71 additions & 10 deletions cmd/event_handler/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ import (
"context"
"flag"
"fmt"
"os"
"strconv"
"time"

"github.com/fly-apps/postgres-flex/pkg/flypg"
)
Expand All @@ -21,27 +23,87 @@ func main() {
details := flag.String("details", "", "details")
flag.Parse()

fmt.Printf("Event: %s\n Node: %d\n Success: %s\n Details: %s\n",
*event, *nodeID, *success, *details)
eventDetails := fmt.Sprintf("%s - Event: %s\n Node: %d\n Success: %s\n Details: %s\n", time.Now().String(), *event, *nodeID, *success, *details)

// TODO - Use an actual logging framework instead of just writing strings to a file.
logFile, err := os.OpenFile("/data/event.log", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
if err != nil {
fmt.Printf("failed to open event log: %s", err)
}
defer logFile.Close()

logFile.WriteString(eventDetails)

switch *event {

case "repmgrd_failover_promote", "standby_promote":
// TODO - Need to figure out what to do when success == 0.
if err := reconfigurePGBouncer(*nodeID); err != nil {
fmt.Println(err.Error())
return

retry := 0
maxRetries := 5
success := false

for retry < maxRetries {
if err := reconfigurePGBouncer(*nodeID); err != nil {
errMsg := fmt.Sprintf("%s [%s] attempt: %d - failed to reconfigure pgbouncer: %s\n", *event, time.Now().String(), retry, err)
logFile.WriteString(errMsg)

retry++
time.Sleep(1 * time.Second)
continue
}

success = true
break
}

if success {
msg := fmt.Sprintf("%s [%s] Successfully reconfigured pgBouncer to %d\n", *event, time.Now().String(), *nodeID)
logFile.WriteString(msg)
os.Exit(0)
} else {
msg := fmt.Sprintf(" %s [%s] Failed ot reconfigured pgBouncer to %d\n", *event, time.Now().String(), *nodeID)
logFile.WriteString(msg)
os.Exit(1)
}

case "standby_follow":

newMemberID, err := strconv.Atoi(*newPrimary)
if err != nil {
fmt.Printf("failed to parse new member id: %s", err)
errMsg := fmt.Sprintf("failed to parse newMemberID %s: %s\n", *newPrimary, err)
logFile.WriteString(errMsg)
os.Exit(1)
}

if err := reconfigurePGBouncer(newMemberID); err != nil {
fmt.Println(err.Error())
return
retry := 0
maxRetries := 5
success := false

for retry < maxRetries {
if err := reconfigurePGBouncer(*&newMemberID); err != nil {
errMsg := fmt.Sprintf("%s [%s] attempt: %d - failed to reconfigure pgbouncer: %s\n", *event, time.Now().String(), retry, err)
logFile.WriteString(errMsg)

retry++
time.Sleep(1 * time.Second)
continue
}

success = true
break
}

if success {
msg := fmt.Sprintf("%s [%s] Successfully reconfigured pgBouncer to %d\n", *event, time.Now().String(), newMemberID)
logFile.WriteString(msg)
os.Exit(0)
} else {
msg := fmt.Sprintf(" %s [%s] Failed ot reconfigured pgBouncer to %d\n", *event, time.Now().String(), newMemberID)
logFile.WriteString(msg)
os.Exit(1)
}

default:
// noop
}
Expand All @@ -63,7 +125,6 @@ func reconfigurePGBouncer(id int) error {
return err
}

fmt.Println("Reconfiguring pgbouncer primary")
if err := node.PGBouncer.ConfigurePrimary(context.TODO(), member.Hostname, true); err != nil {
return fmt.Errorf("failed to reconfigure pgbouncer primary %s", err)
}
Expand Down
6 changes: 4 additions & 2 deletions cmd/start/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,20 @@ func main() {
node, err := flypg.NewNode()
if err != nil {
panicHandler(err)
return
}

ctx := context.Background()

if err = node.Init(ctx); err != nil {
panicHandler(err)
return
}

go func() {
t := time.NewTicker(1 * time.Second)
defer t.Stop()
for range t.C {

if err := node.PostInit(ctx); err != nil {
fmt.Printf("failed post-init: %s. Retrying...\n", err)
continue
Expand All @@ -44,7 +45,8 @@ func main() {
}()

svisor := supervisor.New("flypg", 5*time.Minute)
svisor.AddProcess("flypg", fmt.Sprintf("gosu postgres postgres -D %s -p %d", node.DataDir, node.Port))

svisor.AddProcess("postgres", fmt.Sprintf("gosu postgres postgres -D %s -p %d", node.DataDir, node.Port))

svisor.AddProcess("pgbouncer", fmt.Sprintf("pgbouncer %s/pgbouncer.ini", node.PGBouncer.ConfigPath),
supervisor.WithRestart(0, 1*time.Second),
Expand Down
12 changes: 11 additions & 1 deletion pkg/flycheck/role.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package flycheck

import (
"context"
"fmt"

"github.com/fly-apps/postgres-flex/pkg/flypg"
"github.com/pkg/errors"
Expand All @@ -26,14 +27,23 @@ func PostgreSQLRole(ctx context.Context, checks *check.CheckSuite) (*check.Check
}

checks.AddCheck("role", func() (string, error) {
if flypg.ZombieLockExists() {
return "zombie", fmt.Errorf("member is in a zombie state. see logs for more details")
}

member, err := node.RepMgr.Member(ctx, conn)
if err != nil {
return "failed", errors.Wrap(err, "failed to check role")
}

switch member.Role {
case flypg.PrimaryRoleName:
return "primary", nil
if member.Active {
return "primary", nil
} else {
return "zombie", nil
}

case flypg.StandbyRoleName:
return "replica", nil
default:
Expand Down
42 changes: 42 additions & 0 deletions pkg/flypg/admin/admin.go
Original file line number Diff line number Diff line change
Expand Up @@ -330,3 +330,45 @@ func GetSetting(ctx context.Context, pg *pgx.Conn, setting string) (*PGSetting,
}
return &out, nil
}

func SetReadOnly(ctx context.Context, conn *pgx.Conn) error {
databases, err := ListDatabases(ctx, conn)
if err != nil {
return err
}

for _, db := range databases {
if db.Name == "repmgr" || db.Name == "postgres" {
continue
}

sql := fmt.Sprintf("ALTER DATABASE %s set default_transaction_read_only = true;", db.Name)
_, err := conn.Exec(ctx, sql)
if err != nil {
return err
}
}

return nil
}

func UnsetReadOnly(ctx context.Context, conn *pgx.Conn) error {
databases, err := ListDatabases(ctx, conn)
if err != nil {
return err
}

for _, db := range databases {
if db.Name == "repmgr" || db.Name == "postgres" {
continue
}

sql := fmt.Sprintf("ALTER DATABASE %s set default_transaction_read_only = false;", db.Name)
_, err := conn.Exec(ctx, sql)
if err != nil {
return err
}
}

return nil
}
Loading