In [247]:
import numpy as np

In [248]:
# XOR truth table: each row of X is one input pair, y holds the matching label.
X = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]], dtype=float)
y = np.array([0., 1., 1., 0.], dtype=float).reshape(-1, 1)   # column vector, shape (4, 1)
m = X.shape[0]   # number of samples
d = X.shape[1]   # input dimension
print(f"X shape: {X.shape}, y shape: {y.shape}")

X shape: (4, 2), y shape: (4, 1)


In [249]:
# Hand-picked parameters of a single-hidden-layer ReLU network (width 2)
# that implements XOR exactly: output = v . relu(W x + b).
W = np.array([[1.0, -1.0], [-1.0, 1.0]])   # hidden weights
b = np.zeros(2, dtype=float)               # hidden biases
v = np.ones(2, dtype=float)                # output weights

def relu(z):
    """Rectified linear unit: element-wise maximum of 0 and ``z``."""
    zero = 0
    return np.maximum(zero, z)

def xor_relu(x):
    """Evaluate the hand-built ReLU network on a single input vector.

    Computes ``v @ relu(W @ x + b)`` using the module-level parameters
    ``W``, ``b`` and ``v``.

    Parameters
    ----------
    x : array-like
        One input sample; its length must match the number of columns of ``W``.

    Returns
    -------
    numpy scalar
        The network output for ``x``.
    """
    # Apply the activation to the whole pre-activation vector at once, so the
    # function works for any hidden width instead of exactly two units.
    hidden = relu(W @ x + b)
    return v @ hidden

# Evaluate the network on every XOR input.
# The result is stored in `out` rather than `y`, so the label array `y`
# defined with the dataset is not overwritten by a scalar.
for x in X:
    out = xor_relu(x)
    print(f"x={x} -> XOR={out}")

x=[0. 0.] -> XOR=0.0
x=[0. 1.] -> XOR=1.0
x=[1. 0.] -> XOR=1.0
x=[1. 1.] -> XOR=0.0


Conventional Gradient Descent

In [250]:
# Fresh copy of the XOR data for the gradient-descent experiment:
# rows of X are the inputs, y holds the matching labels.
X = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]], dtype=float)
y = np.array([0., 1., 1., 0.], dtype=float).reshape(-1, 1)   # column vector, shape (4, 1)
m = X.shape[0]   # number of samples
d = X.shape[1]   # input dimension
print(f"X shape: {X.shape}, y shape: {y.shape}")

X shape: (4, 2), y shape: (4, 1)


In [251]:
def relu_grad(z):
    """Derivative of ReLU with respect to its input: 1.0 where ``z > 0``, else 0.0."""
    is_active = z > 0
    return is_active.astype(float)

def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    numpy scalar or ndarray
        Element-wise sigmoid with the same shape as ``z``.
    """
    z = np.asarray(z)
    # exp(-|z|) always lies in (0, 1], so it cannot overflow, whereas the
    # naive exp(-z) overflows for large negative z.
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- the same
    # quantity, written so that only exp of a non-positive number is needed.
    out = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Hand scalars back as scalars instead of 0-d arrays.
    return out[()] if out.ndim == 0 else out

# Train a 2-2-1 network (ReLU hidden layer, sigmoid output, no output bias)
# on XOR with full-batch gradient descent.
np.random.seed(0)            # fixed seed for a reproducible initialisation
W1 = np.random.randn(2, 2)   # (hidden_dim, input_dim)
b1 = np.zeros((2,))          # (hidden_dim,)
W2 = np.random.randn(2, 1)   # (hidden_dim, output_dim)

lr = 0.1
eps = 1e-8                   # added inside log() so a saturated prediction cannot give log(0)
for epoch in range(10000):
    # Forward pass
    z1 = X @ W1.T + b1      # shape (4,2)
    h = relu(z1)            # hidden layer
    z2 = h @ W2             # shape (4,1)
    y_pred = sigmoid(z2)    # output

    # Loss (binary cross-entropy)
    loss = -np.mean(y*np.log(y_pred+eps) + (1-y)*np.log(1-y_pred+eps))

    # Backprop: for a sigmoid output with cross-entropy loss the gradient
    # w.r.t. z2 is (y_pred - y), averaged over the m samples.
    dz2 = (y_pred - y) / m   # (4,1)
    dW2 = h.T @ dz2

    dh = dz2 @ W2.T                   # (4,2)
    dz1 = dh * relu_grad(z1)          # (4,2)
    dW1 = dz1.T @ X
    db1 = np.sum(dz1, axis=0)

    # Update
    W2 -= lr * dW2
    W1 -= lr * dW1
    b1 -= lr * db1

    if epoch % 1000 == 0:
        print(f"epoch {epoch}, loss={loss:.4f}")

# Test
print("\nFinal predictions:")
# Per-sample names (x_i, t_i, ...) are used so that the dataset arrays `X`
# and `y` are not rebound by the loop variables.
for x_i, t_i in zip(X, y):
    z1_i = x_i @ W1.T + b1
    h_i = relu(z1_i)
    z2_i = h_i @ W2
    p_i = sigmoid(z2_i)
    print(f"x={x_i}, target={t_i[0]}, pred={p_i[0]:.3f}")

np.set_printoptions(precision=3, suppress=True)

print("\nFinal parameters:")
print("W1 =\n", W1)
print("b1 =", b1)
print("W2 =\n", W2)

epoch 0, loss=0.9195
epoch 1000, loss=0.1859
epoch 2000, loss=0.1779
epoch 3000, loss=0.1759
epoch 4000, loss=0.1751
epoch 5000, loss=0.1747
epoch 6000, loss=0.1743
epoch 7000, loss=0.1742
epoch 8000, loss=0.1740
epoch 9000, loss=0.1739

Final predictions:
x=[0. 0.], target=0.0, pred=0.500
x=[0. 1.], target=1.0, pred=0.999
x=[1. 0.], target=1.0, pred=0.999
x=[1. 1.], target=0.0, pred=0.001

Final parameters:
W1 =
 [[2.246 2.246]
 [3.659 3.659]]
b1 = [-0.022 -3.659]
W2 =
 [[ 3.184]
 [-5.923]]


In [252]:
# Fresh copy of the XOR data for the Leaky-ReLU experiment:
# rows of X are the inputs, y holds the matching labels.
X = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]], dtype=float)
y = np.array([0., 1., 1., 0.], dtype=float).reshape(-1, 1)   # column vector, shape (4, 1)
m = X.shape[0]   # number of samples
d = X.shape[1]   # input dimension
print(f"X shape: {X.shape}, y shape: {y.shape}")

X shape: (4, 2), y shape: (4, 1)


In [253]:
def lrelu(z, a=0.1):
    """Leaky ReLU: ``z`` where ``z > 0`` and ``a * z`` elsewhere."""
    leaked = a * z
    return np.where(z > 0, z, leaked)
def lrelu_grad(z, a=0.1):
    """Slope of the leaky ReLU: 1.0 where ``z > 0`` and ``a`` elsewhere."""
    positive = z > 0
    return np.where(positive, 1.0, a)

# Train a Leaky-ReLU network without an output bias on XOR using full-batch
# gradient descent, stopping early once the loss drops below `tol`.
np.random.seed(0)

hidden_dim = 2                                          # number of hidden units
W1 = np.sqrt(2.0/d) * np.random.randn(hidden_dim, d)    # He-scaled init, (hidden_dim, d)
b1 = np.zeros((hidden_dim,))
W2 = 0.1 * np.random.randn(hidden_dim, 1)               # small init, (hidden_dim, 1)
# The output layer deliberately has no bias term (no b2).

lr = 0.1
max_epochs = 200000
tol = 1e-7
eps = 1e-12   # keeps the logarithms away from log(0)

for epoch in range(max_epochs):
    # Forward pass
    z1 = X @ W1.T + b1            # (4, hidden_dim)
    h = lrelu(z1)                 # (4, hidden_dim)
    z2 = h @ W2                   # (4, 1), no output bias
    y_pred = sigmoid(z2)

    # Binary cross-entropy averaged over the samples
    bce_terms = y*np.log(y_pred+eps) + (1-y)*np.log(1-y_pred+eps)
    loss = -np.mean(bce_terms)

    # Stop as soon as the loss is below the tolerance
    if loss < tol:
        break

    # Backward pass (all gradients are taken before any parameter changes)
    dz2 = (y_pred - y) / m                      # (4, 1)
    dW2 = h.T @ dz2                             # (hidden_dim, 1)
    dz1 = (dz2 @ W2.T) * lrelu_grad(z1)         # (4, hidden_dim)
    dW1 = dz1.T @ X                             # (hidden_dim, d)
    db1 = dz1.sum(axis=0)                       # (hidden_dim,)

    # In-place gradient step on each parameter array
    for param, grad in ((W2, dW2), (W1, dW1), (b1, db1)):
        param -= lr * grad

# Report
np.set_printoptions(precision=6, suppress=True)
print(f"Stopped with loss={loss:.10f}")
print("\nFinal parameters:")
print("W1 =\n", W1)
print("b1 =", b1)
print("W2 =\n", W2)

print("\nPredictions:")
for x_i, t_i in zip(X, y[:, 0]):
    hidden_i = lrelu(x_i @ W1.T + b1)
    p = sigmoid(hidden_i @ W2).item()
    print(f"x={x_i}, target={t_i:.1f}, pred_prob={p:.9f}, pred={int(p>=0.5)}")

Stopped with loss=0.3430678091

Final parameters:
W1 =
 [[1.687636 1.685144]
 [2.376624 2.375448]]
b1 = [-0.006962 -2.37311 ]
W2 =
 [[ 1.55871 ]
 [-3.301613]]

Predictions:
x=[0. 0.], target=0.0, pred_prob=0.686202289, pred=1
x=[0. 1.], target=1.0, pred_prob=0.931379771, pred=1
x=[1. 0.], target=1.0, pred_prob=0.931379771, pred=1
x=[1. 1.], target=0.0, pred_prob=0.068620229, pred=0


Gurobipy

In [254]:
import gurobipy as gp
from gurobipy import GRB

In [255]:
# Fresh copy of the XOR data for the optimisation model; the sample count is
# stored in N here.
X = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]], dtype=float)
y = np.array([0., 1., 1., 0.], dtype=float).reshape(-1, 1)   # column vector, shape (4, 1)
N = X.shape[0]   # number of samples
d = X.shape[1]   # input dimension
print(f"X shape: {X.shape}, y shape: {y.shape}")

X shape: (4, 2), y shape: (4, 1)


In [256]:
# Fit the one-hidden-layer ReLU network (no output bias) to the data exactly,
# formulated as a non-convex mixed-integer quadratically constrained program.
# NOTE: `m` is bound to the Gurobi model from here on; the cells that follow
# read the solution from `m`.

# width of a single hidden layer
l1 = 2

m = gp.Model()

m.Params.NonConvex = 2   # the model contains bilinear (non-convex) constraints

# Bounds
Bw = 10.0     # |W_ij| <= Bw
Bb = 10.0     # |b_i|  <= Bb
Bv = 10.0     # |v_i|  <= Bv

# Bound on the products s_ij = W_ij * v_i.
Ms = Bw * Bv
# Big-M for the ReLU split: |W_i . x_n + b_i| <= Bw * max_n ||x_n||_1 + Bb.
# Taken from the data instead of assuming every |X[n, j]| <= 1; for the XOR
# inputs this equals d*Bw + Bb.
M = Bw * float(np.abs(X).sum(axis=1).max()) + Bb

# Network parameters
W = m.addVars(l1, d, lb=-Bw, ub=Bw, vtype=GRB.CONTINUOUS, name="W")
b = m.addVars(l1,lb=-Bb, ub=Bb, vtype=GRB.CONTINUOUS, name="b")
v = m.addVars(l1, lb=-Bv, ub=Bv, vtype=GRB.CONTINUOUS, name="v")

# Per-sample split of each pre-activation into p - q with p, q >= 0;
# z picks which of the two may be non-zero.
p = m.addVars(N, l1, lb=0.0, vtype=GRB.CONTINUOUS, name="p")
q = m.addVars(N, l1, lb=0.0, vtype=GRB.CONTINUOUS, name="q")
z = m.addVars(N, l1, vtype=GRB.BINARY, name="z")

# s_ij = W_ij * v_i, and t_ij must be 1 whenever s_ij is non-zero.
s = m.addVars(l1, d, lb=-Ms, ub=Ms, vtype=GRB.CONTINUOUS, name="s")
t = m.addVars(l1, d, vtype=GRB.BINARY, name="t")

# Minimise the number of non-zero products W_ij * v_i.
m.setObjective(gp.quicksum(t[i,j] for i in range(l1) for j in range(d)), GRB.MINIMIZE)

# Pre-activation of hidden unit i on sample n equals p - q.
for n in range(N):
    for i in range(l1):
        m.addConstr(
            gp.quicksum(W[i,j] * X[n, j] for j in range(d)) + b[i]
            == p[n, i] - q[n, i],
            name=f"affine_{n}_{i}"
        )

# z = 1 forces p = 0 and z = 0 forces q = 0, so p and q are never both
# positive and p equals relu(pre-activation).
for n in range(N):
    for i in range(l1):
        m.addConstr(p[n,i] <= M * (1 - z[n,i]), name=f"pBigM_{n}_{i}")
        m.addConstr(q[n,i] <= M * z[n,i],       name=f"qBigM_{n}_{i}")

# The network output sum_i v_i * p_ni must reproduce each label exactly.
for n in range(N):
    m.addConstr(
        gp.quicksum(p[n,i] * v[i] for i in range(l1)) == y[n,0],
        name=f"out_{n}"
    )

# Define s and link it to its indicator t: t = 0 forces s = 0.
for i in range(l1):
    for j in range(d):
        m.addConstr(s[i,j] == W[i,j] * v[i],       name=f"s_def_{i}_{j}")
        m.addConstr( s[i,j] <=  Ms * t[i,j],        name=f"s_up_{i}_{j}")
        m.addConstr(-s[i,j] <=  Ms * t[i,j],        name=f"s_lo_{i}_{j}")

m.optimize()

Set parameter NonConvex to value 2
Gurobi Optimizer version 12.0.2 build v12.0.2rc0 (mac64[arm] - Darwin 24.5.0 24F74)

CPU model: Apple M3
Thread count: 8 physical cores, 8 logical processors, using up to 8 threads

Non-default parameters:
NonConvex  2

Optimize a model with 32 rows, 40 columns and 80 nonzeros
Model fingerprint: 0x3f1ab52a
Model has 8 quadratic constraints
Variable types: 28 continuous, 12 integer (12 binary)
Coefficient statistics:
  Matrix range     [1e+00, 1e+02]
  QMatrix range    [1e+00, 1e+00]
  QLMatrix range   [1e+00, 1e+00]
  Objective range  [1e+00, 1e+00]
  Bounds range     [1e+00, 1e+02]
  RHS range        [3e+01, 3e+01]
  QRHS range       [1e+00, 1e+00]
Presolve removed 2 rows and 2 columns
Presolve time: 0.00s
Presolved: 82 rows, 47 columns, 216 nonzeros
Presolved model has 12 bilinear constraint(s)

Solving non-convex MIQCP

Variable types: 35 continuous, 12 integer (12 binary)

Root relaxation: objective 0.000000e+00, 41 iterations, 0.00 seconds (0.00 

In [257]:
# Print the objective and every variable value when the solver status is
# OPTIMAL or SUBOPTIMAL; otherwise only report the status code.
has_solution = m.status in (GRB.OPTIMAL, GRB.SUBOPTIMAL)
if has_solution:
    print(f"Objective value = {m.objVal:g}")
    for var in m.getVars():
        print(f"{var.VarName} = {var.X:g}")
else:
    print(f"Model status: {m.status}")

Objective value = 4
W[0,0] = -0.386436
W[0,1] = -0.386436
W[1,0] = -6.44507
W[1,1] = -6.44507
b[0] = 0.772872
b[1] = 8.59938
v[0] = 5.18628
v[1] = -0.466118
p[0,0] = 0.772872
p[0,1] = 8.59938
p[1,0] = 0.386436
p[1,1] = 2.15431
p[2,0] = 0.386436
p[2,1] = 2.15431
p[3,0] = 0
p[3,1] = 0
q[0,0] = 0
q[0,1] = 0
q[1,0] = 0
q[1,1] = 0
q[2,0] = 0
q[2,1] = 0
q[3,0] = 0
q[3,1] = 4.29075
z[0,0] = 0
z[0,1] = 0
z[1,0] = 0
z[1,1] = 0
z[2,0] = 0
z[2,1] = 0
z[3,0] = 0
z[3,1] = 1
s[0,0] = -2.00416
s[0,1] = -2.00416
s[1,0] = 3.00416
s[1,1] = 3.00416
t[0,0] = 1
t[0,1] = 1
t[1,0] = 1
t[1,1] = 1


In [258]:
# Pull the solution values out of the model and arrange them as numpy arrays.
solW = m.getAttr("X", W); solb = m.getAttr("X", b); solv = m.getAttr("X", v)
solp = m.getAttr("X", p); solq = m.getAttr("X", q); solz = m.getAttr("X", z)
sols = m.getAttr("X", s); solt = m.getAttr("X", t)

W_val = np.array([[solW[i,j] for j in range(d)] for i in range(l1)])
b_val = np.array([solb[i] for i in range(l1)])
v_val = np.array([solv[i] for i in range(l1)])

p_val = np.array([[solp[n,i] for i in range(l1)] for n in range(N)])
q_val = np.array([[solq[n,i] for i in range(l1)] for n in range(N)])
# Binary variables come back as floats that are only integral up to the
# solver's integrality tolerance (e.g. 0.9999999). Round to the nearest
# integer before casting; a plain int cast would truncate such a value to 0.
z_val = np.rint([[solz[n,i] for i in range(l1)] for n in range(N)]).astype(int)

s_val = np.array([[sols[i,j] for j in range(d)] for i in range(l1)])
t_val = np.rint([[solt[i,j] for j in range(d)] for i in range(l1)]).astype(int)

np.set_printoptions(suppress=True, linewidth=120, precision=6)
print("W =\n", W_val)
print("b =", b_val)
print("v =", v_val)
print("p =\n", p_val)
print("q =\n", q_val)
print("z =\n", z_val)
print("s =\n", s_val)
print("t =\n", t_val)

W =
 [[-0.386436 -0.386436]
 [-6.445067 -6.445067]]
b = [0.772872 8.59938 ]
v = [ 5.186281 -0.466118]
p =
 [[0.772872 8.59938 ]
 [0.386436 2.154313]
 [0.386436 2.154313]
 [0.       0.      ]]
q =
 [[0.       0.      ]
 [0.       0.      ]
 [0.       0.      ]
 [0.       4.290755]]
z =
 [[0 0]
 [0 0]
 [0 0]
 [0 1]]
s =
 [[-2.004165 -2.004165]
 [ 3.004165  3.004165]]
t =
 [[1 1]
 [1 1]]
