# In [None]:
import numpy as np
import torch
import torch.nn as nn
from scipy.io.wavfile import write

class AudioLSTM(nn.Module):
    """Stacked LSTM that maps a window of samples to a single predicted value.

    The LSTM runs over the whole window (batch-first layout) and only the
    output of the final time step is projected to one scalar per batch item.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer's hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size=1, hidden_size=128, num_layers=2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Predict one value per batch item from the last time step.

        Args:
            x: Tensor laid out as (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, 1).
        """
        # Allocate the zero initial states from `x` so they inherit both its
        # device and its dtype; a float32 default would break a model that
        # has been cast with .double() / .half().
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)
        out, _ = self.lstm(x, (h0, c0))
        # Keep only the final time step's hidden output.
        return self.fc(out[:, -1, :])

def create_rhythmic_pattern(sample_rate=16000, beat_duration=0.125):
    """Build the four short "beat" waveforms used to compose the rhythm.

    Args:
        sample_rate: Samples per second of the generated waveforms.
        beat_duration: Length of each beat in seconds (default 0.125).

    Returns:
        A list ``[p1, p2, p3, p4]`` of 1-D float arrays, each
        ``int(sample_rate * beat_duration)`` samples long:
        p1 is a 110 Hz decaying sine, p2 a 220 Hz decaying sine, p3 a
        165 Hz decaying sine with a 10 Hz amplitude wobble, and p4 is
        Gaussian noise scaled by 0.1 (drawn from NumPy's global RNG).
    """
    pattern_length = int(sample_rate * beat_duration)
    # Sample instants spaced exactly 1/sample_rate apart. An inclusive
    # linspace would stretch the spacing to duration/(n-1) and detune the
    # tones relative to the rate the audio is later written at.
    t = np.arange(pattern_length) / sample_rate

    p1 = np.sin(2 * np.pi * 110 * t) * np.exp(-t * 8)
    p2 = np.sin(2 * np.pi * 220 * t) * np.exp(-t * 6)
    p3 = np.sin(2 * np.pi * 165 * t) * (1 + 0.3 * np.sin(2 * np.pi * 10 * t)) * np.exp(-t * 7)
    p4 = 0.1 * np.random.randn(pattern_length)

    return [p1, p2, p3, p4]

# Use the GPU when one is available; otherwise fall back to the CPU so the
# script still runs (just slower) on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if device.type == "cuda":
    # Querying the device name is only valid when CUDA is actually present.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Assemble the training signal: a fixed beat order rendered from the four
# pattern waveforms, then lightly corrupted with Gaussian noise.
sample_rate = 16000
beat_patterns = create_rhythmic_pattern(sample_rate)
# One 16-beat bar, played twice; each entry indexes into beat_patterns.
rhythm_sequence = 2 * [
    0, 1, 2, 3,
    0, 1, 2, 3,
    0, 2, 1, 3,
    0, 1, 2, 3,
]
signal = np.concatenate([beat_patterns[beat] for beat in rhythm_sequence])
# Additive noise with standard deviation 0.1 over the whole signal.
signal += 0.1 * np.random.randn(signal.shape[0])

print("Building sequences...")
sequence_length = 200
if len(signal) <= sequence_length:
    raise ValueError(
        f"signal has {len(signal)} samples; need more than "
        f"sequence_length={sequence_length} to build any training pair"
    )

# Convert the signal to float32 once, then take strided windows instead of
# building tens of thousands of NumPy slices in a Python loop (creating a
# tensor from a list of ndarrays is very slow).
signal_tensor = torch.as_tensor(signal, dtype=torch.float32)

# unfold() yields every length-`sequence_length` window at stride 1. The last
# window has no following sample to serve as its target, so it is dropped.
# Row i is signal[i:i + sequence_length]; its target is signal[i + sequence_length].
X = signal_tensor.unfold(0, sequence_length, 1)[:-1].unsqueeze(-1).contiguous().to(device)
y = signal_tensor[sequence_length:].to(device)

print(f"Dataset: {len(X)} sequences")

# Model setup: next-sample regressor trained with MSE and Adam.
model = AudioLSTM(hidden_size=128).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

print("Running...")
batch_size = 256
num_epochs = 20
for epoch in range(num_epochs):
    total_loss = 0
    num_batches = 0
    # Walk the dataset in order, in contiguous mini-batches (no shuffling).
    for i in range(0, len(X), batch_size):
        batch_x = X[i:i+batch_size]
        batch_y = y[i:i+batch_size]

        optimizer.zero_grad()
        # Drop only the trailing size-1 output axis; a bare squeeze() would
        # also collapse the batch axis when the final batch holds one sample.
        outputs = model(batch_x).squeeze(-1)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    if (epoch + 1) % 5 == 0:
        # Average over the batches actually run. len(X) // batch_size
        # undercounts when the last batch is partial and is zero when the
        # dataset is smaller than one batch.
        avg_loss = total_loss / max(num_batches, 1)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.6f}")

print("Creating audio...")
model.eval()

# Seed the autoregressive loop with the first window of the training signal.
generated = [*signal[:sequence_length]]
total_steps = sample_rate * 8
accent_period = sample_rate // 8

with torch.no_grad():
    for step in range(total_steps):
        # Feed the most recent `sequence_length` samples as a (1, seq, 1) batch.
        window = torch.tensor(generated[-sequence_length:], dtype=torch.float32)
        window = window.view(1, -1, 1).to(device)
        sample = model(window).cpu().item()
        # Boost the first 100 samples of every accent period by 20%.
        in_accent = step % accent_period < 100
        generated.append(sample * 1.2 if in_accent else sample)

# Clamp to [-1, 1] and scale to 16-bit PCM before writing the WAV file.
output = np.clip(np.array(generated), -1, 1)
pcm16 = (output * 32767).astype(np.int16)
write("rhythmic_lstm_voice.wav", sample_rate, pcm16)
print("Audio saved: rhythmic_lstm_voice.wav")