In [1]:
# %% Imports
import os, time
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import librosa

In [2]:
# Report what the CUDA runtime exposes to this kernel before any training cell runs.
cuda_ok = torch.cuda.is_available()
print("CUDA available?", cuda_ok)
print("Device count:", torch.cuda.device_count())
if cuda_ok:
    device_label = torch.cuda.get_device_name(0)
else:
    device_label = "CPU"
print("Device name:", device_label)


CUDA available? True
Device count: 1
Device name: NVIDIA A10G


## Naive model 

In [3]:


# ----------------- Config (EFS paths, batch, etc.) -----------------
# Every path below is derived from EFS_ROOT, so relocating the project only
# requires changing this one constant.
EFS_ROOT   = "/mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling"
# Pretrained checkpoint; only its 'layer1.GRU.*' tensors are reused further down.
MODEL_PATH = f"{EFS_ROOT}/models/model9.model"
# Parquet splits. The test split is also the one scored after every epoch.
TRAIN_PATH = f"{EFS_ROOT}/data/demo/train.parquet"
TEST_PATH  = f"{EFS_ROOT}/data/demo/test.parquet"

# Optimisation settings.
BATCH_SIZE   = 64
EPOCHS       = 10
LR           = 1e-3
# Feature extraction settings. At 16 kHz, N_FFT=400 is a 25 ms window and
# HOP_LENGTH=160 is a 10 ms hop, so MAX_LEN=200 frames covers about 2 s of audio.
TARGET_SR    = 16000
N_MELS       = 40
N_FFT        = 400
HOP_LENGTH   = 160
MAX_LEN      = 200     # frames (pad / truncate to this)

# DataLoader settings.
NUM_WORKERS  = 2       # set to 0 if you see multiprocessing/pickling issues
PIN_MEMORY   = True

# ----------------- Model -----------------
class LanNetBinary(nn.Module):
    """GRU encoder followed by a two-layer linear classification head.

    The GRU output at the final time step is projected to ``proj_dim`` and
    then to ``num_classes`` logits. No activation is applied between the two
    linear layers, so the head is a composition of two affine maps.

    Args:
        input_dim: Number of features per frame (F).
        hidden_dim: GRU hidden size (H).
        num_layers: Number of stacked GRU layers.
        proj_dim: Width of the intermediate projection (default 192).
        num_classes: Number of output logits (default 2, i.e. binary).

    Submodule names (``gru``, ``linear2``, ``linear3``) are part of the
    checkpoint format and must not be renamed.
    """

    def __init__(self, input_dim=40, hidden_dim=512, num_layers=2,
                 proj_dim=192, num_classes=2):
        super().__init__()
        self.gru = nn.GRU(input_dim, hidden_dim,
                          num_layers=num_layers, batch_first=True)
        self.linear2 = nn.Linear(hidden_dim, proj_dim)
        self.linear3 = nn.Linear(proj_dim, num_classes)

    def forward(self, x):
        """Map a batch of frame sequences (B, T, F) to logits (B, num_classes)."""
        out, _ = self.gru(x)               # (B, T, H)
        # Only the last time step is used; when the input was padded to a
        # fixed length, this is the state after the padding frames.
        last = out[:, -1, :]               # (B, H)
        hidden = self.linear2(last)        # (B, proj_dim)
        return self.linear3(hidden)        # (B, num_classes)

# ----------------- Feature extraction (from arrays) -----------------
def ensure_mono(y):
    """Return ``y`` as a float32 array, averaging channels away if it is 2-D.

    For 2-D input the shorter axis is taken to be the channel axis: if there
    are fewer rows than columns the rows are averaged, otherwise (including
    square input) the columns are. Input of any other rank is only converted
    to float32.
    """
    wave = np.asarray(y)
    if wave.ndim == 2:
        n_rows, n_cols = wave.shape
        # Best guess at the layout: channels are assumed to be the shorter axis.
        channel_axis = 0 if n_rows < n_cols else 1
        wave = wave.mean(axis=channel_axis)
    return wave.astype(np.float32, copy=False)

def resample_if_needed(y, sr_in, sr_out=TARGET_SR):
    """Resample ``y`` from ``sr_in`` to ``sr_out`` only when the rates differ.

    A ``sr_in`` of ``None`` or ``0`` is treated as "already at ``sr_out``".

    Returns:
        ``(waveform, sr_out)``; the waveform is the input object itself when
        no resampling was required.
    """
    source_sr = sr_in if (sr_in is not None and sr_in != 0) else sr_out
    if source_sr != sr_out:
        y = librosa.resample(y, orig_sr=source_sr, target_sr=sr_out)
    return y, sr_out

def fbanks_from_array(y, sr=TARGET_SR,
                      n_mels=N_MELS, n_fft=N_FFT,
                      hop_length=HOP_LENGTH, max_len=MAX_LEN,
                      pad_value=0.0):
    """Compute a fixed-length log-mel filterbank matrix from a waveform.

    Args:
        y: Mono float waveform already sampled at ``sr``.
        sr: Sampling rate of ``y``.
        n_mels: Number of mel bands (feature dimension).
        n_fft: FFT window length in samples.
        hop_length: Hop between frames in samples.
        max_len: Number of frames in the output; shorter inputs are padded
            at the end, longer inputs are truncated.
        pad_value: Value written into padding frames. The features are in dB,
            where 0.0 corresponds to a power of 1.0 rather than silence. The
            default of 0.0 is kept so features stay identical to those used
            for existing checkpoints; pass e.g. ``fbanks.min()``-like floor
            values when training from scratch.

    Returns:
        ``np.ndarray`` of shape ``(max_len, n_mels)`` and dtype float32.
    """
    # y must be mono, float32, sr == TARGET_SR
    mel = librosa.feature.melspectrogram(y=y, sr=sr,
                                         n_mels=n_mels,
                                         n_fft=n_fft,
                                         hop_length=hop_length,
                                         power=2.0)
    fbanks = librosa.power_to_db(mel).T   # (time, n_mels)
    n_frames = fbanks.shape[0]
    if n_frames < max_len:
        fbanks = np.pad(fbanks, ((0, max_len - n_frames), (0, 0)),
                        mode="constant", constant_values=pad_value)
    else:
        fbanks = fbanks[:max_len, :]
    return fbanks.astype(np.float32)

# ----------------- Dataset -----------------
class FbankArrayDataset(Dataset):
    """
    Map-style dataset that turns DataFrame rows into (features, label) pairs.

    Expects df columns:
      - 'audio': 1D (samples,) or 2D array (mix); dtype numeric
      - optional 'sampling_rate': int; if missing or NaN, assume TARGET_SR
      - label column: either 'label' (0/1 or 'shanghai'/other) or 'dialect'

    Args:
        df: Source DataFrame; its index is reset so positional access is safe.
        label_col: Name of the label column. When None, 'label' is used if
            present, otherwise 'dialect'.

    Raises:
        KeyError: at construction time if 'audio' or the label column is
            missing, instead of failing later inside a DataLoader worker.
    """
    def __init__(self, df, label_col=None):
        self.df = df.reset_index(drop=True)
        if label_col is None:
            self.label_col = "label" if "label" in self.df.columns else "dialect"
        else:
            self.label_col = label_col

        # Fail fast: both columns are read unconditionally in __getitem__.
        missing = [c for c in ("audio", self.label_col) if c not in self.df.columns]
        if missing:
            raise KeyError(f"FbankArrayDataset: missing required column(s): {missing}")

        # Column presence cannot change per row, so check it once here.
        self._has_sr = "sampling_rate" in self.df.columns

    def __len__(self):
        """Number of rows (utterances) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(fbank float32 tensor of shape (MAX_LEN, n_mels), long label)``."""
        row = self.df.iloc[idx]
        y = ensure_mono(row["audio"])

        sr = TARGET_SR
        if self._has_sr and not pd.isna(row["sampling_rate"]):
            sr = row["sampling_rate"]
        y, _ = resample_if_needed(y, int(sr), TARGET_SR)

        x = fbanks_from_array(y, sr=TARGET_SR)          # (MAX_LEN, 40)

        # String labels are binarised as "shanghai" (case-insensitive) vs. rest;
        # anything else is assumed to already be numeric 0/1.
        label = row[self.label_col]
        if isinstance(label, str):
            label_bin = 1 if label.lower() == "shanghai" else 0
        else:
            label_bin = int(label)

        return torch.from_numpy(x), torch.tensor(label_bin, dtype=torch.long)

# ----------------- Load dataframes (from EFS) -----------------
def _with_binary_label(df):
    """Return ``df`` with an integer 'label' column (1 = Shanghai, 0 = other).

    A frame that already has 'label' is returned unchanged. Otherwise the
    label is derived from 'dialect' on a new frame (the input is not mutated).

    Raises:
        ValueError: if the frame has neither 'label' nor 'dialect'.
    """
    if "label" in df.columns:
        return df
    if "dialect" not in df.columns:
        raise ValueError("Expected 'label' or 'dialect' in parquet with audio arrays.")
    return df.assign(label=(df["dialect"].str.lower() == "shanghai").astype(int))


# Each split is normalised on its own, so a schema difference between the
# train and test parquet files is either handled or reported immediately.
train_df = _with_binary_label(pd.read_parquet(TRAIN_PATH))
test_df  = _with_binary_label(pd.read_parquet(TEST_PATH))

# ----------------- Datasets & DataLoaders -----------------
# Both splits read the normalised 'label' column.
train_ds = FbankArrayDataset(df=train_df, label_col="label")
test_ds  = FbankArrayDataset(df=test_df,  label_col="label")

# Loader settings shared by both splits; only shuffling differs.
loader_opts = dict(batch_size=BATCH_SIZE,
                   num_workers=NUM_WORKERS,
                   pin_memory=PIN_MEMORY)
train_loader = DataLoader(train_ds, shuffle=True, **loader_opts)
test_loader  = DataLoader(test_ds, shuffle=False, **loader_opts)

# ----------------- Load pretrained GRU weights (from EFS) -----------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Training on:", device)

binary_model = LanNetBinary().to(device)

# The checkpoint is deserialised onto the CPU; copy_ below moves each tensor
# to wherever the model's own parameters live.
orig_state = torch.load(MODEL_PATH, map_location="cpu")

# Transfer every 'layer1.GRU.*' tensor into the matching 'gru.*' slot,
# skipping names that do not exist here or whose shapes differ.
SRC_PREFIX, DST_PREFIX = "layer1.GRU.", "gru."
own_state = binary_model.state_dict()
copied = 0
with torch.no_grad():
    for src_key, src_tensor in orig_state.items():
        if not src_key.startswith(SRC_PREFIX):
            continue
        dst_key = src_key.replace(SRC_PREFIX, DST_PREFIX)
        if dst_key not in own_state:
            continue
        if own_state[dst_key].shape != src_tensor.shape:
            continue
        own_state[dst_key].copy_(src_tensor)
        copied += 1
print(f"Copied {copied} GRU tensors from pretrained checkpoint.")

# Freeze the GRU so that only the linear head is trained at first.
binary_model.gru.requires_grad_(False)

# ----------------- Optimizer / Loss -----------------
# Only parameters that still require gradients are handed to the optimizer.
trainable_params = [p for p in binary_model.parameters() if p.requires_grad]
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(trainable_params, lr=LR)

# ----------------- Eval helper -----------------
def evaluate(model, loader, device):
    """Return top-1 accuracy of ``model`` over every batch in ``loader``.

    The model is switched to eval mode and left there; gradients are disabled
    while scoring. An empty loader yields 0.0.
    """
    model.eval()
    n_hits = 0
    n_seen = 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)
            predicted = model(inputs).argmax(dim=1)
            n_hits += (predicted == targets).sum().item()
            n_seen += targets.numel()
    # Guard against division by zero when the loader produced nothing.
    return n_hits / (n_seen or 1)

# ----------------- Train loop ------------------
# %% Training with checkpoint saving
def _train_one_epoch(model, loader, loss_fn, opt, dev, desc):
    """Run one optimisation pass over ``loader`` and return the mean per-batch loss."""
    loss_sum = 0.0
    for inputs, targets in tqdm(loader, desc=desc, leave=False):
        inputs = inputs.to(dev, non_blocking=True)
        targets = targets.to(dev, non_blocking=True)

        opt.zero_grad(set_to_none=True)
        batch_loss = loss_fn(model(inputs), targets)
        batch_loss.backward()
        opt.step()

        loss_sum += batch_loss.item()
    return loss_sum / max(len(loader), 1)


checkpoint_dir = f"{EFS_ROOT}/checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)
best_path = os.path.join(checkpoint_dir, "best_model.pth")
best_acc = 0.0

# Per-epoch metrics, read by the plotting cell further down.
history = {"loss": [], "val_acc": [], "epoch_secs": []}

for epoch in range(1, EPOCHS + 1):
    binary_model.train()
    t0 = time.time()

    avg_loss = _train_one_epoch(binary_model, train_loader, criterion, optimizer,
                                device, desc=f"Epoch {epoch}/{EPOCHS}")
    # "val_acc" is measured on test_loader: the test split doubles as the
    # validation set used to pick the best checkpoint.
    val_acc = evaluate(binary_model, test_loader, device)
    dt = time.time() - t0   # covers both training and evaluation

    for key, value in (("loss", avg_loss), ("val_acc", val_acc), ("epoch_secs", dt)):
        history[key].append(value)

    print(f"Epoch {epoch:02d} | loss={avg_loss:.4f} | val_acc={val_acc:.2%} | {dt:.1f}s")

    # ----------------- Save best checkpoint -----------------
    # Ties do not overwrite the file: only a strict improvement is saved.
    if val_acc <= best_acc:
        continue
    best_acc = val_acc
    torch.save(binary_model.state_dict(), best_path)
    print(f"New best model saved at epoch {epoch} (val_acc={val_acc:.2%})")


# (Optional) Unfreeze GRU to fine-tune end-to-end with a smaller LR:
# for p in binary_model.gru.parameters():
#     p.requires_grad = True
# optimizer = optim.Adam(binary_model.parameters(), lr=5e-4)
# ...train a few more epochs...


Training on: cuda
Copied 8 GRU tensors from pretrained checkpoint.


                                                             

Epoch 01 | loss=0.3320 | val_acc=93.75% | 23.2s
New best model saved at epoch 1 (val_acc=93.75%)


                                                             

Epoch 02 | loss=0.2277 | val_acc=90.00% | 12.2s


                                                             

Epoch 03 | loss=0.2166 | val_acc=93.75% | 12.1s


                                                             

Epoch 04 | loss=0.1886 | val_acc=90.00% | 12.2s


                                                             

Epoch 05 | loss=0.1811 | val_acc=91.25% | 12.1s


                                                             

Epoch 06 | loss=0.1860 | val_acc=90.00% | 12.1s


                                                             

Epoch 07 | loss=0.1795 | val_acc=91.25% | 12.1s


                                                             

Epoch 08 | loss=0.1698 | val_acc=91.25% | 12.0s


                                                             

Epoch 09 | loss=0.1677 | val_acc=92.50% | 11.9s


                                                              

Epoch 10 | loss=0.1613 | val_acc=91.25% | 12.1s


## Visualizations

In [None]:
# %% Visualizations: curves + confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np
import torch

# 1) Training curves
# history is indexed from 0 but epochs are numbered from 1, so plot against
# explicit epoch numbers to keep the "Epoch" axis truthful.
epoch_ids = list(range(1, len(history["loss"]) + 1))

fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(epoch_ids, history["loss"], marker="o")
ax_loss.set(xlabel="Epoch", ylabel="Loss", title="Training Loss")

ax_acc.plot(epoch_ids, history["val_acc"], marker="o")
ax_acc.set(xlabel="Epoch", ylabel="Accuracy", title="Validation Accuracy")

fig.tight_layout()
plt.show()

# 2) Confusion matrix on test set
# Note: this scores the model as it stands after the last epoch, not the
# best checkpoint written to disk during training.
all_preds, all_labels = [], []
binary_model.eval()
with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device, non_blocking=True)
        logits = binary_model(xb)
        all_preds.extend(logits.argmax(1).cpu().numpy())
        all_labels.extend(yb.numpy())

labels = ["Not-Shanghai", "Shanghai"]
class_ids = [0, 1]
cm = confusion_matrix(all_labels, all_preds, labels=class_ids)

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=labels, yticklabels=labels, ax=ax)
ax.set(xlabel="Predicted", ylabel="True", title="Confusion Matrix")
plt.show()

print("Classification report:")
# Passing labels= keeps target_names aligned with the class ids even when one
# class is absent from both the predictions and the ground truth.
print(classification_report(all_labels, all_preds, labels=class_ids,
                            target_names=labels, digits=4))


## Export to HF

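Run this cell after training. It reads variables defined earlier in the notebook (paths, config) and exports the model weights, configs, and dataset splits into HF-style folders on EFS.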

In [8]:
# %% Export model + configs + dataset into HF-style folders on EFS
import os, json, shutil
from pathlib import Path

from safetensors.torch import save_file as save_safetensors  # pip install safetensors

# --------- Inputs (edit names) ---------
# Everything is derived from EFS_ROOT so the export stays in sync with the
# training cell if the mount point changes.
MODEL_NAME = "shanghai-binary"
MODEL_DIR_EFS = os.path.join(EFS_ROOT, "hf", "models", MODEL_NAME)
DS_NAME = "shanghai-binary"
DS_DIR_EFS = os.path.join(EFS_ROOT, "hf", "datasets", DS_NAME)
CKPT_DIR = os.path.join(EFS_ROOT, "checkpoints")              # same directory the training loop writes to
BEST_CKPT = os.path.join(CKPT_DIR, "best_model.pth")          # from your training loop
TRAIN_SRC = TRAIN_PATH                                        # parquet you trained on
TEST_SRC = TEST_PATH                                          # parquet you evaluated on

os.makedirs(MODEL_DIR_EFS, exist_ok=True)
os.makedirs(os.path.join(DS_DIR_EFS, "data"), exist_ok=True)

def _write_json(path, payload):
    """Write ``payload`` to ``path`` as indented, UTF-8 encoded JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)


# --------- 1) Save model weights in safetensors (recommended) ---------
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
state_dict = torch.load(BEST_CKPT, map_location="cpu")
weights_path = os.path.join(MODEL_DIR_EFS, "model.safetensors")
save_safetensors(state_dict, weights_path)

# (Optionally also save PyTorch .bin)
# torch.save(state_dict, os.path.join(MODEL_DIR_EFS, "pytorch_model.bin"))

# --------- 2) Save model config (architecture) ---------
# Read the architecture from the exported tensors so config.json cannot drift
# from the weights. nn.GRU stores weight_ih_l0 as (3*hidden, input) and
# weight_hh_l0 as (3*hidden, hidden); nn.Linear stores weight as (out, in).
gru_input_dim = state_dict["gru.weight_ih_l0"].shape[1]
gru_hidden_dim = state_dict["gru.weight_hh_l0"].shape[1]
gru_num_layers = sum(
    1 for key in state_dict
    if key.startswith("gru.weight_ih_l") and not key.endswith("_reverse")
)
proj_dim = state_dict["linear2.weight"].shape[0]
num_labels = state_dict["linear3.weight"].shape[0]

config = {
    "_name_or_path": MODEL_NAME,
    "model_type": "gru-audio-binary",
    "input_dim": gru_input_dim,
    "hidden_dim": gru_hidden_dim,
    "num_layers": gru_num_layers,
    "num_labels": num_labels,
    "classifier_dims": [proj_dim, num_labels],
    "pooling": "last_timestep"
}
_write_json(os.path.join(MODEL_DIR_EFS, "config.json"), config)

# --------- 3) Save preprocessor config (feature extraction settings) ---------
preproc = {
    "feature_type": "fbank",
    "sampling_rate": TARGET_SR,
    "n_mels": N_MELS,
    "n_fft": N_FFT,
    "hop_length": HOP_LENGTH,
    "max_len_frames": MAX_LEN,
    "log_db": True,
    "mono": True
}
_write_json(os.path.join(MODEL_DIR_EFS, "preprocessor_config.json"), preproc)

# --------- 4) Save label mapping ---------
label_map = { "0": "not-shanghai", "1": "shanghai" }
_write_json(os.path.join(MODEL_DIR_EFS, "label_mapping.json"), label_map)

# --------- 5) Minimal model README (model card-style) ---------
# The only placeholder in this f-string is {MODEL_NAME}; the rest is literal
# Markdown. The fenced code block is closed explicitly so any text appended
# later is not swallowed into the code sample.
model_readme = f"""# {MODEL_NAME}

Binary classifier: **Shanghai** vs **Not-Shanghai** (audio FBANK → GRU → MLP).

## Files
- `model.safetensors` — PyTorch weights (safetensors)
- `config.json` — model architecture
- `preprocessor_config.json` — audio feature extraction settings
- `label_mapping.json` — index → label

## Inference (PyTorch)
```python
import torch, json, numpy as np, librosa
from safetensors.torch import load_file as load_safetensors

# Load config
import json, os
model_dir = "./hf/models/{MODEL_NAME}"
cfg = json.load(open(os.path.join(model_dir, "config.json")))
pp  = json.load(open(os.path.join(model_dir, "preprocessor_config.json")))
lm  = json.load(open(os.path.join(model_dir, "label_mapping.json")))

# Define the model class you trained (LanNetBinary)
# (Same as in your training notebook)
class LanNetBinary(torch.nn.Module):
    def __init__(self, input_dim=40, hidden_dim=512, num_layers=2):
        super().__init__()
        self.gru = torch.nn.GRU(input_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.linear2 = torch.nn.Linear(hidden_dim, 192)
        self.linear3 = torch.nn.Linear(192, 2)
    def forward(self, x):
        out, _ = self.gru(x)
        last = out[:, -1, :]
        x = self.linear2(last)
        x = self.linear3(x)
        return x

# Load weights
model = LanNetBinary(cfg["input_dim"], cfg["hidden_dim"], cfg["num_layers"])
sd = load_safetensors(os.path.join(model_dir, "model.safetensors"))
model.load_state_dict(sd, strict=True)
model.eval()

# Feature extraction should match preprocessor_config.json
def fbanks_from_array(y, sr=pp["sampling_rate"], n_mels=pp["n_mels"], n_fft=pp["n_fft"], hop_length=pp["hop_length"], max_len=pp["max_len_frames"]):
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length, power=2.0)
    fbanks = librosa.power_to_db(mel).T
    T = fbanks.shape[0]
    if T < max_len:
        import numpy as np
        fbanks = np.pad(fbanks, ((0, max_len - T), (0, 0)), mode="constant")
    else:
        fbanks = fbanks[:max_len, :]
    return torch.tensor(fbanks, dtype=torch.float32).unsqueeze(0)  # (1, T, F)

# Example: predict from a waveform array "y" at 16kHz
# y, _ = librosa.load("example.wav", sr=pp["sampling_rate"])
# x = fbanks_from_array(y)
# with torch.no_grad():
#     logits = model(x)
#     pred = int(torch.argmax(logits, dim=1))
#     print(lm[str(pred)])
```
"""
# The README contains non-ASCII characters (→, —), so the encoding is pinned
# instead of relying on the platform default.
with open(os.path.join(MODEL_DIR_EFS, "README.md"), "w", encoding="utf-8") as f:
    f.write(model_readme)


# --------- 6) Copy dataset splits & write dataset README ---------
ds_data_dir = os.path.join(DS_DIR_EFS, "data")
shutil.copy2(TRAIN_SRC, os.path.join(ds_data_dir, "train.parquet"))
shutil.copy2(TEST_SRC, os.path.join(ds_data_dir, "test.parquet"))

# Headings and list markers are required for the sections to render as
# separate blocks; the sampling rate is taken from the training config
# rather than repeated as a literal.
ds_readme = f"""# {DS_NAME} dataset

Train/test splits for Shanghai vs Not-Shanghai binary classification.

## Contents
- `data/train.parquet`
- `data/test.parquet`

## Schema
Each row contains:
- `audio`: float array (mono)
- `sampling_rate`: {TARGET_SR}
- `dialect` / `label`: label (Shanghai=1, else 0)
"""

with open(os.path.join(DS_DIR_EFS, "README.md"), "w", encoding="utf-8") as f:
    f.write(ds_readme)

# Reported only after the README file has been written and closed.
print("Export complete:")
print("Model dir:", MODEL_DIR_EFS)
print("Dataset dir:", DS_DIR_EFS)


Export complete:
Model dir: /mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/hf/models/shanghai-binary
Dataset dir: /mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/hf/datasets/shanghai-binary


In [9]:
!du -sh /mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/hf
!du -h --max-depth=1 /mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/hf
!ls -lhR /mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/hf


911M	/mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/hf
9.7M	/mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/hf/models
901M	/mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/hf/datasets
911M	/mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/hf
/mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/hf:
total 8.0K
drwxr-xr-x 3 nobody nogroup 6.0K Sep 15 02:43 datasets
drwxr-xr-x 3 nobody nogroup 6.0K Sep 15 02:43 models

/mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/hf/datasets:
total 4.0K
drwxr-xr-x 3 nobody nogroup 6.0K Sep 15 02:43 shanghai-binary

/mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/hf/datasets/shanghai-binary:
total 8.0K
-rw-r--r-- 1 nobody nogroup  252 Sep 15 02:43 README.md
drwxr-xr-x 2 no