<a href="https://colab.research.google.com/github/farouq7399/CRUD/blob/master/Copy_of_Semantic_Communication_network_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# System info (optional)
import sys, platform
print("Python:", sys.version)
print("Platform:", platform.platform())

# Core ML libs (CUDA 12.1 wheels if GPU is available)
!pip -q install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# NLP + metrics
!pip -q install bert-score sacrebleu transformers tqdm



Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
Platform: Linux-6.6.105+-x86_64-with-glibc2.35
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [14]:
!pip install w3lib




In [15]:
%cd /content
!rm -rf DeepSC
!git clone https://github.com/13274086/DeepSC
%cd /content/DeepSC

!mkdir -p data outputs checkpoints


/content
Cloning into 'DeepSC'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 20 (delta 0), reused 0 (delta 0), pack-reused 14 (from 2)[K
Receiving objects: 100% (20/20), 16.49 KiB | 1.37 MiB/s, done.
Resolving deltas: 100% (3/3), done.
/content/DeepSC


In [16]:
import json, os

os.makedirs("data", exist_ok=True)

toy_lines = [
    "the weather is nice today",
    "we test the semantic communication model",
    "this dataset validates the pipeline",
    "poisoning can mislead models",
    "defenses reduce backdoor success"
]

# Write toy corpus (repeat to have enough samples)
with open("data/corpus.txt","w",encoding="utf-8") as f:
    for _ in range(500):
        for line in toy_lines:
            f.write(line+"\n")

# Build vocab JSON
tokens = set()
for line in toy_lines:
    tokens.update(line.strip().split())

specials = ["<PAD>", "<START>", "<END>", "<UNK>"]
all_tokens = specials + sorted(tokens)

token_to_idx = {tok: idx for idx, tok in enumerate(all_tokens)}
vocab_json = {"token_to_idx": token_to_idx}

with open("data/vocab.json", "w", encoding="utf-8") as f:
    json.dump(vocab_json, f, indent=2)

print("Wrote data/vocab.json with", len(all_tokens), "tokens")


Wrote data/vocab.json with 26 tokens


In [17]:
# The repo prepends '/import/antennas/Datasets/hx301/' to args.vocab_file.
# Create that path and copy our vocab there.
!mkdir -p /import/antennas/Datasets/hx301
!cp data/vocab.json /import/antennas/Datasets/hx301/vocab.json


In [18]:
%%writefile /content/DeepSC/dataset.py
import torch
from torch.utils.data import Dataset

# Load vocab mapping once
import json
with open("data/vocab.json","r",encoding="utf-8") as f:
    vocab = json.load(f)
token_to_idx = vocab["token_to_idx"]

PAD_IDX = token_to_idx["<PAD>"]
UNK_IDX = token_to_idx["<UNK>"]

def encode_sentence(sent):
    return [token_to_idx.get(tok, UNK_IDX) for tok in sent.split()]

class EurDataset(Dataset):
    def __init__(self, split):
        # Use toy corpus (same for train/val/test in this smoke test)
        with open("data/corpus.txt", "r", encoding="utf-8") as f:
            self.sentences = [line.strip() for line in f if line.strip()]

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        # Return (src, tgt) as identical sentences (autoencoder-style)
        sent = self.sentences[idx]
        return sent, sent

def collate_data(batch):
    # batch: list of (src_sent, tgt_sent)
    src_sents, tgt_sents = zip(*batch)
    src_encoded = [encode_sentence(s) for s in src_sents]
    tgt_encoded = [encode_sentence(s) for s in tgt_sents]

    # Pad to max length within batch
    max_len = max(
        max(len(s) for s in src_encoded),
        max(len(t) for t in tgt_encoded)
    )
    src_padded = [s + [PAD_IDX]*(max_len-len(s)) for s in src_encoded]
    tgt_padded = [t + [PAD_IDX]*(max_len-len(t)) for t in tgt_encoded]

    src_tensor = torch.tensor(src_padded, dtype=torch.long)
    tgt_tensor = torch.tensor(tgt_padded, dtype=torch.long)
    return src_tensor, tgt_tensor


Overwriting /content/DeepSC/dataset.py


In [30]:
!python main.py \
  --epochs 32 \
  --batch-size 32 \
  --channel AWGN \
  --vocab-file vocab.json \
  --checkpoint-path checkpoints/deepsc_clean.pt


Epoch: 1;  Type: Train; Loss: 0.03507: 100% 79/79 [00:11<00:00,  7.07it/s]
Epoch: 1; Type: VAL; Loss: 0.00365: 100% 79/79 [00:02<00:00, 29.50it/s]
Epoch: 2;  Type: Train; Loss: 0.00563: 100% 79/79 [00:10<00:00,  7.25it/s]
Epoch: 2; Type: VAL; Loss: 0.00062: 100% 79/79 [00:02<00:00, 29.55it/s]
Epoch: 3;  Type: Train; Loss: 0.00224: 100% 79/79 [00:11<00:00,  6.85it/s]
Epoch: 3; Type: VAL; Loss: 0.00028: 100% 79/79 [00:02<00:00, 29.62it/s]
Epoch: 4;  Type: Train; Loss: 0.00159: 100% 79/79 [00:10<00:00,  7.23it/s]
Epoch: 4; Type: VAL; Loss: 0.00017: 100% 79/79 [00:02<00:00, 29.92it/s]
Epoch: 5;  Type: Train; Loss: 0.00086: 100% 79/79 [00:10<00:00,  7.25it/s]
Epoch: 5; Type: VAL; Loss: 0.00014: 100% 79/79 [00:02<00:00, 29.77it/s]
Epoch: 6;  Type: Train; Loss: 0.00118: 100% 79/79 [00:10<00:00,  7.23it/s]
Epoch: 6; Type: VAL; Loss: 0.00014: 100% 79/79 [00:02<00:00, 30.08it/s]
Epoch: 7;  Type: Train; Loss: 0.00077: 100% 79/79 [00:10<00:00,  7.24it/s]
Epoch: 7; Type: VAL; Loss: 0.00017: 100% 79

In [31]:
!ls -lh checkpoints/deepsc_clean.pt


total 40M
-rw-r--r-- 1 root root 7.9M Nov 15 18:01 checkpoint_01.pth
-rw-r--r-- 1 root root 7.9M Nov 15 18:01 checkpoint_02.pth
-rw-r--r-- 1 root root 7.9M Nov 15 18:02 checkpoint_03.pth
-rw-r--r-- 1 root root 7.9M Nov 15 18:02 checkpoint_04.pth
-rw-r--r-- 1 root root 7.9M Nov 15 18:02 checkpoint_05.pth


# **phase two start here **

In [32]:
import torch
from dataset import EurDataset, collate_data
from models.transceiver import DeepSC
from utils import val_step
import json

# Load vocab
with open("data/vocab.json", "r") as f:
    vocab = json.load(f)
token_to_idx = vocab["token_to_idx"]
idx_to_token = {v: k for k, v in token_to_idx.items()}
pad_idx = token_to_idx["<PAD>"]

# Load model
checkpoint_path = "checkpoints/deepsc_clean.pt/checkpoint_01.pth"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

deepsc = DeepSC(4, len(token_to_idx), len(token_to_idx),
                len(token_to_idx), len(token_to_idx),
                128, 8, 512, 0.1).to(device)
deepsc.load_state_dict(torch.load(checkpoint_path, map_location=device))
deepsc.eval()

# Prepare test data
test_dataset = EurDataset("test")
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, collate_fn=collate_data)

criterion = torch.nn.CrossEntropyLoss(reduction='none').to(device)

# Run evaluation loop
preds, refs = [], []
with torch.no_grad():
    for src, tgt in test_loader:
        src, tgt = src.to(device), tgt.to(device)
        _ = val_step(deepsc, src, tgt, 0.1, pad_idx, criterion, "AWGN")

        for ref in tgt.tolist():
            refs.append([" ".join(idx_to_token[i] for i in ref if i != pad_idx)])
            preds.append(" ".join(idx_to_token[i] for i in ref if i != pad_idx))


In [33]:
!ls -R checkpoints


checkpoints:
deepsc_clean.pt

checkpoints/deepsc_clean.pt:
checkpoint_01.pth  checkpoint_03.pth  checkpoint_05.pth
checkpoint_02.pth  checkpoint_04.pth


1. BLEU Score — measures n‑gram overlap (syntactic fidelity)

In [34]:
import sacrebleu
bleu = sacrebleu.corpus_bleu(preds, refs)
print("BLEU score:", bleu.score)


BLEU score: 100.00000000000004


2. BERTScore — measures semantic similarity using contextual embeddings

In [35]:
from bert_score import score
P, R, F1 = score(preds, [r[0] for r in refs], lang="en", verbose=True)
print("BERTScore F1:", F1.mean().item())


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/40 [00:00<?, ?it/s]

done in 2.11 seconds, 1184.73 sentences/sec
BERTScore F1: 1.0
