<a href="https://colab.research.google.com/github/chetAnrAo1213/ML-Translation/blob/main/C_To_Cpp_codebert_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Show how the tokenizer splits two small C snippets into sub-word tokens.
for snippet in ("for(i=0;i<n;i++)", 'printf("Hello-world")'):
    print(tokenizer.tokenize(snippet))

['for', '(', 'i', '=', '0', ';', 'i', '<', 'n', ';', 'i', '++)']
['printf', '("', 'Hello', '-', 'world', '")']


In [None]:
import torch
import numpy as np
import math
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split


# 1. Compute device (GPU if available, otherwise CPU)
def get_device():
    """Return the CUDA device when a GPU is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    The encoding table has shape ``(1, max_length, d_model)``; even feature
    columns hold sines and odd columns hold cosines of the scaled position.

    Args:
        d_model: Size of the embedding (last) dimension.
        max_length: Number of positions to precompute.
    """

    def __init__(self, d_model, max_length):
        super().__init__()
        self.d_model = d_model
        # Registered as a buffer so the table follows the module through
        # .to()/.cuda(); non-persistent so it is not written to state_dict
        # and checkpoints saved before this change still load.
        self.register_buffer('pe', self.create_positional_encoding(max_length), persistent=False)

    def create_positional_encoding(self, max_length):
        """Build the ``(1, max_length, d_model)`` sinusoidal table."""
        position = torch.arange(max_length).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, self.d_model, 2) * (-math.log(10000.0) / self.d_model))
        angles = position * div_term
        pe = torch.zeros(max_length, self.d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even columns,
        # so trim the angles to avoid a shape mismatch on assignment.
        pe[:, 1::2] = torch.cos(angles[:, :self.d_model // 2])
        return pe.unsqueeze(0)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        ``x`` is indexed as ``(batch, sequence, d_model)``: the table is sliced
        along dimension 1 and broadcast over dimension 0.
        """
        # .to() is a no-op when the buffer already lives on x's device.
        return x + self.pe[:, :x.size(1)].to(x.device)

class CppTranslationDataset(Dataset):
    """Dataset of (C token ids, C++ token ids) pairs read from a DataFrame.

    Each item is built from the ``'C_Code'`` and ``'CPP_Code'`` columns of the
    row at positional index ``idx``. Both values are converted with ``str``
    and encoded by ``tokenizer.encode`` with padding and truncation to
    ``max_length``.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        encoded = []
        for column in ('C_Code', 'CPP_Code'):
            ids = self.tokenizer.encode(
                str(row[column]),
                padding='max_length',
                truncation=True,
                max_length=self.max_length,
                return_tensors='pt',
            )
            # The tokenizer returns a leading batch dimension of 1; drop it.
            encoded.append(ids.squeeze(0))
        return encoded[0], encoded[1]


# 2. Load Dataset
# Read the paired C / C++ samples and hold out 10% of the rows for validation.
df = pd.read_csv("/content/drive/MyDrive/AI-Ml Translation/c_to_cpp_dataset 75.csv")
train_data, val_data = train_test_split(df, test_size=0.1, random_state=42)


tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
max_length = 128

# Train on at most 5000 rows. The cap is clamped to the size of the training
# split because DataFrame.sample raises ValueError when asked for more rows
# than exist (sampling is without replacement).
train_data = train_data.sample(n=min(5000, len(train_data)), random_state=42)
train_dataset = CppTranslationDataset(train_data, tokenizer, max_length)
val_dataset = CppTranslationDataset(val_data, tokenizer, max_length)

# Only the training batches are shuffled; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

print("Data preprocessing completed.")

# 3. Transformer Model

class TransformerModel(nn.Module):
    """Encoder-decoder Transformer mapping source token ids to target logits.

    Source and target share one embedding table and one positional encoder.
    All Transformer layers are built with ``batch_first=True`` and a dropout
    of 0.2.

    Args:
        vocab_size: Number of token ids; also the size of the output logits.
        d_model: Embedding / hidden dimension.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        ff_dim: Hidden size of each layer's feed-forward sub-network.
        max_length: Maximum sequence length supported by the positional table.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, ff_dim, max_length):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.position_encoder = PositionalEncoding(d_model, max_length)

        # The prototype layers are kept as attributes, as before, so the
        # parameter names exposed by state_dict() do not change.
        self.encoder_layer = nn.TransformerEncoderLayer(d_model, num_heads, ff_dim, dropout=0.2, batch_first=True)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers)

        self.decoder_layer = nn.TransformerDecoderLayer(d_model, num_heads, ff_dim, dropout=0.2, batch_first=True)
        self.decoder = nn.TransformerDecoder(self.decoder_layer, num_layers)

        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        """Return logits of shape ``(batch, tgt_len, vocab_size)``.

        Args:
            src: Source token ids, ``(batch, src_len)``.
            tgt: Target-side input token ids, ``(batch, tgt_len)``.

        A causal mask is applied to the decoder's self-attention so position
        ``i`` can only attend to positions ``<= i``. Without it, teacher-forced
        training lets each position read the very token it is asked to
        predict, which drives the loss toward zero while step-by-step
        generation stays incoherent. Weights trained without the mask should
        be retrained.
        """
        src_emb = self.position_encoder(self.embedding(src))
        tgt_emb = self.position_encoder(self.embedding(tgt))

        # Boolean mask: True marks a (query, key) pair that must NOT attend,
        # i.e. every key strictly after the query position.
        tgt_len = tgt.size(1)
        causal_mask = torch.triu(
            torch.ones(tgt_len, tgt_len, dtype=torch.bool, device=tgt.device),
            diagonal=1,
        )

        memory = self.encoder(src_emb)
        output = self.decoder(tgt_emb, memory, tgt_mask=causal_mask)
        return self.fc_out(output)

# 4. Train
# Model hyper-parameters.
d_model, num_heads, num_layers, ff_dim = 128, 8, 4, 256
vocab_size = len(tokenizer.get_vocab())

transformer_model = TransformerModel(
    vocab_size, d_model, num_heads, num_layers, ff_dim, max_length
).to(get_device())

# Padding positions in the target do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.AdamW(transformer_model.parameters(), lr=1e-4)

print("Training Started")

num_epochs = 10
for epoch in range(num_epochs):
    transformer_model.train()
    total_loss = 0

    for c_tokens, cpp_tokens in train_loader:
        c_tokens = c_tokens.to(get_device())
        cpp_tokens = cpp_tokens.to(get_device())

        # Teacher forcing: the decoder is fed the target without its last
        # token and is scored against the target shifted left by one.
        decoder_input = cpp_tokens[:, :-1].contiguous()
        targets = cpp_tokens[:, 1:].reshape(-1)

        optimizer.zero_grad()
        outputs = transformer_model(c_tokens, decoder_input)

        loss = criterion(outputs.view(-1, outputs.size(-1)), targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean of the per-batch losses for this epoch.
    mean_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {mean_loss:.4f}")

print("Training Completed")

# 5. Model Evaluation
# Switch off dropout and score the held-out split without tracking gradients.
transformer_model.eval()
total_loss = 0

with torch.no_grad():
    for c_tokens, cpp_tokens in val_loader:
        c_tokens = c_tokens.to(get_device())
        cpp_tokens = cpp_tokens.to(get_device())

        # Same input/target shift as in training.
        decoder_input = cpp_tokens[:, :-1].contiguous()
        targets = cpp_tokens[:, 1:].reshape(-1)

        outputs = transformer_model(c_tokens, decoder_input)
        loss = criterion(outputs.view(-1, outputs.size(-1)), targets)
        total_loss += loss.item()

# Mean of the per-batch validation losses.
mean_val_loss = total_loss / len(val_loader)
print(f"Validation Loss: {mean_val_loss:.4f}")


# 6. Translation Function (Beam Search)
def translate_c_to_cpp(c_code, beam_size=5):
    """Translate a C snippet to C++ with beam search over the trained model.

    Uses the module-level ``transformer_model``, ``tokenizer``, ``max_length``
    and ``get_device``. Decoding starts from the tokenizer's CLS token and a
    hypothesis is finished once it emits the SEP token. At every step each
    unfinished hypothesis is extended with its ``beam_size`` most likely next
    tokens, and the ``beam_size`` hypotheses with the highest cumulative
    log-probability are kept. The result is deterministic for a given model;
    ``beam_size=1`` is greedy decoding.

    Args:
        c_code: C source text to translate.
        beam_size: Number of hypotheses kept per step (must be >= 1).

    Returns:
        The decoded text of the best-scoring hypothesis, with special tokens
        removed.

    Raises:
        ValueError: If ``beam_size`` is smaller than 1.
    """
    if beam_size < 1:
        raise ValueError("beam_size must be at least 1")

    device = get_device()
    transformer_model.eval()

    # The source is padded to max_length exactly as the training examples were.
    tokens = tokenizer.encode(c_code, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    tokens = tokens.to(device)

    start_id = tokenizer.cls_token_id
    end_id = tokenizer.sep_token_id

    # Each hypothesis is (cumulative log-probability, token ids, finished flag).
    beams = [(0.0, [start_id], False)]

    with torch.no_grad():
        for _ in range(max_length):
            live = [beam for beam in beams if not beam[2]]
            if not live:
                break

            # All unfinished hypotheses have the same length (they grow by one
            # token per step), so they can be scored as a single batch.
            tgt_input = torch.tensor([ids for _, ids, _ in live], dtype=torch.long, device=device)
            logits = transformer_model(tokens.expand(len(live), -1), tgt_input)
            log_probs = F.log_softmax(logits[:, -1, :], dim=-1)

            # Never ask topk for more entries than the vocabulary holds.
            width = min(beam_size, log_probs.size(-1))
            top_scores, top_ids = torch.topk(log_probs, width, dim=-1)

            # Finished hypotheses keep competing with the new extensions.
            candidates = [beam for beam in beams if beam[2]]
            for (score, ids, _), step_scores, step_ids in zip(live, top_scores.tolist(), top_ids.tolist()):
                for step_score, token_id in zip(step_scores, step_ids):
                    candidates.append((score + step_score, ids + [token_id], token_id == end_id))

            candidates.sort(key=lambda beam: beam[0], reverse=True)
            beams = candidates[:beam_size]

    # beams is sorted best-first; CLS/SEP are stripped by skip_special_tokens.
    best_ids = beams[0][1]
    return tokenizer.decode(best_ids, skip_special_tokens=True)

Data preprocessing completed.
Training Started
Epoch [1/10], Loss: 5.8645
Epoch [2/10], Loss: 1.1070
Epoch [3/10], Loss: 0.2342
Epoch [4/10], Loss: 0.0853
Epoch [5/10], Loss: 0.0453
Epoch [6/10], Loss: 0.0286
Epoch [7/10], Loss: 0.0196
Epoch [8/10], Loss: 0.0142
Epoch [9/10], Loss: 0.0107
Epoch [10/10], Loss: 0.0083
Training Completed
Validation Loss: 0.0034


In [None]:
c_sample = 'printf("Hello World");'
cpp_output = translate_c_to_cpp(c_sample)
print("Generated C++ Code:\n", cpp_output)

Generated C++ Code:
 std::cout << "Hello World std::cout << "(x <( x;
