# Training Steps

- Train on language translation
- Train on framework translation

## Language Translation

In [1]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, CodeLlamaTokenizer, CodeLlamaForSeq2SeqLM
from datasets import Dataset

# Check if CUDA is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the dataset
data_path = './data/raw/samples_dart_javascript.csv'
df = pd.read_csv(data_path)

# Prepare the dataset for the Hugging Face `datasets` library
df['translation'] = df.apply(lambda row: {'dart': row['dart'], 'javascript': row['javascript']}, axis=1)
dataset = Dataset.from_pandas(df[['translation']])

# Function to preprocess the data
def preprocess_function(examples):
    inputs = [f"translate: {ex['dart']}" for ex in examples['translation']]
    targets = [ex['javascript'] for ex in examples['translation']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length").input_ids

    model_inputs['labels'] = labels
    return model_inputs

# Load the tokenizer and model
tokenizer = CodeLlamaTokenizer.from_pretrained("facebook/incoder-1B")
model = CodeLlamaForSeq2SeqLM.from_pretrained("facebook/incoder-1B")

# Preprocess the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,
    fp16=torch.cuda.is_available(),
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Check if the Trainer is using the right device
print(f"Trainer device: {trainer.args.device}")

ImportError: cannot import name 'CodeLlamaForSeq2SeqLM' from 'transformers' (c:\Users\Ayush\anaconda3\Lib\site-packages\transformers\__init__.py)

In [14]:
# Fine-tune the model
trainer.train()

# Save the model
model.save_pretrained('./models/language-fine-tuned-model')
tokenizer.save_pretrained('./models/language-fine-tuned-model')

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/53 [00:00<?, ?it/s]

{'eval_loss': 0.19254176318645477, 'eval_runtime': 428.9199, 'eval_samples_per_second': 0.494, 'eval_steps_per_second': 0.124, 'epoch': 1.0}


  0%|          | 0/53 [00:00<?, ?it/s]

{'eval_loss': 0.0953802615404129, 'eval_runtime': 541.6172, 'eval_samples_per_second': 0.391, 'eval_steps_per_second': 0.098, 'epoch': 2.0}


  0%|          | 0/53 [00:00<?, ?it/s]

{'eval_loss': 0.07500644028186798, 'eval_runtime': 602.5238, 'eval_samples_per_second': 0.352, 'eval_steps_per_second': 0.088, 'epoch': 3.0}
{'train_runtime': 4876.6808, 'train_samples_per_second': 0.13, 'train_steps_per_second': 0.033, 'train_loss': 0.4858616762940989, 'epoch': 3.0}


('./models/fine-tuned-model\\tokenizer_config.json',
 './models/fine-tuned-model\\special_tokens_map.json',
 './models/fine-tuned-model\\vocab.json',
 './models/fine-tuned-model\\merges.txt',
 './models/fine-tuned-model\\added_tokens.json')

In [15]:
# Test the fine-tuned model with a sample translation
code = df['translation'][3]['dart']
input_text = f"translate: {code}"
print(f"Input code:\n{code}")
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Generate the translation
with torch.no_grad():
    output_ids = model.generate(input_ids, max_length=512)

# Decode the output
translated_code = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(f"Translated code:\n{translated_code}")

Input code:
  double findMedianSortedArrays(List<int> nums1, List<int> nums2) {
    List<int> merged = [];
    int i = 0, j = 0;
  
    while (i < nums1.length && j < nums2.length) {
      if (nums1[i] < nums2[j]) {
        merged.add(nums1[i++]);
      } else {
        merged.add(nums2[j++]);
      }
    }
  
    while (i < nums1.length) {
      merged.add(nums1[i++]);
    }
  
    while (j < nums2.length) {
      merged.add(nums2[j++]);
    }
  
    int n = merged.length;
    if (n % 2 == 0) {
      return (merged[n ~/ 2 - 1] + merged[n ~/ 2]) / 2;
    } else {
      return merged[n ~/ 2].toDouble();
    }
  }
  
Translated code:
  var findMedianSortedArrays = function(nums1, nums2) {
    let merged = [];
    let i = 0, j = 0;
  
    while (i < nums1.length && j < nums2.length) {
      if (nums1[i] < nums2[j]) {
        merged.push(nums1[i++]);
      } else {
        merged.push(nums2[j++]);
      }
    }
  
    while (i < nums1.length) {
      merged.push(nums1[i]);
    }
  
    whi

### Load Language Model

In [22]:
# Load the saved model
m = T5ForConditionalGeneration.from_pretrained('./models/language-fine-tuned-model').to(device)
t = RobertaTokenizer.from_pretrained('./models/language-fine-tuned-model')

T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

## Framework Translation

In [16]:
# Load the dataset
data_path = './data/raw/samples_flutter_react-native.csv'
df = pd.read_csv(data_path)

# Prepare the dataset for the Hugging Face `datasets` library
df['translation'] = df.apply(lambda row: {'dart': row['flutter'], 'javascript': row['react-native']}, axis=1)
dataset = Dataset.from_pandas(df[['translation']])

# Preprocess the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,
    fp16=torch.cuda.is_available(),
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Check if the Trainer is using the right device
print(f"Trainer device: {trainer.args.device}")

Map:   0%|          | 0/35 [00:00<?, ? examples/s]



Trainer device: cuda:0


In [17]:
# Fine-tune the model
trainer.train()

# Save the model
model.save_pretrained('./models/framework-fine-tuned-model')
tokenizer.save_pretrained('./models/framework-fine-tuned-model')

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

{'eval_loss': 1.8296769857406616, 'eval_runtime': 26.5886, 'eval_samples_per_second': 1.316, 'eval_steps_per_second': 0.338, 'epoch': 1.0}


  0%|          | 0/9 [00:00<?, ?it/s]

{'eval_loss': 1.5303112268447876, 'eval_runtime': 32.1043, 'eval_samples_per_second': 1.09, 'eval_steps_per_second': 0.28, 'epoch': 2.0}


  0%|          | 0/9 [00:00<?, ?it/s]

{'eval_loss': 1.434829592704773, 'eval_runtime': 19.3871, 'eval_samples_per_second': 1.805, 'eval_steps_per_second': 0.464, 'epoch': 3.0}
{'train_runtime': 465.7138, 'train_samples_per_second': 0.225, 'train_steps_per_second': 0.058, 'train_loss': 2.1653288382071034, 'epoch': 3.0}


('./models/framework-fine-tuned-model\\tokenizer_config.json',
 './models/framework-fine-tuned-model\\special_tokens_map.json',
 './models/framework-fine-tuned-model\\vocab.json',
 './models/framework-fine-tuned-model\\merges.txt',
 './models/framework-fine-tuned-model\\added_tokens.json')

In [20]:
# Test the fine-tuned model with a sample translation
code = df['translation'][8]['dart']
input_text = f"translate: {code}"
print(f"Input code:\n{code}")
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Generate the translation
with torch.no_grad():
    output_ids = model.generate(input_ids, max_length=512)

# Decode the output
translated_code = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(f"Translated code:\n{translated_code}")

Input code:
  import 'package:flutter/material.dart';
  import 'package:image_picker/image_picker.dart';
  
  void main() => runApp(MyApp());
  
  class MyApp extends StatelessWidget {
    @override
    Widget build(BuildContext context) {
      return MaterialApp(
        home: CameraScreen(),
      );
    }
  }
  
  class CameraScreen extends StatefulWidget {
    @override
    _CameraScreenState createState() => _CameraScreenState();
  }
  
  class _CameraScreenState extends State<CameraScreen> {
    final ImagePicker _picker = ImagePicker();
  
    void _openCamera() async {
      final pickedFile = await _picker.pickImage(source: ImageSource.camera);
      if (pickedFile != null) {
        // Handle the captured image
        print('Image Path: ${pickedFile.path}');
      }
    }
  
    @override
    Widget build(BuildContext context) {
      return Scaffold(
        appBar: AppBar(
          title: Text('Open Camera Example'),
        ),
        body: Center(
          child: Elev

# Language Server

In [26]:
import asyncio
import json
import subprocess
from pygls.client import JsonRPCClient

class PyrightClient(JsonRPCClient):
    def __init__(self):
        super().__init__()
        self.server_process = None

    async def start_server(self):
        self.server_process = subprocess.Popen(
            ['pyright-langserver', '--stdio'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
        await self.connect(
            self.server_process.stdout,
            self.server_process.stdin
        )

    async def initialize(self):
        initialize_params = {
            'processId': None,
            'rootUri': None,
            'capabilities': {},
            'trace': 'off',
            'workspaceFolders': None
        }
        response = await self.send_request('initialize', initialize_params)
        await self.send_notification('initialized', {})
        return response

    async def get_completion(self, text, line, character):
        completion_params = {
            'textDocument': {'uri': 'file:///dummy.py'},
            'position': {'line': line, 'character': character},
        }
        response = await self.send_request('textDocument/completion', completion_params)
        return response

    async def shutdown(self):
        await self.send_request('shutdown', {})
        await self.send_notification('exit', {})

async def main():
    client = PyrightClient()
    await client.start_server()
    await client.initialize()

    code = "import os\nos."
    lines = code.split('\n')
    line = len(lines) - 1
    character = len(lines[-1])

    completions = await client.get_completion(code, line, character)
    print(json.dumps(completions, indent=2))

    await client.shutdown()


In [30]:
asyncio.run(main())

RuntimeError: asyncio.run() cannot be called from a running event loop