In [1]:
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
import time
from pathlib import Path

# Function to load and preprocess data
def load_data(gsheet_id, sheet_name):
    """Load data from Google Sheets and preprocess it"""
    gsheet_url = f"https://docs.google.com/spreadsheets/d/{gsheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
    try:
        df = pd.read_csv(gsheet_url)
        df = df.iloc[:, 0:3]  # Keep only first 3 columns
        print(df)
        
        # Clean text content
        df['content'] = (df['content'].str.lower()
                        .str.replace('\r\n', ' ', regex=False)
                        .str.replace('\t', ' ', regex=False)
                        .str.replace('\n', ' ', regex=False)
                        .str.replace('       ', ' ', regex=False)
                        .str.strip())
        print(df)
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

# Custom Dataset class for PyTorch
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=256):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encodings = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return encodings.input_ids.squeeze(), encodings.attention_mask.squeeze()


def test_generation(model, tokenizer, prompts):
    """Test model generation with given prompts"""
    print("\nTesting generation...")

    # Ensure model is on the correct device (GPU in this case)
    device = model.device
    
    # Set or add padding token if not already set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # Or add '[PAD]' as pad token

    for prompt in prompts:
        print(f"\nPrompt: {prompt}")
        inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True)
        attention_mask = inputs['attention_mask']
        input_ids = inputs['input_ids']
        
        # Move input_ids and attention_mask to the same device as the model
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        
        # Generate the output
        outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=200, pad_token_id=tokenizer.pad_token_id)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Generated text: {generated_text}")




def create_model():
    """Load GPT-2 model and tokenizer from Hugging Face"""
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    return model, tokenizer

def prepare_dataset(texts, tokenizer, batch_size=8):
    """Prepare dataset for training"""
    dataset = TextDataset(texts, tokenizer)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

def save_model(model, tokenizer, base_path="./classicmalaymodel"):
    """Save model weights and configuration"""
    try:
        # Create directory if it doesn't exist
        Path(base_path).mkdir(parents=True, exist_ok=True)
        
        # Save the model and tokenizer
        model.save_pretrained(base_path)
        tokenizer.save_pretrained(base_path)
        
        print(f"Model and tokenizer saved successfully to {base_path}")
    except Exception as e:
        print(f"Error saving model: {e}")

def main():
    # Configuration
    GSHEET_ID = "191YRBsdUEGtgXvWl428L9xt48Ah5zcyoRK1pvNhUDPM"
    SHEET_NAME = "Sheet1"
    NUM_EPOCHS = 15
    BATCH_SIZE = 8
    LEARNING_RATE = 3e-4
    
    # Test prompts
    test_prompts_base = [
        "My trip to Kuala Lumpur was so annoying"
    ]
    test_prompts_malay = [
        "Raja dan puterinya melangkah"
    ]
    
    # Create model and tokenizer
    print("Creating model and tokenizer...")
    model, tokenizer = create_model()
    
    # Test the base model (before fine-tuning)
    print("\nTesting base model (before fine-tuning):")
    test_generation(model, tokenizer, test_prompts_base)
    
    # Load and preprocess data
    print("\nLoading data for fine-tuning...")
    df = load_data(GSHEET_ID, SHEET_NAME)
    if df is None:
        return
    
    # Prepare dataset
    print("Preparing dataset...")
    train_dl = prepare_dataset(df['content'], tokenizer, batch_size=BATCH_SIZE)
    
    # Prepare optimizer and scheduler
    from torch.optim import AdamW
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

    total_steps = len(train_dl) * NUM_EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    
    # Move model to GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    # Fine-tune model
    print("Starting fine-tuning...")
    start_time = time.time()
    model.train()
    for epoch in range(NUM_EPOCHS):
        for batch in train_dl:
            input_ids, attention_mask = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
        
        print(f"Epoch {epoch+1}/{NUM_EPOCHS} completed with loss: {loss.item()}")
    
    end_time = time.time()
    print(f"Training completed in {end_time - start_time:.2f} seconds")
    
    # Test fine-tuned model
    print("\nTesting fine-tuned model:")
    test_generation(model, tokenizer, test_prompts_malay)
    
    # Save model
    print("\nSaving fine-tuned model...")
    save_model(model, tokenizer)

if __name__ == "__main__":
    main()


Creating model and tokenizer...

Testing base model (before fine-tuning):

Testing generation...

Prompt: My trip to Kuala Lumpur was so annoying
Generated text: My trip to Kuala Lumpur was so annoying, I had to go to the airport to get my passport. I was told that I had to go to the airport to get my passport. I was told that I had to go to the airport to get my passport. I was told that I had to go to the airport to get my passport. I was told that I had to go to the airport to get my passport. I was told that I had to go to the airport to get my passport. I was told that I had to go to the airport to get my passport. I was told that I had to go to the airport to get my passport. I was told that I had to go to the airport to get my passport. I was told that I had to go to the airport to get my passport. I was told that I had to go to the airport to get my passport. I was told that I had to go to the airport to get my passport. I was told

Loading data for fine-tuning...
      subhead

In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Define the directory where the model and tokenizer were saved
saved_model_dir = "./classicmalaymodel"  # Or wherever you specified when saving

# Load the model and tokenizer
model = GPT2LMHeadModel.from_pretrained(saved_model_dir)
tokenizer = GPT2Tokenizer.from_pretrained(saved_model_dir)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set the model to evaluation mode
model.eval()


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [6]:
def generate_text(model, tokenizer, prompt, max_length=100):
    # Encode the prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    # Generate output
    outputs = model.generate(inputs["input_ids"], max_length=max_length, pad_token_id=tokenizer.pad_token_id)
    
    # Decode the generated tokens into text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Example prompt
prompt = "air laut"
generated_text = generate_text(model, tokenizer, prompt)
print("Generated text:", generated_text)


Generated text: air lautan kembali ke seluruh penduduk perkampungan gapura persekitaran. persekitaran penduduk perkampungan gapura persekitaran tersebut.  sekali-sekala, sekali-sekala, sekali-sekala, sekali-sekala, sekala-sekala, sekala-sekala, se
