### ***Install transformer***

In [None]:
# %pip install transformers
# %pip install torch

In [2]:
import subprocess
import re
import numpy as np
import pandas as pd
from transformers import TFGPT2LMHeadModel, GPT2LMHeadModel, GPT2Tokenizer
import torch
from torch.nn.utils.rnn import pad_sequence


from torch.utils.tensorboard import SummaryWriter

log_dir = "./logs"  # Specify the directory where you want to save the logs
writer = SummaryWriter(log_dir)



### ***file path***

In [3]:
dataset_path = 'dataset/'
yoda_file = 'yoda-corpus.csv'
model_path = 'model/trained_model1'
# path = '/content/drive/My Drive/Colab Notebooks/dataset/'

In [4]:
df = pd.read_csv(dataset_path+yoda_file)
yoda_dialouges=df.loc[df['character']== 'YODA','text']
dialouge_array = np.array(yoda_dialouges)
dialouge_array = dialouge_array[:5]
len(dialouge_array)

5

In [5]:
print(dialouge_array)

['The very Republic is threatened, if involved the Sith are.'
 'Hard to see, the dark side is. Discover who this assassin is, we must.'
 'With this Naboo queen you must stay, Qui-Gon. Protect her.'
 'May the Force be with you.'
 "(Cont'd) Master Qui-Gon more to say have you?"]


In [6]:
def train_model():
        # Load the pre-trained GPT2 model and tokenizer
        # model = GPT2LMHeadModel.from_pretrained("gpt2")
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        model = GPT2LMHeadModel.from_pretrained("gpt2",pad_token_id=tokenizer.eos_token_id)

        # Create a generation configuration based on the model's configuration
        # generation_config = Gpt2GenerationConfig.from_pretrained("gpt2")
        # generation_config.pad_token_id = generation_config.eos_token_id

        # Define your input data
        # input_texts = [
        #     "Hello!",
        #     "How are you?",
        #     "What is your name?",
        #     "Tell me a joke.",
        #     "Goodbye!",
        #     "Roshan is good boy",
        #     "i hope you will get well soon"
        # ]
        
        df = pd.read_csv('dataset/TweetsElonMusk.csv')

        # Filter tweets by language (English)
        df = df[df['language'] == 'en']

        # Remove words starting with "@"
        cleaned_dialogues = []

        for tweet in df['tweet']:
            dialogue = re.sub(r'@\w+\s?', '', tweet)
            cleaned_dialogues.append(dialogue)

        # Tokenize the input texts
        tokenized_texts = [tokenizer.encode(text, add_special_tokens=True) for text in cleaned_dialogues[:100]]
        # tokenized_texts = [tokenizer.encode(text, add_special_tokens=True) for text in dialouge_array ]

        # Set the maximum sequence length
        max_seq_length = 32

        # Truncate or split the input sequences if they exceed the maximum sequence length
        input_ids = []
        attention_masks = []
        for tokens in tokenized_texts:
            if len(tokens) > max_seq_length:
                tokens = tokens[:max_seq_length]
            input_ids.append(tokens)
            attention_masks.append([1] * len(tokens))

        # Pad the sequences within a batch
        input_ids = pad_sequence([torch.tensor(ids) for ids in input_ids], batch_first=True)
        attention_masks = pad_sequence([torch.tensor(ids) for ids in attention_masks], batch_first=True)

        # Define the batch size
        batch_size = 3

        # Create batches of the input sequences
        batches = []
        for i in range(0, len(input_ids), batch_size):
            batch_input_ids = input_ids[i:i+batch_size]
            batch_attention_masks = attention_masks[i:i+batch_size]
            batch = {"input_ids": batch_input_ids, "attention_mask": batch_attention_masks}
            batches.append(batch)

        # Set the model to training mode
        model.train()

        # Define the optimizer and learning rate
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

        # Start TensorBoard using subprocess
        tensorboard_process = subprocess.Popen(["tensorboard", "--logdir", log_dir])

        # Training loop
        epochs = 3
        for epoch in range(epochs):
            # Shuffle the batches
            shuffled_batches = torch.randperm(len(batches))

            # Iterate over the shuffled batches
            for batch_index in shuffled_batches:
                batch = batches[batch_index]

                # Clear gradients
                optimizer.zero_grad()

                # Forward pass
                outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["input_ids"])
                loss = outputs.loss

                # Backward pass and optimization
                loss.backward()
                optimizer.step()

                # Print training progress
                print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")
                # Write loss to TensorBoard
                writer.add_scalar("Training Loss", loss.item(), epoch)
        writer.close()
        print(f"tensorboard --logdir={log_dir}")

        # Save the trained model and tokenizer
        # model.save_pretrained("trained_model")
        # tokenizer.save_pretrained("trained_model")
        # generation_config.save_pretrained("trained_model")
        global modal
        global tokenizar
        modal= model
        tokenizar=tokenizer
        # model.save_pretrained('/content/drive/My Drive/Colab Notebooks/model/trained_model')
        # tokenizer.save_pretrained('/content/drive/My Drive/Colab Notebooks/model/trained_model')
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
        # generation_config.save_pretrained('/content/drive/My Drive/Colab Notebooks/model2/trained_model')


In [7]:
train_model()

Epoch [1/3], Loss: 6.6957
Epoch [1/3], Loss: 6.8936
Epoch [1/3], Loss: 6.4712
Epoch [1/3], Loss: 5.9778
Epoch [1/3], Loss: 5.8733
Epoch [1/3], Loss: 5.3025
Epoch [1/3], Loss: 5.5032
Epoch [1/3], Loss: 5.6692
Epoch [1/3], Loss: 5.2720
Epoch [1/3], Loss: 4.3009
Epoch [1/3], Loss: 4.9333
Epoch [1/3], Loss: 4.4548
Epoch [1/3], Loss: 4.6386
Epoch [1/3], Loss: 3.1584
Epoch [1/3], Loss: 4.8949
Epoch [1/3], Loss: 4.3700
Epoch [1/3], Loss: 4.0593
Epoch [1/3], Loss: 4.0758
Epoch [1/3], Loss: 4.0405
Epoch [1/3], Loss: 3.7532
Epoch [1/3], Loss: 3.6380
Epoch [1/3], Loss: 2.6116
Epoch [1/3], Loss: 2.5109
Epoch [1/3], Loss: 3.0051
Epoch [1/3], Loss: 1.9036
Epoch [1/3], Loss: 4.1905
Epoch [1/3], Loss: 3.3620
Epoch [1/3], Loss: 3.8097
Epoch [1/3], Loss: 3.0238
Epoch [1/3], Loss: 3.3307
Epoch [1/3], Loss: 3.3281
Epoch [1/3], Loss: 3.2004
Epoch [1/3], Loss: 3.5553
Epoch [1/3], Loss: 2.5285
Epoch [2/3], Loss: 2.7659
Epoch [2/3], Loss: 0.9612
Epoch [2/3], Loss: 3.1471
Epoch [2/3], Loss: 2.8837
Epoch [2/3],