In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

# Path to the permissions CSV, relative to the notebook's working directory.
# Forward slashes are used on purpose: they are accepted on Windows as well as
# on Linux/macOS, whereas a backslash-separated path only resolves on Windows.
csv_file_path1 = 'data/permissions.csv'

# Load the CSV file into a DataFrame
df = pd.read_csv(csv_file_path1)

# Initialize the tokenizer and model from the pretrained 'gpt2' checkpoint
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')

# Add padding token (GPT-2 does not have a pad token by default).
# The EOS token is reused, so padding and end-of-text share the same token id.
tokenizer.pad_token = tokenizer.eos_token

# Preprocess the data
def preprocess_function(df):
    """Turn every permission row into a text record and tokenize it for causal-LM training.

    Args:
        df: DataFrame providing the columns 'IdentityReference',
            'AccessControlType', 'FileSystemRights' and 'IsInherited'.

    Returns:
        The tokenizer's batch encoding holding 'input_ids' and 'attention_mask'
        tensors (padded/truncated to 1024 tokens per row) plus a 'labels'
        tensor that mirrors 'input_ids' on real tokens and is -100 on padding.
    """
    # One record per row. The EOS token is appended so that each record ends
    # with a real (attended) end-of-text token the model can learn to emit;
    # the padding that follows is excluded from the loss below.
    texts = [
        f"Identity: {row['IdentityReference']}, Access Control: {row['AccessControlType']}, Rights: {row['FileSystemRights']}, Inherited: {row['IsInherited']}{tokenizer.eos_token}"
        for _, row in df.iterrows()
    ]
    encodings = tokenizer(texts, padding="max_length", truncation=True, return_tensors="pt", max_length=1024)

    # GPT-2 needs labels to be the same as the input_ids for causal language modeling,
    # but the pad token is the EOS token here, so copying input_ids verbatim would
    # make the loss count every padding position as a target. Positions where
    # attention_mask is 0 are set to -100, the index the loss ignores.
    labels = encodings['input_ids'].clone()
    labels[encodings['attention_mask'] == 0] = -100
    encodings['labels'] = labels
    return encodings

# Apply preprocessing to the whole DataFrame in one batch
tokenized_data = preprocess_function(df)

# Convert to Hugging Face Dataset format (tensors are converted to plain lists)
dataset = Dataset.from_dict({
    'input_ids': tokenized_data['input_ids'].tolist(),
    'attention_mask': tokenized_data['attention_mask'].tolist(),
    'labels': tokenized_data['labels'].tolist()  # Include labels for loss computation
})

# Split the dataset into training and validation sets using datasets library.
# A fixed seed keeps the 90/10 partition identical across kernel restarts, so
# evaluation numbers from different runs are comparable.
dataset_dict = dataset.train_test_split(test_size=0.1, seed=42)

# Set up training arguments (uses `eval_strategy`)
training_args = TrainingArguments(
    output_dir="result",
    save_strategy="no",  # No intermediate checkpoints are written during training
    eval_strategy="epoch",  # Evaluate on the held-out split after every epoch
    per_device_train_batch_size=4,  # Adjust batch size as needed
    per_device_eval_batch_size=4,
    num_train_epochs=4,  # Adjust number of epochs as needed
    logging_dir="log",
    logging_steps=10
)

# Initialize the Trainer with the train/validation halves of the split
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['test'],
)

# Train the model
trainer.train()

In [None]:
model.save_pretrained('result\\testone')
tokenizer.save_pretrained('result\\testone')

In [None]:
tokenizer = AutoTokenizer.from_pretrained('result\\testone', local_files_only=True)
model = AutoModelForCausalLM.from_pretrained('result\\testone', local_files_only=True)

In [None]:
# Use the model and tokenizer for inference
input_text = "Example input text for the model"
inputs = tokenizer(input_text, return_tensors="pt")
# The pad token id is passed explicitly (EOS doubles as the pad token in this
# notebook) instead of leaving generate() to pick a fallback on its own.
outputs = model.generate(**inputs, pad_token_id=tokenizer.eos_token_id)
# Decode the first returned sequence, dropping special tokens such as EOS
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Model Output: {decoded_output}")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Restore the fine-tuned tokenizer and weights from the local checkpoint
# directory; local_files_only=True restricts loading to files already on disk.
tokenizer = AutoTokenizer.from_pretrained(
    "result/testone",
    local_files_only=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "result/testone",
    local_files_only=True,
)

# Define a function to generate a response
def query_model(prompt, max_length=2048):
    """Generate a sampled response to `prompt` with the loaded model.

    Args:
        prompt: Text the generation is conditioned on.
        max_length: Upper bound passed to generate() as `max_length`. It is
            capped at the model's context window when the model config
            exposes `max_position_embeddings`.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt")

    # Never request more positions than the model can represent; the default
    # of 2048 would otherwise exceed the context window of a GPT-2 checkpoint.
    context_window = getattr(model.config, "max_position_embeddings", None)
    if context_window is not None:
        max_length = min(max_length, context_window)

    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,  # Honors the caller's limit (capped above)
        num_return_sequences=1,
        do_sample=True,  # Required for temperature/top_k/top_p to take effect
        temperature=0.7,  # Adjust temperature
        top_k=50,  # Top-K sampling
        top_p=0.95,  # Top-P (nucleus) sampling
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.5,  # Adjust penalty value as needed
        eos_token_id=tokenizer.eos_token_id
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Smoke test: send one sample request through query_model, then echo the
# request and the generated text.
prompt = "list all permissions"
response = query_model(prompt=prompt)
print("Prompt:", prompt)
print("Model Output:", response)