In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import torch
import numpy as np
import os

# Path separator for the host OS; later cells reuse it to build checkpoint paths.
separator = '\\' if os.name == 'nt' else '/'

# Load the CSV file
csv_file_path1 = f'data{separator}permissions.csv'
df = pd.read_csv(csv_file_path1)

# Initialize the tokenizer and model from the local checkpoint directory
# (local_files_only=True: nothing is fetched from the Hub).
tokenizer = AutoTokenizer.from_pretrained(f'result{separator}testone', local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(f'result{separator}testone', local_files_only=True)

# The preprocessing step pads every example to a fixed length, which needs a
# pad token. Some causal-LM tokenizers (GPT-2 style) ship without one, so fall
# back to the EOS token. This is a no-op when a pad token is already defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Pick the compute device once and move the model only when a GPU is present.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    print("CUDA!!!")
    model.to(device)
else:
    print("CPU!!")

# Define preprocessing function
def preprocess_function(df, max_length=1024):
    """Turn each permission row into a tokenized causal-LM training example.

    Every row of ``df`` is rendered as one sentence built from its
    ``Username``, ``AccessControlType``, ``FileSystemRights`` and
    ``IsInherited`` columns, then tokenized with the module-level
    ``tokenizer`` (padded/truncated to ``max_length``).

    Args:
        df: DataFrame holding the four columns named above.
        max_length: Fixed sequence length used for padding and truncation.
            Defaults to 1024, the value previously hard-coded.

    Returns:
        The tokenizer output with an extra ``labels`` entry. ``labels`` is a
        copy of ``input_ids`` in which every padded position (attention mask
        equal to 0) is replaced by -100.

    Raises:
        KeyError: If one of the expected columns is missing from ``df``.
    """
    texts = [
        f"Username: {row['Username']}, Access Control: {row['AccessControlType']}, "
        f"Rights: {row['FileSystemRights']}, Inherited: {row['IsInherited']}"
        for _, row in df.iterrows()
    ]
    encodings = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
        max_length=max_length,
    )

    # Labels mirror the inputs, except on padding. -100 is the default
    # ignore_index of PyTorch's cross-entropy loss, so padded positions no
    # longer contribute to the training/eval loss. Masking via the attention
    # mask (rather than by pad token id) stays correct even when the pad token
    # is the same id as EOS.
    labels = encodings['input_ids'].clone()
    labels[encodings['attention_mask'] == 0] = -100
    encodings['labels'] = labels
    return encodings

# Apply preprocessing
tokenized_data = preprocess_function(df)

# Convert to Hugging Face Dataset format. Dataset.from_dict takes plain Python
# lists here, so each tensor column is converted with .tolist().
dataset = Dataset.from_dict({
    column: tokenized_data[column].tolist()
    for column in ('input_ids', 'attention_mask', 'labels')
})

# Split the dataset into training and validation sets. The seed is fixed so
# that re-running the notebook yields the same 90/10 split and eval losses
# stay comparable between runs.
SPLIT_SEED = 42
dataset_dict = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Set up training arguments
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="result",
    save_strategy="no",  # no intermediate checkpoints are written by the Trainer
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,  # Number of epochs per training cycle
    logging_dir="log",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['test'],
)

# Define your performance threshold
def meets_performance_criteria(metrics, threshold=1.0):
    """Report whether an evaluation result is good enough to stop training.

    Args:
        metrics: Mapping of evaluation metrics; must contain ``'eval_loss'``.
        threshold: Exclusive upper bound for the validation loss. Defaults to
            1.0, the value previously hard-coded.

    Returns:
        True when ``metrics['eval_loss']`` is strictly below ``threshold``.
        A NaN loss never satisfies the comparison, so it yields False.

    Raises:
        KeyError: If ``metrics`` has no ``'eval_loss'`` entry.
    """
    return metrics['eval_loss'] < threshold

# Training loop: repeat full training cycles until the validation loss passes
# the performance check or the attempt budget is exhausted.
best_metrics = {'eval_loss': np.inf}
max_attempts = 10
attempt = 0
criteria_met = False

while attempt < max_attempts:
    print(f"Training attempt {attempt + 1}/{max_attempts}")
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()

    # Update best metrics *before* the stop check, so the attempt that meets
    # the criteria is also recorded as the best result.
    if eval_results['eval_loss'] < best_metrics['eval_loss']:
        best_metrics = eval_results
        print(f"New best metrics: {best_metrics}")

    # Check performance
    if meets_performance_criteria(eval_results):
        criteria_met = True
        print("Performance criteria met!")
        break

    attempt += 1

# Make it explicit when the loop ended because the budget ran out.
if not criteria_met:
    print(f"Performance criteria not met after {max_attempts} attempts; "
          f"best eval_loss: {best_metrics['eval_loss']}")

print("Training completed.")

In [None]:
# Write the model first, then the tokenizer, into the shared checkpoint folder.
checkpoint_dir = f'result{separator}testone'
for artifact in (model, tokenizer):
    artifact.save_pretrained(checkpoint_dir)

In [None]:
# Re-read tokenizer and model from the on-disk checkpoint (local files only).
checkpoint_dir = f'result{separator}testone'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir, local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(checkpoint_dir, local_files_only=True)

In [None]:
# Quick inference check: tokenize a sample prompt, generate with the model's
# default generation settings, and decode the first returned sequence.
input_text = "Example input text for the model"
inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(**inputs)

first_sequence = outputs[0]
decoded_output = tokenizer.decode(first_sequence, skip_special_tokens=True)

print(f"Model Output: {decoded_output}")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Restore the trained tokenizer and model from the local checkpoint directory.
trained_dir = f'result{separator}testone'
tokenizer = AutoTokenizer.from_pretrained(trained_dir, local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(trained_dir, local_files_only=True)

# Define a function to generate a response
def query_model(prompt, max_length=2048, do_sample=True):
    """Generate a completion for ``prompt`` with the module-level model.

    Args:
        prompt: Text to continue.
        max_length: Upper bound passed to ``generate`` as ``max_length``.
            NOTE(review): this presumably must not exceed the model's context
            window -- confirm for the loaded checkpoint.
        do_sample: When True (default), sample with temperature 1.2,
            top-k 100 and top-p 0.9. When False, those sampling settings are
            not passed at all. Previously the sampling settings were supplied
            without ``do_sample=True``.

    Returns:
        The first generated sequence decoded to text, special tokens removed.
    """
    # Put spaces around apostrophes and periods before tokenizing.
    prompt = prompt.replace("'", " ' ").replace(".", " . ")
    inputs = tokenizer(prompt, return_tensors="pt")

    generation_kwargs = {
        'attention_mask': inputs['attention_mask'],
        'max_length': max_length,
        'num_return_sequences': 1,
        'do_sample': do_sample,
        # EOS doubles as the padding id during generation.
        'pad_token_id': tokenizer.eos_token_id,
        'repetition_penalty': 1.2,
        'eos_token_id': tokenizer.eos_token_id,
    }
    if do_sample:
        # Sampling knobs are only meaningful (and only passed) when sampling.
        generation_kwargs.update(temperature=1.2, top_k=100, top_p=0.9)

    outputs = model.generate(inputs['input_ids'], **generation_kwargs)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Example usage: run one prompt through query_model and echo both sides.
prompt = "Give me a detailed report of all accessible information for the user with the identifier bhardenmn"
response = query_model(prompt)
for _label, _text in (("Prompt:", prompt), ("Model Output:", response)):
    print(_label, _text)