In [53]:
import pandas as pd
from datasets import Dataset

# Read the cleaned sentiment CSV into a pandas DataFrame
csv_path = "sentiment_dataset_cleaned.csv"
df = pd.read_csv(csv_path)

# Wrap the DataFrame as a datasets.Dataset for the steps that follow
dataset = Dataset.from_pandas(df)


In [54]:
from datasets import DatasetDict

# Splitting into train (80%) and test (20%) subsets.
# A fixed seed pins the shuffle so the same rows land in each split on every
# run; without it the split (and every metric reported later) changes each
# time the kernel is restarted.
dataset = dataset.train_test_split(test_size=0.2, seed=42)


In [55]:
from transformers import DistilBertTokenizerFast

# Fast DistilBERT tokenizer for the SST-2 fine-tuned checkpoint
checkpoint_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint_name)

def tokenize_function(example):
    """Tokenize the 'premise' and 'hypothesis' fields together as a text pair.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    """
    encode_options = {'truncation': True, 'padding': 'max_length', 'max_length': 128}
    return tokenizer(example['premise'], example['hypothesis'], **encode_options)

# batched=True passes a batch of rows to tokenize_function on each call
tokenized_datasets = dataset.map(tokenize_function, batched=True)



Map: 100%|██████████| 2400/2400 [00:00<00:00, 10246.90 examples/s]
Map: 100%|██████████| 600/600 [00:00<00:00, 10535.89 examples/s]


In [56]:
# Drop the raw text and index columns, and rename the target column from
# "label" to "labels".
unused_columns = ["premise", "hypothesis", "idx"]
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(unused_columns)
    .rename_column("label", "labels")
)

# Switch the output format of the splits to torch
tokenized_datasets.set_format("torch")


In [60]:


from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
from datasets import load_metric


# Sequence-classification model loaded from the SST-2 fine-tuned DistilBERT checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased-finetuned-sst-2-english'
)

# Training arguments, collected in one mapping so each knob is easy to spot
training_options = {
    'output_dir': './results',
    'evaluation_strategy': 'epoch',
    'learning_rate': 2e-5,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 16,
    'num_train_epochs': 3,
    'weight_decay': 0.01,
}
training_args = TrainingArguments(**training_options)

# Accuracy is computed locally rather than through `datasets.load_metric`:
# that helper is deprecated (and removed in datasets 3.0) and required
# `trust_remote_code=True` to download and execute a metric script.
# NOTE(review): the `from datasets import load_metric` import at the top of
# this cell is no longer needed and should be dropped, since it fails on
# datasets >= 3.0.

# Defining compute metrics function
def compute_metrics(eval_pred):
    """Return the accuracy of the predicted classes against the labels.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``. The predicted
            class is the argmax of ``logits`` over its last axis. If ``logits``
            is a tuple, its first element is used.
            (assumes logits is (n_examples, n_classes) and labels is
            (n_examples,) — TODO confirm against the Trainer in use)

    Returns:
        dict: ``{"accuracy": float}``, the fraction of predictions that equal
        their label.
    """
    import numpy as np  # local import: this notebook has no top-level numpy import

    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits; keep the logits only.
        logits = logits[0]
    predictions = np.asarray(logits).argmax(axis=-1)
    references = np.asarray(labels)
    return {"accuracy": float((predictions == references).mean())}

# Initializing Trainer: the 'train' split is used for fitting and the 'test'
# split for evaluation, scored with compute_metrics
train_split = tokenized_datasets['train']
eval_split = tokenized_datasets['test']
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
)

# Fine-tune the model, then evaluate it on the evaluation split
trainer.train()
eval_results = trainer.evaluate()

# Report accuracy as a percentage with two decimals
print(f"Model accuracy: {eval_results['eval_accuracy'] * 100:.2f}%")




 42%|████▏     | 188/450 [22:54<31:55,  7.31s/it]
 33%|███▎      | 150/450 [18:04<39:52,  7.98s/it]  
 33%|███▎      | 150/450 [19:12<39:52,  7.98s/it]

{'eval_loss': 0.00020654307445511222, 'eval_accuracy': 1.0, 'eval_runtime': 67.1755, 'eval_samples_per_second': 8.932, 'eval_steps_per_second': 0.566, 'epoch': 1.0}


 67%|██████▋   | 300/450 [36:22<16:51,  6.74s/it]  
 67%|██████▋   | 300/450 [37:30<16:51,  6.74s/it]

{'eval_loss': 9.162293281406164e-05, 'eval_accuracy': 1.0, 'eval_runtime': 68.7572, 'eval_samples_per_second': 8.726, 'eval_steps_per_second': 0.553, 'epoch': 2.0}


100%|██████████| 450/450 [55:02<00:00,  6.26s/it]  
100%|██████████| 450/450 [56:23<00:00,  7.52s/it]


{'eval_loss': 7.449886470567435e-05, 'eval_accuracy': 1.0, 'eval_runtime': 79.7907, 'eval_samples_per_second': 7.52, 'eval_steps_per_second': 0.476, 'epoch': 3.0}
{'train_runtime': 3383.4672, 'train_samples_per_second': 2.128, 'train_steps_per_second': 0.133, 'train_loss': 0.09914048936631945, 'epoch': 3.0}


100%|██████████| 38/38 [01:17<00:00,  2.04s/it]

Model accuracy: 100.00%





In [None]:
# Run evaluation again and keep the result of trainer.evaluate().
# Despite its name, `accuracy` holds whatever evaluate() returns (the full
# metrics result), not a single accuracy number.
# NOTE(review): the recorded output of the next cell has no 'eval_accuracy'
# key, which suggests this ran against a Trainer built without
# compute_metrics — confirm with a fresh Restart & Run All.
accuracy=trainer.evaluate()

100%|██████████| 38/38 [00:57<00:00,  1.51s/it]


In [None]:
# Display the evaluation result stored in `accuracy`
print(accuracy)

{'eval_loss': 9.564343054080382e-05, 'eval_runtime': 58.9864, 'eval_samples_per_second': 10.172, 'eval_steps_per_second': 0.644, 'epoch': 3.0}


In [61]:

# Saving the model and its tokenizer to the same directory on the local machine
save_dir = "./fine-tuned-model1"
model.save_pretrained(save_dir)
# Left as the cell's last expression so the list of written tokenizer files is displayed
tokenizer.save_pretrained(save_dir)


('./fine-tuned-model1\\tokenizer_config.json',
 './fine-tuned-model1\\special_tokens_map.json',
 './fine-tuned-model1\\vocab.txt',
 './fine-tuned-model1\\added_tokens.json',
 './fine-tuned-model1\\tokenizer.json')

In [3]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Reload the fine-tuned model and its tokenizer from the saved directory
saved_dir = "./fine-tuned-model1"
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)

# Build a "sentiment-analysis" pipeline around the reloaded model and tokenizer
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Classify a single sample sentence and show the pipeline's result
sample_text = "hack into my manager's pc"
result = nlp(sample_text)
print(result)


[{'label': 'NEGATIVE', 'score': 0.996605396270752}]
