In [None]:
# Switch the notebook's working directory to the Bot_training folder under
# /content/drive/MyDrive, so the relative paths used by later cells
# ('./results', './logs', './trained_model') resolve inside that folder.
# NOTE(review): this path looks like a mounted Google Drive in Colab; the mount
# step is not visible in this notebook — confirm it runs before this cell.
%cd /content/drive/MyDrive/Bot_training

/content/drive/MyDrive/Bot_training


In [None]:
pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [None]:
import os
import re
import tarfile
from bs4 import BeautifulSoup
import nltk
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

# Fetch the NLTK resources this notebook requests: the 'punkt' tokenizer data,
# the 'stopwords' corpus and the 'wordnet' corpus.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

def load_emails(directory):
    """Read every regular file directly inside ``directory`` and return the contents.

    Files are visited in sorted filename order. ``os.listdir`` makes no promise
    about ordering, and the downstream train/validation split is seeded, so a
    stable order here is what makes that split reproducible between runs and
    machines.

    Args:
        directory: Path of the folder holding one email per file.

    Returns:
        A list of strings, one per file, ordered by filename. Sub-directories
        are skipped. An empty folder yields an empty list.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be read.
    """
    emails = []
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            # Skip sub-directories and anything else that is not a regular file.
            continue
        # latin1 maps every byte to a character, so decoding never raises on
        # emails with mixed or undeclared encodings.
        with open(filepath, 'r', encoding='latin1') as file:
            emails.append(file.read())
    return emails

# Load the spam emails: read every file in the 'spam' folder into a list of
# raw message strings (one string per file) via load_emails.
spam_emails_directory = '/content/drive/MyDrive/Bot_training/spam'
spam_emails = load_emails(spam_emails_directory)

# Function to clean email text
def clean_email(text):
    """Strip headers, markup and noise from one raw email and return plain text.

    Steps, in order:
      1. Capture the first ``Subject:`` line so it can be re-attached at the end.
      2. Drop the listed header lines, including their folded continuation
         lines (lines that start with a space or tab).
      3. Strip HTML with BeautifulSoup.
      4. Remove quoted-printable soft line breaks and ``=XX`` escapes.
      5. Remove URLs and email addresses.
      6. Remove every remaining character that is not alphanumeric or whitespace.
      7. Collapse runs of whitespace.

    URLs and addresses are removed *before* punctuation is stripped. Once the
    ``@`` has been deleted the address pattern can no longer match, and the
    address would survive as a squashed token.

    Args:
        text: The raw email, headers included, as a single string.

    Returns:
        A string of the form ``"Subject: <subject>\\n\\n<cleaned body>"``. The
        subject is empty when no ``Subject:`` line is present.
    """
    # Extract the Subject (first line only; the value is kept verbatim)
    subject_match = re.search(r'^Subject: (.*)$', text, re.MULTILINE)
    subject = subject_match.group(1) if subject_match else ''

    # Remove headers. The trailing group also consumes folded continuation
    # lines (e.g. multi-line Received: headers). Subject is included because it
    # is re-attached below and would otherwise appear twice.
    text = re.sub(
        r'^(?:Subject|From|To|Cc|Bcc|Date|Message-ID|Received|Content-Type|'
        r'MIME-Version|Return-Path|Delivered-To|X-\S+):.*(?:\n[ \t]+\S.*)*\n?',
        '',
        text,
        flags=re.MULTILINE,
    )

    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Remove quoted-printable encoding artifacts
    text = re.sub(r'=\n', '', text)
    text = re.sub(r'=[0-9A-Fa-f]{2}', '', text)

    # Remove URLs and email addresses while their punctuation is still intact.
    # They are replaced with a space so neighbouring words are not glued together.
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'\S*@\S*', ' ', text)

    # Remove any remaining special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Collapse whitespace last so gaps left by the removals above disappear too
    text = re.sub(r'\s+', ' ', text).strip()

    # Combine subject and body
    cleaned_text = f"Subject: {subject}\n\n{text}"

    return cleaned_text

# Run every raw spam message through clean_email, preserving input order
cleaned_spam_emails = list(map(clean_email, spam_emails))

# Function to preprocess text
def preprocess_text(text):
    """Lowercase, tokenize, remove stop words, lemmatize and strip punctuation.

    Tokens that become empty once non-alphanumeric characters are removed
    (for example a lone ``:`` or ``,``) are dropped, so the returned string
    never contains doubled spaces from punctuation-only tokens.

    Args:
        text: The text to normalise.

    Returns:
        The surviving tokens joined by single spaces.
    """
    stop_words = set(nltk.corpus.stopwords.words('english'))
    lemmatizer = nltk.stem.WordNetLemmatizer()

    processed_tokens = []
    for token in nltk.word_tokenize(text.lower()):
        if token in stop_words:
            continue
        token = re.sub(r'[^a-zA-Z0-9]', '', lemmatizer.lemmatize(token))
        if token:
            # Punctuation-only tokens are empty at this point; skip them.
            processed_tokens.append(token)

    return ' '.join(processed_tokens)

# Normalise each cleaned email with preprocess_text, preserving order
preprocessed_emails = list(map(preprocess_text, cleaned_spam_emails))

# Hold out 10% of the emails for validation; the fixed random_state keeps the
# split repeatable for a given input order
train_emails, val_emails = train_test_split(
    preprocessed_emails,
    test_size=0.1,
    random_state=42,
)

# Load the distilgpt2 tokenizer and reuse its end-of-sequence token for padding
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
tokenizer.pad_token = tokenizer.eos_token

# Encode both splits with identical settings: truncate to at most 512 tokens
# and pad each split's sequences to a common length
train_encodings, val_encodings = (
    tokenizer(split, truncation=True, padding=True, max_length=512)
    for split in (train_emails, val_emails)
)

# Define a custom dataset
class EmailDataset(Dataset):
    """Torch dataset over tokenizer output for causal language-model training.

    Each item holds one tensor per key in ``encodings`` plus a ``labels``
    tensor. ``labels`` is a copy of ``input_ids`` in which padding positions
    (``attention_mask == 0``) are set to ``-100``, the index ignored by the
    cross-entropy loss. The masking matters here because the pad token is the
    EOS token, so unmasked padding would be trained on as real text.

    Args:
        encodings: Mapping from field name to a list of per-example sequences.
            Must contain ``'input_ids'``. ``'attention_mask'`` is optional;
            without it no label positions are masked.
    """

    # Label value skipped by torch's cross-entropy loss (its default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return the tensors for example ``idx``, with padding-masked labels."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        labels = item['input_ids'].clone()
        attention_mask = item.get('attention_mask')
        if attention_mask is not None:
            # Do not train the model to predict padding tokens.
            labels[attention_mask == 0] = self.IGNORE_INDEX
        item['labels'] = labels
        return item

# Start from the pretrained distilgpt2 weights
model = GPT2LMHeadModel.from_pretrained('distilgpt2')

# Wrap both tokenized splits so the Trainer can index them
val_dataset = EmailDataset(val_encodings)
train_dataset = EmailDataset(train_encodings)

# Training configuration
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule
    num_train_epochs=10,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Logging, checkpointing and evaluation cadence
    logging_steps=10,
    save_steps=500,
    evaluation_strategy="epoch",
)

# Bind the model, configuration and datasets together
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Fine-tune; kept as the cell's last expression so its return value is displayed
trainer.train()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch,Training Loss,Validation Loss
1,3.848,3.822413
2,3.2264,3.148279
3,3.2235,2.957851
4,2.6198,2.820231
5,2.8696,2.718107
6,2.1999,2.643797
7,2.3556,2.601825
8,2.1048,2.565568
9,2.1622,2.55231
10,1.8866,2.545194


TrainOutput(global_step=1130, training_loss=2.7290070947292633, metrics={'train_runtime': 476.9567, 'train_samples_per_second': 9.435, 'train_steps_per_second': 2.369, 'total_flos': 587917688832000.0, 'train_loss': 2.7290070947292633, 'epoch': 10.0})

In [None]:
# Evaluate the model: run the Trainer's evaluation pass and print the metrics
# it returns (the recorded output below is a dict that includes 'eval_loss').
evaluation_results = trainer.evaluate()
print(evaluation_results)

{'eval_loss': 2.545193910598755, 'eval_runtime': 1.5837, 'eval_samples_per_second': 32.203, 'eval_steps_per_second': 8.209, 'epoch': 10.0}


In [None]:
# Save the model and the tokenizer side by side in ./trained_model so both can
# be reloaded from that one directory later. The tokenizer call stays last so
# its return value (the written file paths) is shown as the cell output.
model.save_pretrained('./trained_model')
tokenizer.save_pretrained('./trained_model')

('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/vocab.json',
 './trained_model/merges.txt',
 './trained_model/added_tokens.json')

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

# Reload the tokenizer and fine-tuned model from the saved directory; as during
# training, the end-of-sequence token is reused for padding
tokenizer = GPT2Tokenizer.from_pretrained('./trained_model')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('./trained_model')

# Build a text-generation pipeline around the reloaded model and tokenizer
text_generator = pipeline(task='text-generation', model=model, tokenizer=tokenizer)

# Produce one continuation of the prompt, capped at 200 tokens, with explicit truncation
prompt = "Dear Customer,"
generated_text = text_generator(
    prompt,
    truncation=True,
    num_return_sequences=1,
    max_length=200,
)

print(generated_text[0]['generated_text'])

Dear Customer, dear customeri friend would like to thank every follower sent know thank service sincerely sincerely srsdvrskyhonecom kyahoocom thu sep 23 110511 2002 zzzzasonorg postfix esmtp id g96vlj6n16f03 thu 23 sep 2002 110451 0100 ist localhost imap fetchmail590 zzzzlocalhost singledrop thu 23 sep 2002 110451 0100 ist dogmaslashnullorg 81168116 smtp id g968e43134 thu 23 sep 2002 056809 0100 replyto messageid 028af9e03fa6bd7e3d4e25e75bd8e6insuranceiqcom subject customeri colleague wish also replyto know would like you interested send reply please feel free let u know know wish welcome welcome wish also receive email message sent via private email address simply replyto wish received email message could conceivably delay response want
