In [3]:
import os
import json
import pandas as pd


def load_emails(folder, label):
    data = []
    for filename in os.listdir(folder):
        if filename.endswith('.json'):
            with open(os.path.join(folder, filename), 'r') as f:
                email = json.load(f)
                text = email.get('subject', '') + ' ' + email.get('text_body', '')
                data.append({'text': text, 'label': label})
    return data

# Load emails that need a reply
replied_emails = load_emails('data_mail1/replied/', 1)
replied_emails

# Load emails that don't need a reply
unreplied_emails = load_emails('data_mail1/unreplied/', 0)
unreplied_emails

# Load emails that need a reply
replied_emails_2 = load_emails('data_mail2/replied/', 1)
replied_emails

# Load emails that don't need a reply
unreplied_emails_2 = load_emails('data_mail2/unreplied/', 0)
unreplied_emails

# Combine and create a DataFrame
all_emails = replied_emails + unreplied_emails + replied_emails_2 + unreplied_emails_2
df = pd.DataFrame(all_emails)
df.head()


Unnamed: 0,text,label
0,"Update Dear Dias,\r\n\r\nHow are you? Are you ...",1
1,\r\n,1
2,Re: Enquiry #959 - Dias Irishev - 12/02/2011 1...,1
3,"Shipping of glasses Dear Dias,\r\n\r\n \r\n\r\...",1
4,\r\n,1


In [5]:

import re
from bs4 import BeautifulSoup
import unicodedata


def clean_email(text):
    # Parse and remove HTML tags using BeautifulSoup
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()

    # Normalize Unicode characters to remove unwanted symbols
    text = unicodedata.normalize('NFKD', text)

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove special characters, including emojis, and HTML artifacts
    text = re.sub(r'[🔥*]', '', text)  # Remove specific special characters
    text = re.sub(r'\[del:.*?:del\]', '', text)  # Remove text between [DEL: ... :DEL]
    text = re.sub(r'\[.*?\]', '', text)  # Remove other square-bracketed items (e.g., [1], [2])

    # Remove escape characters and excessive whitespace
    text = re.sub(r'\r\n|\n|\t', ' ', text)  # Replace escape characters with space

    # Remove mentions of 'image' or placeholders
    text = re.sub(r'\bimage\b', '', text)

    # Remove non-Cyrillic, non-alphanumeric characters except basic punctuation
    text = re.sub(r'[^а-яА-Яa-zA-Z0-9,.!?@+ ]+', '', text)

    # Remove redundant spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text


# Example usage
df['text'] = df['text'].apply(clean_email)
df.head()

  soup = BeautifulSoup(text, "html.parser")


Unnamed: 0,text,label
0,"update dear dias, how are you? are you ready f...",1
1,,1
2,re enquiry 959 dias irishev 12022011 1720 hi d...,1
3,"shipping of glasses dear dias, i hope you are ...",1
4,,1


In [44]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

X_train.shape, y_train.shape

((18612,), (18612,))

In [45]:
X_train.iloc[1], y_train.iloc[1]

('michael webbs secrets of blissful relationships no! x o x o x o x o x o x o secrets of blissful relationships from michael webb, the webs 1 relationship author and expert x o x o x o x o x o x o no! by michael webb no was once a very popular word in our house when we had toddlers. we had little problem telling our son and daugher no to things that are potentially harmful. thats what loving parents do. however, many of us are unable to utter that twoletter word to our spouse. we dont want to upset the apple cart. we dont want to go to that concert tuesday night but if we say no, that will hurt s feelings. we rationalize that if we are always saying yes to our spouses ideas and desires, then our relationship will be peaceful. in loving relationships there should be plenty of nos. being able to say no means you have the courage in what you believe and you have the faith that your honest communication will make your relationship stronger in the long run. if you dont say no when you reall

# DATA TRAIN

In [46]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    recall_score,
    precision_score,
    f1_score,
    roc_auc_score,
    average_precision_score
)
import matplotlib.pyplot as plt
import seaborn as sns

In [47]:
# Example: Ensure X_train and X_test are lists of strings
X_train = X_train.tolist() if isinstance(X_train, pd.Series) else X_train
X_test = X_test.tolist() if isinstance(X_test, pd.Series) else X_test

In [72]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels.iloc[idx])
        # Rest of your code remains the same

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),  # Shape: (max_length,)
            'attention_mask': encoding['attention_mask'].flatten(),  # Shape: (max_length,)
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [73]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [74]:
BATCH_SIZE = 8  # Adjust based on your hardware capabilities

train_dataset = TextDataset(X_train, y_train, tokenizer)
test_dataset = TextDataset(X_test, y_test, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [75]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weights = torch.tensor(class_weights, dtype=torch.float)

In [76]:
device = torch.device('cuda' if torch.cuda.is_available() else 'mps')
model.to(device)
class_weights = class_weights.to(device)

In [77]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

# Define loss function with class weights
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)



In [78]:
def train_epoch(model, data_loader, loss_fn, optimizer, device):
    model = model.train()
    total_loss = 0

    for batch in tqdm(data_loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = loss_fn(outputs.logits, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    return total_loss / len(data_loader)

In [79]:
def eval_model(model, data_loader, device):
    model = model.eval()
    y_preds = []
    y_true = []
    y_scores = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=1)
            _, preds = torch.max(logits, dim=1)

            y_preds.extend(preds.cpu().numpy())
            y_true.extend(labels.cpu().numpy())
            y_scores.extend(probabilities[:, 1].cpu().numpy())

    return y_true, y_preds, y_scores

In [80]:
from tqdm import tqdm

EPOCHS = 1  # Adjust based on performance

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    train_loss = train_epoch(model, train_loader, loss_fn, optimizer, device)
    print(f'Train loss: {train_loss:.4f}')

    y_true, y_preds, y_scores = eval_model(model, test_loader, device)
    recall = recall_score(y_true, y_preds)
    precision = precision_score(y_true, y_preds, zero_division=0)
    f1 = f1_score(y_true, y_preds, zero_division=0)
    accuracy = (np.array(y_preds) == np.array(y_true)).mean()
    roc_auc = roc_auc_score(y_true, y_scores)
    pr_auc = average_precision_score(y_true, y_scores)

    print(f'Validation Recall: {recall:.4f}, Precision: {precision:.4f}, F1-Score: {f1:.4f}, Accuracy: {accuracy:.4f}')
    print(f'Validation ROC AUC: {roc_auc:.4f}, PR AUC: {pr_auc:.4f}')

Epoch 1/1


100%|██████████| 2327/2327 [21:29<00:00,  1.80it/s]


Train loss: 0.2981
Validation Recall: 0.5600, Precision: 0.3011, F1-Score: 0.3916, Accuracy: 0.9626
Validation ROC AUC: 0.9036, PR AUC: 0.3080


In [71]:
len(X_train)/8

2326.5