In [5]:
import os
import json
import pandas as pd


def load_emails(folder, label):
    data = []
    for filename in os.listdir(folder):
        if filename.endswith('.json'):
            with open(os.path.join(folder, filename), 'r') as f:
                email = json.load(f)
                text = email.get('subject', '') + ' ' + email.get('text_body', '')
                data.append({'text': text, 'label': label})
    return data

In [6]:
# Load emails that need a reply
replied_emails = load_emails('data/data_mail1/replied/', 1)
replied_emails

[{'text': 'Update Dear Dias,\r\n\r\nHow are you? Are you ready for departure yet? When do you plan to arrive in the Netherlands? Did you manage to get a room? And how about the minor, is everything arranged?\r\n\r\nI would like to receive an update from you. In case you have any questions, or if anything is unclear to you, please let me know.\r\n\r\nMet vriendelijke groet / Best regards,\r\n\r\nLeonie van Winden MSc\r\nStudieadviseur / Study adviser ESE\r\n\r\nErasmus School of Economics\r\n\r\nT    +31 10 408 1380\r\nE     studyadviser@ese.eur.nl<mailto:studyadviser@ese.eur.nl>\r\nW   www.eur.nl/ese/studieadviseur<http://www.eur.nl/ese/studieadviseur>\r\nW   www.eur.nl/ese/study_adviser<http://www.eur.nl/ese/study_adviser/>\r\n\r\nVisiting address\r\nEducation Service Centre\r\nTinbergen Building, office H6-02\r\nBurgemeester Oudlaan 50\r\n3062 PA  Rotterdam\r\n\r\nPostal address\r\nP.O. Box 1738\r\n3000 DR  Rotterdam\r\nThe Netherlands\r\n\r\n[facebook-48]<http://www.facebook.com/era

In [8]:
# Load emails that don't need a reply
unreplied_emails = load_emails('data/data_mail1/unreplied/', 0)
unreplied_emails

[{'text': '[reddit] verify your email address \r\n\r\n\r\nyour username is:\r\n\r\n    Irishsheff\r\n\r\nvisit this link to verify your email address:\r\n\r\n    https://www.reddit.com/verification/Y4ciyLodoQLYM9x6NrjmirEvK_A?ref_campaign=verify_email&ref_source=email&ref=verify_email\r\n\r\nthanks for using the site!\r\n',
  'label': 0},
  'label': 0},
 {'text': 'Stukken van pure kasjmier om van te houden ', 'label': 0},
 {'text': 'Booking Availability Request (1356) ', 'label': 0},
 {'text': '👏 Скидка до 60% на курсы Skillbox ', 'label': 0},
 {'text': 'Message From Almatytelecom "Уважаемый (ая) ИВАНОВА ИРИНА МИХАЙЛОВНА, Вам выставлен счет № 394194491 от 30.06.2024.\r\nОплату можно произвести любым удобным для вас способом: \r\n- онлайн на www.telecom.kz (без комиссии и регистрации).\r\n- посредством инфокиосков и банкоматов\r\n- а также через кассы банков и Казпочты.\r\n  С уважением, Ваш Казахтелеком!"\r\n',
  'label': 0},
 {'text': "Confirm your email address \r\n\r\nKeep your acco

In [9]:
# Combine and create a DataFrame
all_emails = replied_emails + unreplied_emails
df = pd.DataFrame(all_emails)
df.head()

Unnamed: 0,text,label
0,"Update Dear Dias,\r\n\r\nHow are you? Are you ...",1
1,\r\n,1
2,Re: Enquiry #959 - Dias Irishev - 12/02/2011 1...,1
3,"Shipping of glasses Dear Dias,\r\n\r\n \r\n\r\...",1
4,\r\n,1


In [10]:
import re
from bs4 import BeautifulSoup
import unicodedata


def clean_email(text):
    # Parse and remove HTML tags using BeautifulSoup
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()

    # Normalize Unicode characters to remove unwanted symbols
    text = unicodedata.normalize('NFKD', text)

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove special characters, including emojis, and HTML artifacts
    text = re.sub(r'[🔥*]', '', text)  # Remove specific special characters
    text = re.sub(r'\[del:.*?:del\]', '', text)  # Remove text between [DEL: ... :DEL]
    text = re.sub(r'\[.*?\]', '', text)  # Remove other square-bracketed items (e.g., [1], [2])

    # Remove escape characters and excessive whitespace
    text = re.sub(r'\r\n|\n|\t', ' ', text)  # Replace escape characters with space

    # Remove mentions of 'image' or placeholders
    text = re.sub(r'\bimage\b', '', text)

    # Remove non-Cyrillic, non-alphanumeric characters except basic punctuation
    text = re.sub(r'[^а-яА-Яa-zA-Z0-9,.!?@+ ]+', '', text)

    # Remove redundant spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text


# Example usage
df['text'] = df['text'].apply(clean_email)
df.head()

  soup = BeautifulSoup(text, "html.parser")


Unnamed: 0,text,label
0,"update dear dias, how are you? are you ready f...",1
1,,1
2,re enquiry 959 dias irishev 12022011 1720 hi d...,1
3,"shipping of glasses dear dias, i hope you are ...",1
4,,1


In [11]:
from sentence_transformers import SentenceTransformer

# Generate embeddings
# Load the SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings
embeddings = model.encode(
    df['text'].tolist(),
    convert_to_numpy=True
)

  from tqdm.autonotebook import tqdm, trange


In [12]:
print(embeddings.shape)  # should be (number of texts, embedding dimension)

(21861, 384)


In [13]:
import numpy as np
np.save('data/email_embedding.npy', embeddings)

# Traning the classifier

In [13]:
# import numpy as np

# # Load the saved array
# embeddings = np.load('data_mail1/email_embedding.npy')
# print(embeddings)

In [14]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
    recall_score,
    precision_score,
    f1_score,
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import joblib  # For saving models

In [15]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, df['label'], test_size=0.2, random_state=42
)

X_train.shape, y_train.shape

((17488, 384), (17488,))

In [16]:
def evaluate_model(model_name, y_test, y_pred, y_proba, results_dict):
    # Calculate metrics
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    accuracy = (y_pred == y_test).mean()
    roc_auc = roc_auc_score(y_test, y_proba) if y_proba is not None else None
    pr_auc = average_precision_score(y_test, y_proba) if y_proba is not None else None

    # Save metrics to results_dict
    results_dict[model_name] = {
        'recall': recall,
        'precision': precision,
        'f1_score': f1,
        'accuracy': accuracy,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc
    }

    # Print classification report
    print(f"\n--- {model_name} ---")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))
    print("Confusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    # Plot and save confusion matrix
    plt.figure(figsize=(6,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix for {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.savefig(f"{model_name}_confusion_matrix.png")
    plt.close()

    # Plot and save Precision-Recall curve            # UNVOMMENT THIS LATER
    if y_proba is not None:
        from sklearn.metrics import precision_recall_curve
        precision_vals, recall_vals, thresholds = precision_recall_curve(y_test, y_proba)
        plt.figure()
        plt.plot(recall_vals, precision_vals, marker='.', label=model_name)
        plt.title(f'Precision-Recall Curve for {model_name}')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.legend()
        plt.tight_layout()
        plt.savefig(f"{model_name}_pr_curve.png")
        plt.close()

In [17]:
# Logistic Regression
lr_params = {
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__class_weight': [None, 'balanced'],
    'classifier__penalty': ['l2'],
    'classifier__solver': ['lbfgs']
}

# Support Vector Machine
svm_params = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__class_weight': [None, 'balanced']
}

# Random Forest
rf_params = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__class_weight': [None, 'balanced']
}

# XGBoost
scale_pos_weight = ((len(y_train) - sum(y_train)) / sum(y_train)) if sum(y_train) > 0 else 1
xgb_params = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [3, 6],
    'classifier__scale_pos_weight': [scale_pos_weight],
    'classifier__learning_rate': [0.01, 0.1]
}

In [18]:
def train_and_evaluate(model, params, model_name, results_dict):
    print(f"\n--- Training {model_name} ---")

    # Create pipeline
    pipeline = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('classifier', model)
    ])

    # Cross-validation strategy
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    # Set up GridSearchCV
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=params,
        scoring='f1',  # Focus on f1
        cv=cv,
        n_jobs=-1
    )

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Best estimator
    best_pipeline = grid_search.best_estimator_
    best_model = best_pipeline.named_steps['classifier']

    # Predict on test set
    y_pred = best_pipeline.predict(X_test)
    if hasattr(best_pipeline.named_steps['classifier'], "predict_proba"):
        y_proba = best_pipeline.predict_proba(X_test)[:, 1]
    else:
        decision_scores = best_pipeline.decision_function(X_test)
        # Normalize decision scores to [0,1] range
        y_proba = (decision_scores - decision_scores.min()) / (decision_scores.max() - decision_scores.min())

    # Evaluate the model
    evaluate_model(model_name, y_test, y_pred, y_proba, results_dict)

    # Save the best model
    joblib.dump(best_pipeline, f"{model_name}_best_model.joblib")

    # Save best parameters
    with open(f"{model_name}_best_params.txt", 'w') as f:
        f.write(str(grid_search.best_params_))

    print("Best Parameters:", grid_search.best_params_)

In [19]:
results = {}

In [20]:
# Logistic Regression
lr_model = LogisticRegression(max_iter=1000, random_state=42)
train_and_evaluate(lr_model, lr_params, "Logistic Regression", results)

# Support Vector Machine
svm_model = SVC(probability=True, random_state=42)
train_and_evaluate(svm_model, svm_params, "Support Vector Machine", results)

# Random Forest
rf_model = RandomForestClassifier(random_state=42)
train_and_evaluate(rf_model, rf_params, "Random Forest", results)

# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
train_and_evaluate(xgb_model, xgb_params, "XGBoost", results)


--- Training Logistic Regression ---


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


--- Logistic Regression ---
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.91      0.95      4296
           1       0.11      0.62      0.19        77

    accuracy                           0.91      4373
   macro avg       0.55      0.77      0.57      4373
weighted avg       0.98      0.91      0.94      4373

Confusion Matrix:
[[3917  379]
 [  29   48]]
Best Parameters: {'classifier__C': 10, 'classifier__class_weight': None, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}

--- Training Support Vector Machine ---

--- Support Vector Machine ---
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      4296
           1       0.24      0.48      0.32        77

    accuracy                           0.96      4373
   macro avg       0.61      0.73      0.65      4373
weighted avg       0.98      0.96      0.97      4373

Confusion Matrix:
[[

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


--- Random Forest ---
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      4296
           1       0.23      0.47      0.31        77

    accuracy                           0.96      4373
   macro avg       0.61      0.72      0.65      4373
weighted avg       0.98      0.96      0.97      4373

Confusion Matrix:
[[4178  118]
 [  41   36]]
Best Parameters: {'classifier__class_weight': None, 'classifier__max_depth': 10, 'classifier__n_estimators': 100}

--- Training XGBoost ---


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode


--- XGBoost ---
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.96      0.98      4296
           1       0.21      0.55      0.30        77

    accuracy                           0.96      4373
   macro avg       0.60      0.75      0.64      4373
weighted avg       0.98      0.96      0.97      4373

Confusion Matrix:
[[4138  158]
 [  35   42]]
Best Parameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 6, 'classifier__n_estimators': 200, 'classifier__scale_pos_weight': 49.83720930232558}


In [21]:
# Convert results dictionary to DataFrame
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df.to_csv('model_results.csv')

print("\n--- Summary of Results ---")
print(results_df)


--- Summary of Results ---
                          recall  precision  f1_score  accuracy   roc_auc  \
Logistic Regression     0.623377   0.112412  0.190476  0.906700  0.865205   
Support Vector Machine  0.480519   0.238710  0.318966  0.963869  0.894115   
Random Forest           0.467532   0.233766  0.311688  0.963641  0.908415   
XGBoost                 0.545455   0.210000  0.303249  0.955866  0.882331   

                          pr_auc  
Logistic Regression     0.244240  
Support Vector Machine  0.235190  
Random Forest           0.203868  
XGBoost                 0.259782  


In [22]:
# Plot Recall Scores
plt.figure()
results_df['recall'].plot(kind='bar')
plt.title('Recall Scores of Models')
plt.ylabel('Recall')
plt.tight_layout()
plt.savefig('model_recall_scores.png')
plt.close()

## SAME BUT WITH ML FLOW

In [4]:
# import mlflow
# import mlflow.sklearn
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import (
#     classification_report,
#     confusion_matrix,
#     roc_auc_score,
#     average_precision_score,
#     recall_score,
#     precision_score,
#     f1_score
# )
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
# from imblearn.over_sampling import SMOTE

# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score, classification_report

In [3]:
# def train_and_evaluate(model, params, model_name):
#     with mlflow.start_run(run_name=model_name):
#         print(f"\n--- Training {model_name} ---")
        
#         # # Apply SMOTE to training data                # UNCOMMENT THIS SECTION
#         # smote = SMOTE(random_state=42)
#         # X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
        
#         # Set up GridSearchCV
#         grid_search = GridSearchCV(
#             estimator=model,
#             param_grid=params,
#             scoring='recall',  # Focus on recall
#             cv=5,
#             n_jobs=-1
#         )
        
#         grid_search.fit(X_train, y_train)
        
#         # Best estimator
#         best_model = grid_search.best_estimator_
        
#         # Predict on test set
#         y_pred = best_model.predict(X_test)
#         y_proba = best_model.predict_proba(X_test)[:, 1] if hasattr(best_model, "predict_proba") else None
        
#         # Evaluation Metrics
#         recall = recall_score(y_test, y_pred)
#         precision = precision_score(y_test, y_pred)
#         f1 = f1_score(y_test, y_pred)
#         # roc_auc = roc_auc_score(y_test, y_proba) if y_proba is not None else None                 # UNCOMMENT THIS SECTION
#         # pr_auc = average_precision_score(y_test, y_proba) if y_proba is not None else None
        
#         # Log parameters and metrics to MLflow
#         mlflow.log_params(grid_search.best_params_)
#         mlflow.log_metric("recall", recall)
#         mlflow.log_metric("precision", precision)
#         mlflow.log_metric("f1_score", f1)
#         # if roc_auc is not None:                           # UNCOMMENT THIS SECTION 
#         #     mlflow.log_metric("roc_auc", roc_auc)
#         # if pr_auc is not None:
#         #     mlflow.log_metric("pr_auc", pr_auc)
        
#         # Log model
#         mlflow.sklearn.log_model(best_model, model_name)
        
#         # Print classification report
#         print("Best Parameters:", grid_search.best_params_)
#         print("Classification Report:")
#         print(classification_report(y_test, y_pred))
#         print("Confusion Matrix:")
#         print(confusion_matrix(y_test, y_pred))

In [2]:
# # Logistic Regression
# lr_params = {
#     'C': [0.01, 0.1, 1, 10],
#     'class_weight': [None, 'balanced'],
#     'penalty': ['l2'],
#     'solver': ['lbfgs']
# }

# # Support Vector Machine
# svm_params = {
#     'C': [0.1, 1, 10],
#     'kernel': ['linear', 'rbf'],
#     'class_weight': [None, 'balanced']
# }

# # Random Forest
# rf_params = {
#     'n_estimators': [100, 200],
#     'max_depth': [None, 10, 20],
#     'class_weight': [None, 'balanced']
# }

# # XGBoost
# xgb_params = {
#     'n_estimators': [100, 200],
#     'max_depth': [3, 6],
#     'scale_pos_weight': [(len(y_train) - sum(y_train)) / sum(y_train)],  # Adjust for imbalance
#     'learning_rate': [0.01, 0.1]
# }

In [1]:
# # Logistic Regression
# lr_model = LogisticRegression(max_iter=1000, random_state=42)
# train_and_evaluate(lr_model, lr_params, "Logistic Regression")

# # Support Vector Machine
# svm_model = SVC(probability=True, random_state=42)
# train_and_evaluate(svm_model, svm_params, "Support Vector Machine")

# # Random Forest
# rf_model = RandomForestClassifier(random_state=42)
# train_and_evaluate(rf_model, rf_params, "Random Forest")

# # XGBoost
# xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
# train_and_evaluate(xgb_model, xgb_params, "XGBoost")