# Demo code for the email server honeypot

In [1]:
%load_ext autoreload
%autoreload 2
import argparse
from datetime import datetime
import logging

import pickle
import numpy as np
import pandas as pd
import torch
import dpp
import utils
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
)

# Configure logging once for the whole notebook session.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
)
logger = logging.getLogger(__name__)

# Hardcoded upper bound on sequence length, to avoid an infinite loop.
MAX_LENGTH = 10_000
# Maps a model-type string to its (model class, tokenizer class) pair.
MODEL_CLASSES = {"gpt2": (GPT2LMHeadModel, GPT2Tokenizer)}

# Checkpoint directory handed to utils.get_parser and later to from_pretrained.
model_name_or_path = './models/gpt2-email-body'
args = utils.get_parser(model_name_or_path=model_name_or_path)

# Use CUDA only when it is available and not disabled through args.no_cuda.
if torch.cuda.is_available() and not args.no_cuda:
    args.device = torch.device("cuda")
else:
    args.device = torch.device("cpu")
args.n_gpu = torch.cuda.device_count() if not args.no_cuda else 0

logger.warning("device: %s, n_gpu: %s, 16-bits training: %s", args.device, args.n_gpu, args.fp16)

utils.set_seed(args)



In [2]:
#!python3 -m spacy download en_core_web_sm

## Setup LogNormMix-Net TPP Model

In [3]:
# Config
# Seed the NumPy and PyTorch RNGs; the same seed is also passed to
# train_val_test_split in the next cell.
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)
dataset_name = 'enron_email_dataset'  # run dpp.data.list_datasets() to see the list of available datasets

# Model config
## Marks
use_src_marks = True              # Use source marks
src_mark_embedding_size = 24          # Size of the src mark embedding (used as RNN input)
use_dst_marks = True                  # Use destination marks
dst_mark_embedding_size = 24          # Size of the dst mark embedding (used as RNN input)
shared_mark_embedding = False          # Whether source and destination marks share one embedding layer (embedding sizes and mark ranges must then be the same)

context_size = 64                # Size of the RNN hidden vector
num_mix_components = 30           # Number of components for a mixture model
rnn_type = "LSTM"                  # What RNN to use as an encoder {"RNN", "GRU", "LSTM"}
# NOTE(review): the three meta_* settings below are passed straight to
# dpp.models.LogNormMixNet; their exact meaning is not visible in this
# notebook -- confirm against the model implementation.
meta_embedding_size = 16  # presumably the size of the meta-feature embedding -- TODO confirm
num_meta_classes = 3  # presumably the number of distinct meta classes -- TODO confirm
meta_type = 'basic'

# Training config
batch_size = 50       # Number of sequences in a batch

In [4]:
# Load the event dataset and split it into train / validation / test parts.
dataset = dpp.data.load_dataset(dataset_name)
d_train, d_val, d_test = dataset.train_val_test_split(seed=seed)

# Only the training loader is shuffled; the evaluation loaders are not.
dl_train = d_train.get_dataloader(batch_size=batch_size, shuffle=True)
dl_val, dl_test = (
    split.get_dataloader(batch_size=batch_size, shuffle=False)
    for split in (d_val, d_test)
)

# Inter-event-time statistics of the training split, passed to the model below.
mean_log_inter_time, std_log_inter_time = d_train.get_inter_time_statistics()

# Gather the constructor arguments in one place, then build the TPP model.
tpp_model_kwargs = dict(
    use_src_marks=use_src_marks,
    use_dst_marks=use_dst_marks,
    num_src_marks=d_train.num_src_marks,
    num_dst_marks=d_train.num_dst_marks,
    num_meta_classes=num_meta_classes,
    meta_type=meta_type,
    mean_log_inter_time=mean_log_inter_time,
    std_log_inter_time=std_log_inter_time,
    context_size=context_size,
    src_mark_embedding_size=src_mark_embedding_size,
    dst_mark_embedding_size=dst_mark_embedding_size,
    shared_mark_embedding=shared_mark_embedding,
    rnn_type=rnn_type,
    num_mix_components=num_mix_components,
    meta_embedding_size=meta_embedding_size,
)
tpp_model = dpp.models.LogNormMixNet(**tpp_model_kwargs)

train_end: 86
val_end: 115


In [5]:
## LOAD MODEL PARAMS
# Deserialize the saved weights onto the CPU: without map_location, torch.load
# tries to restore each tensor to the device it was saved from, which fails on
# a CPU-only machine when the checkpoint was written on a GPU.
# load_state_dict then copies the values into tpp_model's existing parameters.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
tpp_model.load_state_dict(
    torch.load('./models/enron-event-predict-model', map_location='cpu')
)

<All keys matched successfully>

# Email generation
#### **1. Data Preprocessing:**
   a) create training dataset for the intensity-free TPP model  
   b) create training dataset for finetuning the huggingface GPT2 model  
#### **2. Train the intensity-free TPP model:** LogNormMix-Net model.  
#### **3. Fine tune huggingface/transformers GPT2 model** on the Enron email text.  
#### **4. Generate email traffic:**   
   For each event:  
   i)  generate the timestamp, sender, and recipient set using the TPP model  
   ii) sample the email thread type (new thread, reply, forward) based on sent-email counts from the training data.  
   iii) if reply/forward: choose the most recent email thread appropriate for the recipients and email type.  
   iv) generate the email text by passing the existing email thread to the GPT2 model and asking it to generate 2 sentences of text.  

In [7]:
# Resolve the model and tokenizer classes for the (lower-cased) model type.
args.model_type = args.model_type.lower()
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

# Load tokenizer and weights from the same checkpoint, then move the model
# to the selected device.
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
model = model_class.from_pretrained(args.model_name_or_path).to(args.device)

# Optionally switch the weights to half precision.
if args.fp16:
    model.half()

# Adjust the requested generation length using the model's maximum position count.
args.length = utils.adjust_length_to_model(
    args.length,
    max_sequence_length=model.config.max_position_embeddings,
)

03/29/2022 17:09:04 - INFO - transformers.tokenization_utils_base -   Model name './models/gpt2-email-body' not found in model shortcut name list (gpt2, gpt2-medium, gpt2-large, gpt2-xl, distilgpt2). Assuming './models/gpt2-email-body' is a path, a model identifier, or url to a directory containing tokenizer files.
03/29/2022 17:09:04 - INFO - transformers.tokenization_utils_base -   Didn't find file ./models/gpt2-email-body/tokenizer.json. We won't load it.
03/29/2022 17:09:04 - INFO - transformers.tokenization_utils_base -   loading file ./models/gpt2-email-body/vocab.json
03/29/2022 17:09:04 - INFO - transformers.tokenization_utils_base -   loading file ./models/gpt2-email-body/merges.txt
03/29/2022 17:09:04 - INFO - transformers.tokenization_utils_base -   loading file ./models/gpt2-email-body/added_tokens.json
03/29/2022 17:09:04 - INFO - transformers.tokenization_utils_base -   loading file ./models/gpt2-email-body/special_tokens_map.json
03/29/2022 17:09:04 - INFO - transformers

In [8]:
# Use per-person proportions of new-thread, reply and forward emails from the training dataset.
# The resulting table is passed to utils.run_generation in the generation cell.
message_type_props = pd.read_csv('../data/data_for_simulation/message_count_types.csv')

### Use Faker-generated employee identities
Using fake names is optional.

In [9]:
# Load the table of fake employee identities.
faker_db = pd.read_csv('../data/data_for_simulation/faker_employee_names.csv')

# Build the id -> first name, id -> last name and id -> email lookup tables.
employee_name_map, employee_surname_map, employee_email_map = (
    dict(zip(faker_db["id"], faker_db[column]))
    for column in ("first_name", "last_name", "email")
)

In [10]:
# Display the id -> first-name mapping built in the previous cell.
employee_name_map

{0: 'Elizabeth',
 1: 'Joshua',
 2: 'Haley',
 3: 'Allison',
 4: 'Nathan',
 5: 'Faith',
 6: 'Sandra',
 7: 'John',
 8: 'Albert',
 9: 'Michael',
 10: 'Michelle',
 11: 'Rebekah',
 12: 'Steve',
 13: 'Linda',
 14: 'Andrew',
 15: 'Brian',
 16: 'William',
 17: 'Jill',
 18: 'John',
 19: 'Katelyn',
 20: 'Tiffany',
 21: 'Omar',
 22: 'Vincent',
 23: 'Kayla',
 24: 'Glenn',
 25: 'Lisa',
 26: 'Debbie',
 27: 'Jill',
 28: 'Dylan',
 29: 'Wendy',
 30: 'Kenneth',
 31: 'Tanya',
 32: 'Nicholas',
 33: 'Julie',
 34: 'Aaron',
 35: 'Lauren',
 36: 'Tonya',
 37: 'Travis',
 38: 'Christina',
 39: 'William',
 40: 'Samuel',
 41: 'Toni',
 42: 'Erik',
 43: 'Paul',
 44: 'Ronald',
 45: 'Angela',
 46: 'Joe',
 47: 'Benjamin',
 48: 'Michelle',
 49: 'Laura',
 50: 'Adam',
 51: 'Adrienne',
 52: 'Laura',
 53: 'William'}

## Run generation: sample recipients and email thread, and generate email content

In [11]:
# Generate emails
# Simulation horizon in hours: two weeks.
num_hours = 2 * 7 * 24
## put these in the config yaml?
subjects = "../data/data_for_simulation/subjects_by_ID.csv"
(generated_emails_db, generated_emails_list, recipients_db) = utils.run_generation(
    num_hours,
    model,
    tpp_model,
    args,
    tokenizer,
    employee_name_map,
    employee_email_map,
    message_type_props,
    subjects,
)
## TODO: re-incorporate the times, and convert to DTs?
#emails['dt'] = datetime(2013, 3, 4, 0, 59).astimezone() + pd.to_timedelta(emails['scaled_ts'],'s')
#emails['datetime'] = emails['dt'].apply(lambda x: x.strftime('%a')) + "   "+ emails['dt'].apply(lambda x: datetime.strftime(x, '%d/%m/%y %I:%M %p'))



thread found




thread found




thread found




thread found




thread found




thread found




thread found




thread found




thread found




thread found




thread found




thread found
thread found




thread found




thread found




thread found




thread found
thread found




thread found




thread found




thread found




thread found




thread found




thread found




thread found




thread found




thread found




thread found




thread found




thread found




In [None]:
# Show how many rows the generated emails table contains.
len(generated_emails_db)

In [None]:
# Create the output directory first: DataFrame.to_csv does not create missing
# parent directories and raises an OSError when the target folder is absent.
import os
os.makedirs("../data/generated_output", exist_ok=True)
generated_emails_db.to_csv("../data/generated_output/emails.csv")

In [None]:
# Create the output directory first: DataFrame.to_csv does not create missing
# parent directories and raises an OSError when the target folder is absent.
import os
os.makedirs("../data/generated_output", exist_ok=True)
recipients_db.to_csv("../data/generated_output/generated_recipients_db.csv")

## Demo

In [None]:
# Collect the thread ids of every row whose thread_length is 2, then wrap them
# in an iterator so the next cell can step through them one at a time.
threads_to_preview = generated_emails_db.loc[
    generated_emails_db["thread_length"] == 2, "thread_id"
].tolist()
thread = iter(threads_to_preview)

In [None]:
# Preview the next thread; every run of this cell advances the `thread` iterator.
# A private sentinel marks exhaustion so that no real thread id can be mistaken for it.
_no_more_threads = object()
thread_id = next(thread, _no_more_threads)
if thread_id is _no_more_threads:
    # Report exhaustion instead of failing with a bare StopIteration traceback.
    print("No more threads to preview; re-run the previous cell to start over.")
else:
    # Filter once and reuse the rows for every field printed below.
    thread_emails = generated_emails_db[generated_emails_db.thread_id == thread_id]
    # The last row of the thread supplies the timestamp and the full thread text.
    latest_email = thread_emails.tail(1)
    print(f"EMAIL THREAD #: {thread_id}")
    print(f"Sent: {latest_email['date-time'].iloc[0]}")
    print(f"Subject: {thread_emails['subject'].iloc[0]}")
    print(latest_email['full_email_thread'].iloc[0])