In [101]:
%run ./seg_fraud.ipynb

DONE, SET LENGTH: 3234


In [102]:
%run ./seg_normal.ipynb

DONE, SET LENGTH: 13265


In [103]:
%run ./fish_cleaner.ipynb

In [104]:
f1 = open('fraud.txt', 'r')
f2 = open('enron.txt', 'r')

corpus_one = [item for item in f1]
corpus_two = [item for item in f2]

fs = split_set(corpus_one)
ns = split_enron_set(corpus_two)

fraud_set = []
norm_set = []

for i in range(len(fs)):
    fraud_set.append(super_clean(fs[i]))
for i in range(len(ns)):
    norm_set.append(super_clean(ns[i]))    

In [105]:
fraud_set = fraud_set[:6]
norm_set = norm_set[:6]

In [106]:
def label(fraud, norm):
    new_set = []
    for item in fraud:
        t = (item, 'fraud')
        new_set.append(t)
    for item in norm:
        t = (item, 'norm')
        new_set.append(t)
    return new_set    

In [107]:
x = label(fraud_set,norm_set)

#NECESSARY IMPORTS:
#torch
#pytorch-pretrained-bert
#torchnlp

In [108]:
import pandas as pd 
import numpy as np 
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel
import torch
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
import torch.utils.data

In [109]:
g = pd.DataFrame(x)

In [110]:
pd.set_option('display.max_columns', None)
df = g
print(df.head())


                                                   0      1
0                  FROM:MR. JAMES NGOLA.    URGEN...  fraud
1               Dear Friend,  I am Mr. Ben Sulema...  fraud
2   FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF E...  fraud
3   FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF E...  fraud
4   Dear sir,    It is with a heart full of hope ...  fraud


In [111]:
from collections import Counter
print(Counter(df[1].values))

Counter({'fraud': 6, 'norm': 6})


In [112]:
df.dropna(inplace = True)


In [113]:
df = df.rename(columns={0: "text", 1: "type"})

In [114]:
df = df.replace('pos', 'fraud')
df = df.replace('neg', 'norm')


In [115]:
df.head()

Unnamed: 0,text,type
0,FROM:MR. JAMES NGOLA. URGEN...,fraud
1,"Dear Friend, I am Mr. Ben Sulema...",fraud
2,FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF E...,fraud
3,FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF E...,fraud
4,"Dear sir, It is with a heart full of hope ...",fraud


In [116]:
df_fake = df[df['type'] == 'fraud'] 
df_statire = df[df['type'] == 'norm'] 
df_statire = df_statire.sample(n=len(df_fake))
df = df_statire.append(df_fake)
df = df.sample(frac=1, random_state = 24).reset_index(drop=True)

In [117]:
train_data = df.head(5)
test_data = df.tail(5)

In [118]:
train_data.head()

Unnamed: 0,text,type
0,PRESIDENT/MANAGING DIRECTOR Dear Sir/Mada...,fraud
1,"Here is our forecast """,norm
2,FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF E...,fraud
3,"Dear sir, It is with a heart full of hope ...",fraud
4,"Let's shoot for Tuesday at 11:45. "" ...",norm


In [119]:
test_data.head()

Unnamed: 0,text,type
7,"""file"",""message""",norm
8,"Dear Friend, I am Mr. Ben Sulema...",fraud
9,Traveling to have a business meeting takes th...,norm
10,"test successful. way to go!!!""",norm
11,"Randy, Can you send me a schedule of the sa...",norm


In [120]:
train_data = [{'text': text, 'type': type_data } for text in list(train_data['text']) for type_data in list(train_data['type'])]
test_data = [{'text': text, 'type': type_data } for text in list(test_data['text']) for type_data in list(test_data['type'])]

In [121]:
train_texts, train_labels = list(zip(*map(lambda d: (d['text'], d['type']), train_data)))
test_texts, test_labels = list(zip(*map(lambda d: (d['text'], d['type']), test_data)))

In [122]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], train_texts))
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], test_texts))
train_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, train_tokens))
test_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, test_tokens))
train_tokens_ids = pad_sequences(train_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(test_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")

In [123]:
print(len(train_tokens))
print(len(test_tokens))

25
25


In [124]:
print(train_tokens[0])

['[CLS]', 'president', '/', 'managing', 'director', 'dear', 'sir', '/', 'madam', ',', 'request', 'for', 'urgent', 'business', 'relationship', 'we', 'are', 'top', 'officials', 'of', 'the', 'federal', 'government', 'of', 'nigeria', 'contract', 'review', 'panel', 'who', 'are', 'interested', 'in', 'import', '##ation', 'of', 'goods', 'into', 'our', 'country', 'and', 'investing', 'abroad', 'with', 'funds', 'which', 'are', 'presently', 'trapped', 'in', 'nigeria', '.', 'in', 'order', 'to', 'commence', 'this', 'business', 'we', 'sol', '##ici', '##t', 'your', 'assistance', ',', 'knowledge', 'and', 'expertise', 'to', 'enable', 'us', 'rec', '##ie', '##ve', 'the', 'said', 'trapped', 'funds', 'abroad', ',', 'for', 'the', 'subsequent', 'purchase', 'and', 'inventory', 'of', 'the', 'goods', 'to', 'be', 'imported', 'and', 'the', 'investment', 'abroad', '.', 'previous', 'military', 'regimes', 'in', 'our', 'country', ',', 'government', 'officials', 'set', 'up', 'companies', 'and', 'awarded', 'themselves',

In [125]:
print(len(train_labels))
print(type(train_labels))

25
<class 'tuple'>


In [126]:
train_y = []
test_y = []

t1 = []
t2 = []

for item in train_labels:
    if str(item) == 'fraud': t1.append(1)
    else: t1.append(0)    

        

for item in test_labels:
    if str(item) == 'fraud': t2.append(1)
    else: t2.append(0)        

#train_y_step = np.array(train_labels) == 'fake'
#test_y_step = np.array(test_labels) == 'fake'



#for i in range(len(train_y_step)):
    #train_y.append(int(train_y_step[i]))
#for i in range(len(test_y_step)):
    #test_y.append(int(test_y_step[i]))
    
train_y = np.array(t1)
test_y = np.array(t2)

In [127]:
class BertBinaryClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()
    
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, tokens, masks=None):
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba

In [128]:
train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]
train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

In [129]:
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()
test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

In [130]:
train_dataset =  torch.utils.data.TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler =  torch.utils.data.RandomSampler(train_dataset)
train_dataloader =  torch.utils.data.DataLoader(train_dataset, sampler=train_sampler, batch_size=1)
test_dataset =  torch.utils.data.TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler =  torch.utils.data.SequentialSampler(test_dataset)
test_dataloader =  torch.utils.data.DataLoader(test_dataset, sampler=test_sampler, batch_size=1)

STATEMENT BELOW ITERATES THROUGH EACH ITEM IN THE TRAINING DATASET AT ONE EPOCH

In [131]:
BATCH_SIZE = 1
EPOCHS = 1
bert_clf = BertBinaryClassifier()
optimizer = torch.optim.Adam(bert_clf.parameters(), lr=3e-6)
for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t for t in batch_data)
        probas = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        batch_loss = loss_func(probas, labels)
        train_loss += batch_loss.item()
        bert_clf.zero_grad()
        batch_loss.backward()
        optimizer.step()
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_data) / BATCH_SIZE, train_loss / (step_num + 1)))

Epoch:  1
0/25.0 loss: 0.9485976696014404 
Epoch:  1
1/25.0 loss: 0.9022595286369324 
Epoch:  1
2/25.0 loss: 0.8845586578051249 
Epoch:  1
3/25.0 loss: 0.8851230293512344 
Epoch:  1
4/25.0 loss: 0.8742474436759948 
Epoch:  1
5/25.0 loss: 0.8312385082244873 
Epoch:  1
6/25.0 loss: 0.8292135681424823 
Epoch:  1
7/25.0 loss: 0.793636217713356 
Epoch:  1
8/25.0 loss: 0.7951389816072252 
Epoch:  1
9/25.0 loss: 0.8045317590236664 
Epoch:  1
10/25.0 loss: 0.7953537377444181 
Epoch:  1
11/25.0 loss: 0.8049680640300115 
Epoch:  1
12/25.0 loss: 0.8032556955630963 
Epoch:  1
13/25.0 loss: 0.7851470283099583 
Epoch:  1
14/25.0 loss: 0.7668418884277344 
Epoch:  1
15/25.0 loss: 0.7542024962604046 
Epoch:  1
16/25.0 loss: 0.7707825232954586 
Epoch:  1
17/25.0 loss: 0.7781277696291605 
Epoch:  1
18/25.0 loss: 0.7848568997885051 
Epoch:  1
19/25.0 loss: 0.775133803486824 
Epoch:  1
20/25.0 loss: 0.7780556962603614 
Epoch:  1
21/25.0 loss: 0.7663847506046295 
Epoch:  1
22/25.0 loss: 0.7730974969656571 


In [132]:
bert_clf.eval()
bert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):
        token_ids, masks, labels = tuple(t for t in batch_data)
        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()
        
        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])
        
print(classification_report(test_y, bert_predicted))


              precision    recall  f1-score   support

           0       0.80      0.20      0.32        20
           1       0.20      0.80      0.32         5

   micro avg       0.32      0.32      0.32        25
   macro avg       0.50      0.50      0.32        25
weighted avg       0.68      0.32      0.32        25

