In [14]:
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import resample
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import random
from torchtext.vocab import build_vocab_from_iterator
from nltk.tokenize import word_tokenize
import nltk
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [15]:
#  Load the data
domain1_train_data = pd.read_json("domain1_train_data.json", lines=True)
domain2_train_data = pd.read_json("domain2_train_data.json", lines=True)


print(domain1_train_data.head())
print(domain2_train_data.head())

                                                text  label  id
0  [16, 231, 543, 5, 15, 43, 8282, 94, 231, 1129,...      1   0
1  [16, 4046, 138, 10, 2, 1809, 2007, 3763, 14, 4...      1   1
2  [1108, 16550, 3, 6168, 3, 160, 284, 19, 49, 46...      1   2
3  [1802, 27, 16, 25, 48, 451, 632, 3, 2, 2164, 2...      1   3
4  [16, 19, 302, 93, 97, 43, 952, 118, 1, 16, 528...      1   4
                                                text  label    id
0  [12, 920, 7, 1266, 28, 9884, 1640, 116, 11, 13...      1  5000
1  [783, 397, 253, 5797, 9379, 22, 793, 11838, 10...      1  5001
2  [888, 14851, 323, 9, 27, 1377, 584, 195, 3, 13...      1  5002
3  [228, 1161, 5815, 379, 9, 941, 10, 2, 316, 4, ...      1  5003
4  [736, 19, 37, 813, 45, 6723, 27, 626, 8, 2, 34...      1  5004


In [16]:
# get machine and human data
machine = domain2_train_data[domain2_train_data['label'] == 0]
human = domain2_train_data[domain2_train_data['label'] == 1]

# count the number of samples in each class
n_machine = len(machine)
n_human = len(human)

# if the number of samples in 'machine' is greater than the number of samples in 'human'
if n_machine > n_human:
    machine = machine.sample(n_human)

# combine the balanced data
domain2_train_data_balanced = pd.concat([machine, human])

In [17]:
domain2_train_data_balanced = domain2_train_data_balanced.drop(columns='id')
domain1_train_data = domain1_train_data.drop(columns='id')

In [18]:

device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

In [19]:
batch_size = 4
epochs = 35
embedding_dim = 20
hidden_dim = 50
max_len = 30

In [20]:


def seq_collate_batch(batch):
    label_list, text_list = [], []
    for  _text, _label in batch:
        label_list.append(_label)
        text_list.append(sequence_pipeline(_text))
    label_list = torch.tensor(label_list, dtype=torch.float32)

    # Truncate each sequence
    padded_sequences = []
    for seq in text_list:
        # Truncate if longer than max_len
        padded_seq = seq[:max_len]
         # Pad if shorter
        padded_seq += [padding_index] * (max_len - len(padded_seq))
        padded_sequences.append(torch.tensor(padded_seq))
    text_list = torch.stack(padded_sequences)
    # Stack all sequences into a single tensor
    return text_list.to(device), label_list.reshape(-1, 1).to(device)

def seq_collate_test(batch):
    text_list = []
    for  _text in batch:
        text_list.append(sequence_pipeline(_text))

    # Pad or truncate each sequence
    padded_sequences = []
    for seq in text_list:
        # Truncate if longer than max_len
        padded_seq = seq[:max_len]
         # Pad if shorter
        padded_seq += [padding_index] * (max_len - len(padded_seq))
        padded_sequences.append(torch.tensor(padded_seq))
    text_list = torch.stack(padded_sequences)
    # Stack all sequences into a single tensor
    return text_list.to(device)


In [21]:
class SimpleLSTMNetwork(nn.Module):

    def __init__(self, vocab_size, embedding_dim, padding_idx):
        super().__init__()
        self.embedding  = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm_layer = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.forward_layer = nn.Linear(hidden_dim, 1)

    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        h0 = torch.zeros((1, batch_size, hidden_dim)).to(device)
        c0 = torch.zeros((1, batch_size, hidden_dim)).to(device)
        hidden = (h0, c0)
        return hidden


    def forward(self, x):
        embedded = self.embedding(x)

        h0 = self.init_hidden(x.shape[0])
        output, (hidden, cell) = self.lstm_layer(embedded, h0)

        hidden = hidden[-1, :, :]
        logits = torch.sigmoid(self.forward_layer(hidden))
        return logits

In [22]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):

        pred = model(X)
        loss = loss_fn(pred, y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch  == size - 1:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [23]:
def test_without_y(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    arr = []
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X in dataloader:
            pred = model(X)
            result = (pred>0.5).float()
            arr.append(result.data.cpu().numpy())
    return arr


In [24]:

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)

    model.eval()
    test_loss, correct = 0, 0
    test_preds = []
    test_targets = []
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            preds = torch.argmax(pred , dim=1)
            test_loss += loss_fn(pred, y).item()
            result = (pred>0.5).float()
            test_preds.extend(result.tolist())
            test_targets.extend(y.tolist())
            correct += (result == y).type(torch.float).sum().item()
    class_report = classification_report(test_targets, test_preds)
    print("Classification Report:")
    print(class_report)
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [25]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [26]:
domain2_train_data_balanced_upampled = resample(domain2_train_data_balanced,
                replace=True,
                n_samples=len(domain1_train_data),
                random_state=42)

combined_data = pd.concat([domain1_train_data, domain2_train_data_balanced_upampled])
X = combined_data['text']
y = combined_data['label']

from nltk.tokenize import word_tokenize
sequence_pipeline = lambda x: vocab(word_tokenize(x))

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2, shuffle=True)


X_train_str = [' '.join(map(str, lst)) for lst in X_train]
train_iter = X_train_str
def yield_tokens(data_iter):
    for line in data_iter:
        yield line.strip().split()


vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=('<unk>', '<pad>'))
vocab.set_default_index(vocab['<unk>'])
padding_index = vocab['0']
sequence_pipeline = lambda x: vocab(word_tokenize(x))

vectorizer_rnn = CountVectorizer(tokenizer=word_tokenize, vocabulary=vocab.get_stoi(), lowercase=True, ngram_range=(1,2))

X_val_str = [' '.join(map(str, lst)) for lst in X_val]
X_train_vec = vectorizer_rnn.fit_transform(X_train_str).toarray()
X_val_vec = vectorizer_rnn.transform(X_val_str).toarray()
test_data = pd.read_json('test_data.json', lines=True)
test_texts = [' '.join(map(str, lst)) for lst in test_data['text']]
X_test = vectorizer_rnn.transform(test_texts).toarray()
vocab_size = X_train_vec.shape[1]


train_dl_LSTM = DataLoader(list(zip(X_train_str, y_train)), batch_size= batch_size, collate_fn=seq_collate_batch, shuffle=True)
val_dl_LSTM = DataLoader(list(zip(X_val_str, y_val)), batch_size= batch_size, collate_fn=seq_collate_batch)
test_dl_LSTM = DataLoader(list(test_texts), batch_size = batch_size, collate_fn=seq_collate_test)

LSTM_Model = SimpleLSTMNetwork(vocab_size, embedding_dim, padding_index).to(device)
print(LSTM_Model)

loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(LSTM_Model.parameters(), lr=0.001)
for t in range(epochs):
    print(f"Epoch {t + 1}\n-------------------------------")
    train(train_dl_LSTM , LSTM_Model, loss_fn, optimizer)
    test(val_dl_LSTM, LSTM_Model, loss_fn)
print("Done!")

print("final test:")
predictions_LSTM_Model = test_without_y(test_dl_LSTM, LSTM_Model, loss_fn)

predictions_LSTM_Model_list = []
for batch in predictions_LSTM_Model:
    for x in batch:
        predictions_LSTM_Model_list.append(x[0])

submission = pd.DataFrame({
    'id': range(len(predictions_LSTM_Model_list)),
    'class': predictions_LSTM_Model_list
    })
submission.to_csv('results/LSTMPytorchsModel.csv', index=False)



SimpleLSTMNetwork(
  (embedding): Embedding(38431, 20, padding_idx=0)
  (lstm_layer): LSTM(20, 50, batch_first=True)
  (forward_layer): Linear(in_features=50, out_features=1, bias=True)
)
Epoch 1
-------------------------------
Classification Report:
              precision    recall  f1-score   support

         0.0       0.55      0.72      0.63       983
         1.0       0.62      0.44      0.52      1017

    accuracy                           0.58      2000
   macro avg       0.59      0.58      0.57      2000
weighted avg       0.59      0.58      0.57      2000

Test Error: 
 Accuracy: 57.8%, Avg loss: 0.678957 

Epoch 2
-------------------------------
Classification Report:
              precision    recall  f1-score   support

         0.0       0.70      0.63      0.67       983
         1.0       0.68      0.74      0.71      1017

    accuracy                           0.69      2000
   macro avg       0.69      0.69      0.69      2000
weighted avg       0.69      0.69  