In [1]:
import ast
import os
import random

import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from torch.autograd import Variable
from torch.utils.data import DataLoader, TensorDataset
from torchtext.vocab import build_vocab_from_iterator


In [2]:
# Read both training corpora; lines=True parses each file as one JSON record per line.
domain1_train_data, domain2_train_data = (
    pd.read_json(json_path, lines=True)
    for json_path in ("domain1_train_data.json", "domain2_train_data.json")
)


In [3]:
# Split the domain-2 training rows by label (label 0 is treated as machine,
# label 1 as human, following the variable names used here).
machine = domain2_train_data[domain2_train_data['label'] == 0]
human = domain2_train_data[domain2_train_data['label'] == 1]

# count the number of samples in each class
n_machine = len(machine)
n_human = len(human)

# Downsample whichever class is larger to the size of the smaller one.
# A fixed random_state makes re-running the cell pick the same rows, so the
# balanced set (and everything trained on it) is reproducible.
n_per_class = min(n_machine, n_human)
if n_machine > n_per_class:
    machine = machine.sample(n_per_class, random_state=42)
if n_human > n_per_class:
    human = human.sample(n_per_class, random_state=42)

# combine the balanced data
domain2_train_data_balanced = pd.concat([machine, human])


In [4]:
# Drop the 'id' column from both training frames.
# errors='ignore' keeps this cell re-runnable: the frames are reassigned in
# place, so on a second execution 'id' is already gone and a plain drop()
# would raise KeyError.
domain2_train_data_balanced = domain2_train_data_balanced.drop(columns='id', errors='ignore')
domain1_train_data = domain1_train_data.drop(columns='id', errors='ignore')

In [5]:

# Select the compute device by preference: CUDA first, then the MPS backend,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [6]:
# Mini-batch size handed to the DataLoaders and number of passes over the
# training set.
batch_size, epochs = 4, 10
# Layer-size settings (not referenced by the MLP cells visible in this notebook).
embedding_dim = hidden_dim = 15

In [7]:
def bow_collate_batch(batch):
    """Collate ``(features, label)`` pairs into float32 tensors on ``device``.

    Args:
        batch: Iterable of ``(features, label)`` pairs. Each ``features`` is
            expected to be an equal-length numeric vector (e.g. one row of a
            dense feature matrix) and each ``label`` a number.

    Returns:
        tuple: ``(features, labels)`` where ``features`` stacks the vectors
        along a new leading axis and ``labels`` is reshaped to a single
        column. Both are ``torch.float32`` and moved to the module-level
        ``device``.
    """
    label_list, text_list = [], []
    for _text, _label in batch:
        label_list.append(_label)
        text_list.append(_text)

    labels = torch.tensor(label_list, dtype=torch.float32)
    # Stack into a single ndarray before handing over to torch: building a
    # tensor straight from a Python list of ndarrays is a known slow path.
    features = torch.as_tensor(np.asarray(text_list), dtype=torch.float32)

    return features.to(device), labels.reshape(-1, 1).to(device)

def test_batch(batch):
    """Collate unlabeled feature vectors into one float32 tensor on ``device``.

    Args:
        batch: Iterable of equal-length numeric feature vectors.

    Returns:
        torch.Tensor: The vectors stacked along a new leading axis, as
        ``torch.float32``, moved to the module-level ``device``.
    """
    # Stack into a single ndarray first: building a tensor straight from a
    # Python list of ndarrays is a known slow path.
    features = torch.as_tensor(np.asarray(list(batch)), dtype=torch.float32)
    return features.to(device)


In [8]:
class MLP(nn.Module):
    """Two-layer perceptron: linear -> ReLU -> linear -> sigmoid."""

    def __init__(self, input_size, hidden_size, output_size):
        """Create the hidden and output linear layers with the given widths."""
        super().__init__()
        self.hidden_layer = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.output_layer = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the sigmoid-squashed output of the network for input ``x``."""
        hidden = self.hidden_layer(x).relu()
        return self.output_layer(hidden).sigmoid()

In [9]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one training epoch and print the loss of the final batch.

    Args:
        dataloader: Iterable of ``(X, y)`` batches that also exposes
            ``.dataset`` and ``len()``.
        model: Module called as ``model(X)``; switched to train mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer stepping ``model``'s parameters.

    Returns:
        None. One progress line is printed after the last batch.
    """
    size = len(dataloader.dataset)
    # Index of the final batch. The report must trigger on the batch count,
    # not the sample count, otherwise it never fires when batch_size > 1.
    last_batch = len(dataloader) - 1
    seen = 0

    model.train()
    for batch, (X, y) in enumerate(dataloader):

        pred = model(X)
        loss = loss_fn(pred, y)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Count real samples so a short final batch is not over-reported.
        seen += len(X)

        if batch == last_batch:
            loss, current = loss.item(), seen
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [10]:
def test_without_y(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    arr = []
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X in dataloader:
            pred = model(X)
            result = (pred>0.5).float()
            arr.append(result.data.cpu().numpy())
    return arr


In [11]:
def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on a labeled loader and print summary metrics.

    Predictions are thresholded at 0.5. The function prints scikit-learn's
    classification report followed by the accuracy (correct / dataset size)
    and the loss averaged over batches.

    Args:
        dataloader: Iterable of ``(X, y)`` batches that also exposes
            ``.dataset`` and ``len()``.
        model: Module called as ``model(X)``; switched to eval mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.

    Returns:
        None.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)

    model.eval()
    test_loss, correct = 0, 0
    test_preds = []
    test_targets = []
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            # Hard decision: output strictly above 0.5 counts as class 1.
            result = (pred > 0.5).float()
            test_preds.extend(result.tolist())
            test_targets.extend(y.tolist())
            correct += (result == y).type(torch.float).sum().item()
    class_report = classification_report(test_targets, test_preds)
    print("Classification Report:")
    print(class_report)
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [12]:
# Download NLTK's 'punkt' tokenizer data (the notebook tokenizes with nltk's word_tokenize).
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Resample the balanced domain-2 frame (with replacement) to exactly as many
# rows as domain 1, so both domains contribute equally to the combined set.
# NOTE(review): resampling happens before the train/validation split further
# down, so copies of one row may land on both sides of the split — confirm
# this is acceptable for the reported validation metrics.
domain2_train_data_balanced_upampled = resample(
    domain2_train_data_balanced,
    n_samples=len(domain1_train_data),
    replace=True,
    random_state=42,
)

# Stack both domains and pull out the model inputs and targets.
combined_data = pd.concat([domain1_train_data, domain2_train_data_balanced_upampled])
X, y = combined_data['text'], combined_data['label']

from nltk.tokenize import word_tokenize
def sequence_pipeline(x):
    """Tokenize ``x`` with ``word_tokenize`` and pass the tokens through ``vocab``."""
    return vocab(word_tokenize(x))

# Hold out 20% of the combined data for validation (shuffled, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=2
)

# Render every sample as one space-separated string of its stringified items.
X_train_str_MLP = [' '.join(str(tok) for tok in lst) for lst in X_train]
X_val_str_MLP = [' '.join(str(tok) for tok in lst) for lst in X_val]
train_iter = X_train_str_MLP


def yield_tokens(data_iter):
    """Yield, for each line in ``data_iter``, its list of whitespace-separated tokens."""
    yield from (line.strip().split() for line in data_iter)


# Vocabulary over the training strings; unknown tokens map to '<unk>'.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=('<unk>', '<pad>'))
vocab.set_default_index(vocab['<unk>'])
# NOTE(review): this looks up the token '0', not the '<pad>' special declared
# above — confirm that '0' is the intended padding token.
padding_index = vocab['0']
# TF-IDF over word 1- to 3-grams. token_pattern=None states explicitly that
# the default regex is unused because a custom tokenizer is supplied, which
# avoids scikit-learn's "token_pattern will not be used" warning.
vectorizer_mlp = TfidfVectorizer(
    tokenizer=word_tokenize,
    token_pattern=None,
    lowercase=True,
    ngram_range=(1, 3),
)

# Fit on the training strings only; validation and test reuse the fitted
# vocabulary and IDF weights.
# NOTE(review): .toarray() materialises dense matrices; with 1-3-gram features
# this can be very large — confirm it fits in memory for the full dataset.
X_train_vec_MLP = vectorizer_mlp.fit_transform(X_train_str_MLP).toarray()
X_val_vec_MLP = vectorizer_mlp.transform(X_val_str_MLP).toarray()

# Load the unlabeled test set and vectorise it the same way as the training text.
test_data = pd.read_json('test_data.json', lines=True)
test_texts = [' '.join(map(str, lst)) for lst in test_data['text']]
X_test = vectorizer_mlp.transform(test_texts).toarray()


# Seed torch's global RNG so weight initialisation and the shuffled batch
# order are repeatable when the cell is re-run on the same setup.
torch.manual_seed(42)

# Wrap the dense feature matrices in DataLoaders; only the training loader shuffles.
train_dl_MLP = DataLoader(list(zip(X_train_vec_MLP, y_train)), batch_size=batch_size, collate_fn=bow_collate_batch, shuffle=True)
val_dl_MLP = DataLoader(list(zip(X_val_vec_MLP, y_val)), batch_size=batch_size, collate_fn=bow_collate_batch)
test_dl_MLP = DataLoader(list(X_test), batch_size= batch_size, collate_fn=test_batch)

# Input width equals the number of TF-IDF features. The hidden width is
# hard-coded to 30 here (the hidden_dim setting is not used), with a single
# sigmoid output.
vocab_size = X_train_vec_MLP.shape[1]
MLP_Model = MLP(vocab_size, 30, 1).to(device)
print(MLP_Model)

# Binary cross-entropy on the sigmoid output, optimised with Adam.
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(MLP_Model.parameters(), lr=0.001)
for t in range(epochs):
    print(f"Epoch {t + 1}\n-------------------------------")
    train(train_dl_MLP , MLP_Model, loss_fn, optimizer)
    test(val_dl_MLP, MLP_Model, loss_fn)
print("Done!")

print("final test:")
predictions_MLP = test_without_y(test_dl_MLP, MLP_Model, loss_fn)

# test_without_y returns one array per batch; take the first (only) output
# column of every row to get a flat list of predictions in loader order.
predictions_MLP_Model = [row[0] for batch_preds in predictions_MLP for row in batch_preds]

# NOTE(review): ids are generated as 0..n-1 in row order — confirm this
# matches the ids expected for test_data.
submission = pd.DataFrame({
    'id': range(len(predictions_MLP_Model)),
    'class': predictions_MLP_Model
    })

# Create the output folder if needed so the write cannot fail after training
# just because 'results/' does not exist yet.
os.makedirs('results', exist_ok=True)
submission.to_csv('results/MLP_Model.csv', index=False)