# In [None]:  (Jupyter notebook cell marker)
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle

# Path to the training data CSV file
file_path = '76000train.csv'
# Number of passes over the training set
num_epochs = 50

# Read the CSV in chunks, split every chunk, then vectorize the texts
def load_and_split_data(file_path, chunk_size=20000, test_size=0.2, random_state=42):
    """Load the dataset in chunks, split it, and build bag-of-words features.

    Every chunk read from the CSV is split into train/test portions on its
    own (always with the same ``random_state``) and the portions are
    concatenated across chunks. A ``CountVectorizer`` is then fitted on the
    training texts only and applied to both sets.

    Args:
        file_path: Path of the CSV file. It must provide the columns
            ``paper_title``, ``paper_summary``, ``author_keyword_json`` and
            ``is_ai``.
        chunk_size: Number of rows read per chunk.
        test_size: Forwarded to ``train_test_split`` for every chunk.
        random_state: Forwarded to ``train_test_split`` for every chunk.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, vectorizer)``: the
        ``X_*`` values are dense count matrices, the ``y_*`` values are NumPy
        arrays built from the ``is_ai`` column, and ``vectorizer`` is the
        fitted ``CountVectorizer``.
    """
    text_columns = ('paper_title', 'paper_summary', 'author_keyword_json')
    train_texts, test_texts = [], []
    train_labels, test_labels = [], []

    chunks = pd.read_csv(file_path, chunksize=chunk_size)
    for chunk in tqdm(chunks, desc="Processing chunks"):
        # One document per row: title, summary and keywords joined by spaces.
        # Zipping the columns avoids the per-row Series construction that
        # DataFrame.apply(axis=1) performs, while producing the same strings.
        titles, summaries, keywords = (chunk[col].astype(str) for col in text_columns)
        texts = [
            f"{title} {summary} {keyword}"
            for title, summary, keyword in zip(titles, summaries, keywords)
        ]
        labels = chunk['is_ai'].tolist()

        if len(texts) < 2:
            # train_test_split cannot produce two non-empty parts from a
            # single row (e.g. a short trailing chunk) and raises ValueError;
            # keep such a row for training instead of aborting the load.
            train_texts.extend(texts)
            train_labels.extend(labels)
            continue

        # Split training and test sets
        texts_train, texts_test, labels_train, labels_test = train_test_split(
            texts, labels, test_size=test_size, random_state=random_state
        )
        train_texts.extend(texts_train)
        test_texts.extend(texts_test)
        train_labels.extend(labels_train)
        test_labels.extend(labels_test)

    # Vectorization: the vocabulary is learned from the training texts only
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(train_texts).toarray()
    X_test = vectorizer.transform(test_texts).toarray()
    y_train = np.array(train_labels)
    y_test = np.array(test_labels)
    return X_train, X_test, y_train, y_test, vectorizer

# Build the train/test feature matrices and the fitted vectorizer
print("Start loading and splitting data")
X_train, X_test, y_train, y_test, vectorizer = load_and_split_data(file_path)
print("Successfully loaded and partitioned dataset")

# Features are stored as float32 tensors, labels as int64 (long) tensors
print("Start converting to PyTorch tensor")
X_train = torch.as_tensor(X_train, dtype=torch.float32)
y_train = torch.as_tensor(y_train, dtype=torch.long)
X_test = torch.as_tensor(X_test, dtype=torch.float32)
y_test = torch.as_tensor(y_test, dtype=torch.long)
print("Successfully converting to PyTorch tensor")

# Wrap the tensors in datasets and serve them in mini-batches;
# only the training loader shuffles its samples
batch_size = 32
train_data = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_data = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

class LSTMClassifier(nn.Module):
    """Sequence classifier: LSTM -> BatchNorm -> ReLU -> Dropout -> Linear.

    Only the LSTM output of the last time step is fed to the classification
    head.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Size of the output vector (one logit per class).
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability used between LSTM layers and before the
            final linear layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, n_layers=2, dropout=0.5):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero value is given for a single layer, so pass it only when it
        # can have an effect.
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_dim, hidden_dim, n_layers, batch_first=True, dropout=lstm_dropout)
        self.bn = nn.BatchNorm1d(hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return class logits for a batch of sequences.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_dim)`` (the LSTM is
                built with ``batch_first=True``).

        Returns:
            Tensor of shape ``(batch, output_dim)``.
        """
        # No explicit (h0, c0): nn.LSTM defaults both to zeros that match the
        # input's device and dtype, which avoids allocating them on the CPU
        # first and keeps the model usable after .double()/.half() casts.
        out, _ = self.lstm(x)
        # Keep only the last time step: (batch, hidden_dim)
        out = self.bn(out[:, -1, :])
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc(out)
        return out

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Model dimensions: the input width is the number of feature columns in X_train
input_dim = X_train.shape[1]
hidden_dim = 128
output_dim = 2
model = LSTMClassifier(input_dim, hidden_dim, output_dim)
model = model.to(device)
criterion = nn.CrossEntropyLoss()

# Learning rate and weight decay can be adjusted
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-4)

# Per-epoch history of losses and accuracies
train_losses, train_accuracies = [], []
test_losses, test_accuracies = [], []
# Predictions and ground-truth labels collected during evaluation
all_predicted_test, all_actual_test = [], []

print('input_dim=',input_dim)
print("Start Training")

# Train for num_epochs epochs, evaluating on the test set after each one
for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    train_loss = 0.0
    correct_train = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch [{epoch+1}/{num_epochs}]")

    for batch_X, batch_y in progress_bar:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        # Insert a sequence axis of length 1: (batch, features) -> (batch, 1, features)
        outputs = model(batch_X.unsqueeze(1))
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        # The criterion returns the mean over the batch; weight it by the
        # batch size so that dividing by the dataset size below yields the
        # true per-sample mean (the last batch may be smaller).
        train_loss += loss.item() * batch_y.size(0)
        predicted_train = outputs.argmax(dim=1)
        correct_train += (predicted_train == batch_y).sum().item()

    train_loss /= len(train_loader.dataset)
    train_accuracy = 100 * correct_train / len(train_loader.dataset)
    train_losses.append(train_loss)
    train_accuracies.append(train_accuracy)

    # ---- Evaluation pass ----
    model.eval()
    test_loss = 0.0
    correct_test = 0
    # Start from empty lists every epoch so the precision/recall/F1 computed
    # after the loop describe the final model only, not a mix of the
    # predictions made in every epoch.
    all_predicted_test = []
    all_actual_test = []
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            outputs = model(batch_X.unsqueeze(1))
            # Same size-weighting as in training (see above)
            test_loss += criterion(outputs, batch_y).item() * batch_y.size(0)
            predicted_test = outputs.argmax(dim=1)
            correct_test += (predicted_test == batch_y).sum().item()
            all_predicted_test.extend(predicted_test.cpu().numpy())
            all_actual_test.extend(batch_y.cpu().numpy())

    test_loss /= len(test_loader.dataset)
    test_accuracy = 100 * correct_test / len(test_loader.dataset)
    test_losses.append(test_loss)
    test_accuracies.append(test_accuracy)
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")

# Binary precision / recall / F1 on the test predictions of the last epoch
precision, recall, f1, _ = precision_recall_fscore_support(all_actual_test, all_predicted_test, average='binary')
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
        
# Draw the accuracy and loss histories side by side
epochs = range(1, num_epochs + 1)
plt.figure(figsize=(12, 5))

# (panel title / y-label, (train values, train label), (test values, test label))
panels = (
    ('Accuracy', (train_accuracies, 'Train Accuracy'), (test_accuracies, 'Test Accuracy')),
    ('Loss', (train_losses, 'Train Loss'), (test_losses, 'Test Loss')),
)
for position, (metric_name, train_curve, test_curve) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    # Training curve in blue, test curve in red
    plt.plot(epochs, train_curve[0], 'b', label=train_curve[1])
    plt.plot(epochs, test_curve[0], 'r', label=test_curve[1])
    plt.title(metric_name)
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.legend()

plt.tight_layout()
# Write the figure to disk before displaying it
plt.savefig('lstm_0.0005_1e-4.png')
plt.show()

# Persist the trained weights
torch.save(model.state_dict(), "lstm_model.pth")

# Persist the fitted vectorizer alongside the weights
with open('lstm_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)