# In [None]:
import torch
import torch.nn as nn
import pickle
import pandas as pd
from tqdm import tqdm 

class LSTMClassifier(nn.Module):
    """Stacked LSTM followed by batch-norm, ReLU, dropout and a linear head.

    The last time step of the LSTM output is normalised, activated,
    regularised and projected to ``output_dim`` raw scores (no softmax is
    applied here).

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output scores per sample.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability used between LSTM layers and before
            the final linear layer.
    """

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int,
                 n_layers: int = 2, dropout: float = 0.5) -> None:
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        # nn.LSTM applies dropout only *between* stacked layers and emits a
        # warning when a non-zero value is combined with a single layer, so
        # pass 0 in that case (the result is the same, minus the warning).
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_dim, hidden_dim, n_layers, batch_first=True, dropout=lstm_dropout)
        self.bn = nn.BatchNorm1d(hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return one row of ``output_dim`` scores per sample in ``x``.

        Args:
            x: Batch-first input, i.e. ``(batch, seq_len, input_dim)``
                (the LSTM is built with ``batch_first=True``).

        Returns:
            Tensor of shape ``(batch, output_dim)``.
        """
        # No explicit (h0, c0): nn.LSTM defaults both to zeros created with
        # the input's dtype and device, so the model also works after
        # .double()/.half() or when moved to another device.
        out, _ = self.lstm(x)
        # Keep only the final time step of the top layer.
        out = self.bn(out[:, -1, :])
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc(out)
        return out

# Network dimensions used to rebuild the architecture before loading weights.
# presumably input_dim equals the vectorizer's feature count -- verify.
input_dim = 121853     # Must be consistent with training
hidden_dim = 128
output_dim = 2
model = LSTMClassifier(input_dim, hidden_dim, output_dim)

# Model path
# map_location="cpu" allows a checkpoint that was saved from a GPU to be
# restored on a machine without CUDA; the rest of this script feeds the model
# CPU tensors only.
# NOTE(security): torch.load unpickles the file -- only load trusted checkpoints.
model.load_state_dict(torch.load("lstm_model.pth", map_location="cpu"))
model.eval()  # inference mode: disables dropout, batch-norm uses running stats

# Vectorizer path
# NOTE(security): pickle.load can execute arbitrary code contained in the
# file; only open vectorizer files that come from a trusted source.
with open('lstm_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

# Make sure the CSV file uses the encoding pandas expects (UTF-8 by default), or manually add an `encoding=` argument to the pd.read_csv calls below.
def load_and_preprocess_inference_data(file_path,chunk_size=20000):
    """Stream a CSV file and yield vectorized features chunk by chunk.

    The columns ``paper_title``, ``paper_summary`` and
    ``author_keyword_json`` are cast to ``str`` (in place on the chunk),
    joined with single spaces and passed to the module-level ``vectorizer``.

    Args:
        file_path: Path of the CSV file to read.
        chunk_size: Number of rows read per chunk.

    Yields:
        ``(features, chunk)`` where ``features`` is a ``torch.float32``
        tensor with one row per row of ``chunk`` and ``chunk`` is the pandas
        DataFrame the features were computed from.
    """
    # A separate full pass over the file, needed only so that tqdm can show
    # a total number of chunks.
    total_chunks = sum(1 for _ in pd.read_csv(file_path, chunksize=chunk_size))
    chunks = pd.read_csv(file_path, chunksize=chunk_size)
    text_columns = ('paper_title', 'paper_summary', 'author_keyword_json')
    for chunk in tqdm(chunks, total=total_chunks, desc="Processing chunks"):
        # NOTE(review): astype(str) also stringifies missing values;
        # presumably training did the same -- confirm before changing.
        for column in text_columns:
            chunk[column] = chunk[column].astype(str)
        # Column-wise zip builds the same strings as a row-wise
        # DataFrame.apply(axis=1) without creating a Series per row.
        texts = [
            f"{title} {summary} {keywords}"
            for title, summary, keywords in zip(
                chunk['paper_title'],
                chunk['paper_summary'],
                chunk['author_keyword_json'],
            )
        ]
        # Cast to float32 while the matrix is still sparse so the dense
        # array is allocated once at the final precision instead of as a
        # larger-dtype array plus a float32 copy.
        # assumes transform() returns a SciPy sparse matrix -- TODO confirm
        X = vectorizer.transform(texts).astype("float32").toarray()
        # from_numpy shares memory with X, avoiding another copy.
        yield torch.from_numpy(X), chunk

def _predict(model, data):
    """Return the index of the highest score for every row of ``data``."""
    model.eval()
    with torch.no_grad():
        # unsqueeze(1) inserts a length-1 axis at position 1, turning the
        # 2-D feature matrix into the 3-D batch-first input the model takes.
        outputs = model(data.unsqueeze(1))
        predicted = outputs.argmax(dim=1)
    return predicted.cpu().numpy()


def inference_in_chunks(file_path, output_path, chunk_size=20000):
    """Classify every row of a CSV file and write the annotated rows to CSV.

    The input is processed ``chunk_size`` rows at a time with the
    module-level ``model``; the predicted class index is stored in a new
    ``is_ai`` column and all chunks are concatenated and written to
    ``output_path`` without the index.

    Args:
        file_path: Path of the CSV file to classify.
        output_path: Path of the CSV file to write.
        chunk_size: Number of rows processed per chunk.

    Raises:
        ValueError: If the input yields no chunks, so there is nothing to
            write.
    """
    all_chunks = []
    for X_inference, chunk in load_and_preprocess_inference_data(file_path, chunk_size):
        chunk['is_ai'] = _predict(model, X_inference)
        all_chunks.append(chunk)

    if not all_chunks:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"No rows found in {file_path!r}; nothing to write")

    result = pd.concat(all_chunks, ignore_index=True)
    result.to_csv(output_path, index=False)
    
# Destination CSV that receives the rows together with their predictions
output_path = 'lstm_model_result_ClassC.csv'

# Source CSV whose rows are to be classified
file_path = 'C类.csv'

# Run the chunked inference with the default chunk size
inference_in_chunks(file_path=file_path, output_path=output_path)