In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the CSV index files that list the training and test samples.
train_df = pd.read_csv("/kaggle/input/ai-vs-human-generated-dataset/train.csv")
test_df = pd.read_csv("/kaggle/input/ai-vs-human-generated-dataset/test.csv")

# Hold out a random 30% of the training rows as a validation set.
# random_state is fixed so the split is the same on every run.
train_df, valid_df = train_test_split(
    train_df,
    test_size=0.3,
    random_state=42,
)

In [None]:
from torchvision import transforms
# Preprocessing pipeline applied to every image by the dataset classes.
transform = transforms.Compose(
    [
        # Resize every image to a fixed 384x384 input.
        transforms.Resize((384, 384)),
        # Convert the image to a tensor.
        transforms.ToTensor(),
        # Normalize with the commonly used ImageNet pre-training mean / std.
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [None]:
import torch
from torch.utils.data import Dataset
import os
from PIL import Image
# Select the compute device: the GPU when CUDA is available, the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Custom dataset classes
class TrainDataset(Dataset):
    """Labelled image dataset backed by a DataFrame.

    Each row of ``df`` supplies a ``file_name`` (image path relative to the
    dataset root directory) and a ``label``.
    """

    def __init__(self, df):
        """Keep a reference to the DataFrame describing the samples."""
        self.df = df

    def __len__(self):
        """Return the number of rows (samples) in the DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image at position ``idx`` and return ``(img_tensor, label)``.

        The image is opened from disk, converted to RGB and passed through the
        module-level ``transform`` pipeline.
        """
        row = self.df.iloc[idx]
        img_path = "/kaggle/input/ai-vs-human-generated-dataset/" + row['file_name']
        image = Image.open(img_path).convert("RGB")
        return transform(image), row['label']

class TestDataset(Dataset):
    """Unlabelled image dataset backed by a DataFrame.

    Each row of ``df`` supplies an ``id`` that is used as the image path
    relative to the dataset root directory.
    """

    def __init__(self, df):
        """Keep a reference to the DataFrame describing the samples."""
        self.df = df

    def __len__(self):
        """Return the number of rows (samples) in the DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image at position ``idx`` and return its transformed tensor.

        The image is opened from disk, converted to RGB and passed through the
        module-level ``transform`` pipeline.
        """
        relative_path = self.df.iloc[idx]['id']
        image = Image.open("/kaggle/input/ai-vs-human-generated-dataset/" + relative_path)
        return transform(image.convert("RGB"))

# Wrap each DataFrame split in its dataset class.
train_dataset, valid_dataset = TrainDataset(train_df), TrainDataset(valid_df)
test_dataset = TestDataset(test_df)

In [None]:
from torch.utils.data import DataLoader
# Build the batch loaders (16 samples per batch).
# Only the training loader shuffles. Validation metrics are accumulated counts
# that do not depend on batch order, so the validation loader iterates in a
# fixed order; the test loader must stay unshuffled because predictions are
# matched to test ids by position.
train_dataloader=DataLoader(train_dataset,batch_size=16,shuffle=True)
valid_dataloader=DataLoader(valid_dataset,batch_size=16,shuffle=False)
test_dataloader=DataLoader(test_dataset,batch_size=16,shuffle=False)

In [None]:
import tqdm
import timm
from torch import nn
from torch.optim.lr_scheduler import StepLR

# Build the model: the pretrained 'tf_efficientnetv2_s.in21k_ft_in1k' timm
# checkpoint with a 2-class output head, moved to the selected device.
model = timm.create_model(
    'tf_efficientnetv2_s.in21k_ft_in1k',
    pretrained=True,
    num_classes=2,
)
model = model.to(device)

# Loss function, optimizer and learning-rate scheduler.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5, weight_decay=1e-2)
# NOTE(review): with step_size=1 and gamma=0.1 the learning rate is divided by
# 10 on every scheduler.step() call - confirm this decay rate is intended.
scheduler = StepLR(optimizer=optimizer, step_size=1, gamma=0.1)


In [None]:
from tqdm  import tqdm
def _count_binary_outcomes(predictions, labels):
    """Return (tp, fp, fn) for one batch, treating label 1 as the positive class."""
    is_positive = labels == 1
    is_negative = labels == 0
    is_wrong = predictions != labels
    tp = int((is_positive & ~is_wrong).sum())
    fp = int((is_negative & is_wrong).sum())   # true label 0, predicted otherwise
    fn = int((is_positive & is_wrong).sum())   # true label 1, predicted otherwise
    return tp, fp, fn


def _accuracy(total, fp, fn):
    """Return the fraction of correctly classified samples (0.0 when total is 0)."""
    return (total - fp - fn) / total if total else 0.0


def _f1_score(tp, fp, fn):
    """Return F1 = 2*tp / (2*tp + fp + fn), or 0.0 when the denominator is 0."""
    denominator = 2 * tp + fp + fn
    return 2 * tp / denominator if denominator else 0.0


def train(model,criterion,optimizer,scheduler,train_dataloader,valid_dataloader):
    """Run one training epoch followed by one validation pass.

    Args:
        model: network to optimize; it is switched to train mode for the
            training loop and left in eval mode on return.
        criterion: loss function called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped once per training batch.
        scheduler: learning-rate scheduler stepped once per call, i.e. once
            per epoch, after all optimizer steps of the epoch.
        train_dataloader: iterable of ``(inputs, labels)`` training batches.
        valid_dataloader: iterable of ``(inputs, labels)`` validation batches.

    Returns:
        Tuple ``(train_acc, train_f1, valid_acc, valid_f1)``. F1 is computed
        with label 1 as the positive class. A metric whose denominator is zero
        (e.g. an empty loader, or no positive samples and no positive
        predictions) is reported as 0.0 instead of raising ZeroDivisionError.
    """
    # ---- training ----
    model.train()
    total = tp = fp = fn = 0
    # Iterating the loader directly (not enumerate(...)) lets tqdm show the total.
    for step, (tensor, label) in enumerate(tqdm(train_dataloader), start=1):
        inputs = tensor.to(device)
        labels = label.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        predictions = torch.argmax(outputs, dim=1)
        batch_tp, batch_fp, batch_fn = _count_binary_outcomes(predictions, labels)
        tp += batch_tp
        fp += batch_fp
        fn += batch_fn
        total += labels.size(0)

        # Periodic running-accuracy report, every 1000 batches.
        if step % 1000 == 0:
            print(f"当前总精度为：{_accuracy(total, fp, fn):.4f}")

    # Step the LR schedule once per epoch. Stepping it per batch would apply
    # the scheduler's decay on every batch and shrink the learning rate to
    # effectively zero within the first few batches.
    scheduler.step()

    train_acc = _accuracy(total, fp, fn)
    train_f1 = _f1_score(tp, fp, fn)

    # ---- validation ----
    model.eval()
    total2 = tp2 = fp2 = fn2 = 0
    with torch.no_grad():
        for tensor, label in tqdm(valid_dataloader):
            inputs = tensor.to(device)
            labels = label.to(device)
            predictions = torch.argmax(model(inputs), dim=1)
            batch_tp, batch_fp, batch_fn = _count_binary_outcomes(predictions, labels)
            tp2 += batch_tp
            fp2 += batch_fp
            fn2 += batch_fn
            total2 += labels.size(0)

    valid_acc = _accuracy(total2, fp2, fn2)
    valid_f1 = _f1_score(tp2, fp2, fn2)
    return train_acc, train_f1, valid_acc, valid_f1


def test(model,criterion,optimizer,scheduler,test_dataloader,epoch):
    """Predict a label for every test sample and write the result to a CSV file.

    Predictions are matched by position to the ``id`` column of the
    module-level ``test_df``, so ``test_dataloader`` must yield the samples in
    that same order (no shuffling).

    Args:
        model: trained network; switched to eval mode before inference.
        criterion: unused; kept for signature compatibility.
        optimizer: unused; kept for signature compatibility.
        scheduler: unused; kept for signature compatibility.
        test_dataloader: iterable of input batches.
        epoch: zero-based epoch index; the file is named
            ``output_{epoch+1}epoch.csv``.

    Raises:
        IndexError: if the loader yields more predictions than ``test_df``
            has rows.
    """
    model.eval()
    predicted_labels = []
    with torch.no_grad():
        for tensor in tqdm(test_dataloader):
            outputs = model(tensor.to(device))
            # tolist() yields plain Python ints, one per sample in the batch.
            predicted_labels.extend(torch.argmax(outputs, dim=1).tolist())

    ids = test_df['id'].tolist()
    if len(predicted_labels) > len(ids):
        raise IndexError(
            f"got {len(predicted_labels)} predictions but test_df has only {len(ids)} rows"
        )

    # Build the frame once from complete columns instead of appending row by row.
    output = pd.DataFrame({'id': ids[:len(predicted_labels)], 'label': predicted_labels})
    output.to_csv(f"output_{epoch+1}epoch.csv",index=False)

In [None]:
# Run 12 epochs; report train / validation accuracy and F1 after each one.
for epoch in range(12):
    train_acc, train_f1, valid_acc, valid_f1 = train(
        model, criterion, optimizer, scheduler, train_dataloader, valid_dataloader
    )

    # Training metrics for this epoch.
    print(f"第{epoch+1}轮训练总精度为：{train_acc:.4f}")
    print(f"第{epoch+1}轮训练f1score为：{train_f1:.4f}")

    # Validation metrics for this epoch.
    print(f"第{epoch+1}轮验证总精度为：{valid_acc:.4f}")
    print(f"第{epoch+1}轮验证f1score为：{valid_f1:.4f}")

    # Only every third epoch (3, 6, 9, 12) writes a test-set prediction file.
    if (epoch + 1) % 3 != 0:
        continue
    test(model, criterion, optimizer, scheduler, test_dataloader, epoch)