In [2]:
import gc
import os
import wandb
import random
import requests
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
import plotly.io as pio
import plotly.express as px
import matplotlib.pyplot as plt
from collections import Counter
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS

import torch
import torch.nn as nn
from tqdm import tqdm
from torch.optim import AdamW
from sklearn.metrics import roc_auc_score
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer, AutoConfig, AutoModel, get_linear_schedule_with_warmup

import warnings

warnings.simplefilter('ignore')

## 配置plotly

In [3]:
# Make "plotly_dark" the default template for every figure created from here on.
pio.templates.default = "plotly_dark"
# Override both background colours of that template with the same dark gray:
# 'paper_bgcolor' is the paper background, 'plot_bgcolor' the plotting area.
for bg_key in ('paper_bgcolor', 'plot_bgcolor'):
    pio.templates[pio.templates.default].layout[bg_key] = '#1F1F1F'

## 配置超参数与分词器

In [4]:
class CONFIG:
    """Container for the configuration parameters, stored as class attributes.

    The tokenizer attribute is created while the class body runs, so merely
    defining this class calls ``AutoTokenizer.from_pretrained(model)``.
    """

    # --- Reproducibility and cross-validation ---
    seed = 300                # random seed, used to reproduce results
    num_fold = 3              # number of folds for cross-validation

    # --- Pretrained model and input length ---
    model = 'roberta-base'    # name of the pretrained model to use
    max_len = 512             # maximum length of a text sequence

    # --- Training schedule ---
    train_batch_size = 16     # batch size during training
    valid_batch_size = 16     # batch size during validation
    epochs = 2                # number of training epochs
    learning_rate = 1e-5      # learning rate
    scheduler = 'linear'      # learning-rate scheduler type

    # --- Runtime objects ---
    # 'cuda' when a CUDA device is available, otherwise 'cpu'.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Tokenizer of the pretrained model, used to process the text data.
    tokenizer = AutoTokenizer.from_pretrained(model)


# Save the tokenizer to the './tokenizer/' directory.
CONFIG.tokenizer.save_pretrained('./tokenizer/')

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.json',
 './tokenizer/merges.txt',
 './tokenizer/added_tokens.json',
 './tokenizer/tokenizer.json')

## 固定随机种子

In [7]:
def seed_everything(seed):
    """Seed every random number generator used here and make cuDNN deterministic.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED``, and configures
    cuDNN for deterministic, non-autotuned algorithm selection.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Only child processes started afterwards pick this up: the running
    # interpreter reads PYTHONHASHSEED at start-up, so its own hash
    # randomisation is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device rather than only the current one; this call is
    # silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and possibly pick a different
    # algorithm on each run, so it is switched off explicitly.
    torch.backends.cudnn.benchmark = False


# Seed all generators with the value configured in CONFIG.seed.
seed_everything(CONFIG.seed)

## 数据读取

In [12]:
# Read the prompts table and the essays table from the local datasets folder.
df_pro, df_ess = (
    pd.read_csv(csv_path)
    for csv_path in (r'./datasets/train_prompts.csv', r'./datasets/train_essays.csv')
)

In [14]:
# External dataset, source:
# https://www.kaggle.com/datasets/radek1/llm-generated-essays
df_gpt_3_5 = pd.read_csv("datasets/ai_generated_train_essays.csv")
df_gpt_4 = pd.read_csv("datasets/ai_generated_train_essays_gpt-4.csv")

# Stack the original essays and both external frames row-wise, replacing the
# per-file indices with a fresh 0..n-1 index.
df_new = pd.concat([df_ess, df_gpt_3_5, df_gpt_4], ignore_index=True)