In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ast
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from gensim.models import Word2Vec
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import torch
from torch.utils.data import DataLoader, TensorDataset

In [2]:
#  Load the data
domain1_train_data = pd.read_json('domain1_train_data.json', lines=True)
domain2_train_data = pd.read_json('domain2_train_data.json', lines=True)


print(domain1_train_data.head())
print(domain2_train_data.head())

# get machine and human data
machine = domain2_train_data[domain2_train_data['label'] == 0]
human = domain2_train_data[domain2_train_data['label'] == 1]

# count the number of samples in each class
n_machine = len(machine)
n_human = len(human)

# Filter machine data for samples with token length less than 1000
machine_filtered = machine[machine['text'].apply(len) < 2000]

# Count the number of samples again after filtering
n_machine_filtered = len(machine_filtered)

for label in [0, 1]:  # 0 代表 machine, 1 代表 human
    class_samples = domain1_train_data[domain1_train_data['label'] == label]
    n_samples = len(class_samples)
    if n_samples < 2600:
        extra_samples_needed = 2600 - n_samples
        sampled_data = class_samples.sample(n=extra_samples_needed, replace=True, random_state=42)
        domain1_train_data = pd.concat([domain1_train_data, sampled_data])

# Adjust machine_balanced to have exactly 2500 samples if possible
if n_machine_filtered > 2600:
    machine_balanced = machine_filtered.sample(n=2600, random_state=42)
else:
    machine_balanced = machine_filtered
    print("Note: Machine data has less than 2500 samples after filtering for token length.")

# Oversample human data to reach 2500 samples
if n_human < 2600:
    extra_samples_needed = 2600 - n_human
    human_balanced = pd.concat([human, human.sample(n=extra_samples_needed, replace=True, random_state=42)])
else:
    human_balanced = human

# Combine the balanced data
domain2_train_data_balanced = pd.concat([machine_balanced, human_balanced])

# domain2 label count
domain2_train_data_balanced['label'].value_counts()

# combine the two datasets
combined_data = pd.concat([domain1_train_data, domain2_train_data_balanced])


# get the features and labels
X = combined_data['text']
y = combined_data['label']

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.001, random_state=42)

# split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.001, random_state=42)

X_train_str = [' '.join(map(str, lst)) for lst in X_train]
X_val_str = [' '.join(map(str, lst)) for lst in X_val]

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")




                                                text  label  id
0  [16, 231, 543, 5, 15, 43, 8282, 94, 231, 1129,...      1   0
1  [16, 4046, 138, 10, 2, 1809, 2007, 3763, 14, 4...      1   1
2  [1108, 16550, 3, 6168, 3, 160, 284, 19, 49, 46...      1   2
3  [1802, 27, 16, 25, 48, 451, 632, 3, 2, 2164, 2...      1   3
4  [16, 19, 302, 93, 97, 43, 952, 118, 1, 16, 528...      1   4
                                                text  label    id
0  [12, 920, 7, 1266, 28, 9884, 1640, 116, 11, 13...      1  5000
1  [783, 397, 253, 5797, 9379, 22, 793, 11838, 10...      1  5001
2  [888, 14851, 323, 9, 27, 1377, 584, 195, 3, 13...      1  5002
3  [228, 1161, 5815, 379, 9, 941, 10, 2, 316, 4, ...      1  5003
4  [736, 19, 37, 813, 45, 6723, 27, 626, 8, 2, 34...      1  5004
Training set size: 10578
Test set size: 11


In [3]:
# count domain1_train_data human and machine
human = combined_data[combined_data['label'] == 1]
machine = combined_data[combined_data['label'] == 0]

print(human)
print(machine)



                                                   text  label    id
0     [16, 231, 543, 5, 15, 43, 8282, 94, 231, 1129,...      1     0
1     [16, 4046, 138, 10, 2, 1809, 2007, 3763, 14, 4...      1     1
2     [1108, 16550, 3, 6168, 3, 160, 284, 19, 49, 46...      1     2
3     [1802, 27, 16, 25, 48, 451, 632, 3, 2, 2164, 2...      1     3
4     [16, 19, 302, 93, 97, 43, 952, 118, 1, 16, 528...      1     4
...                                                 ...    ...   ...
161   [1800, 7841, 8, 7, 4159, 4480, 2361, 87, 186, ...      1  5161
1403  [1291, 117, 100, 26, 344, 5, 600, 7925, 620, 6...      1  6403
86    [181, 117, 100, 14, 6369, 13, 22, 472, 8, 110,...      1  5086
54    [2, 136, 401, 6, 496, 1136, 8, 486, 117, 100, ...      1  5054
1499  [8, 15, 71, 12, 155, 27, 9189, 117, 67, 10, 10...      1  6499

[5400 rows x 3 columns]
                                                   text  label     id
2500  [20112, 31, 31, 31, 31, 31, 31, 497, 8, 15, 27...      0   2500
2501  [

In [4]:
# count domain2_train_data_balanced human and machine
human = domain2_train_data_balanced[domain2_train_data_balanced['label'] == 1]
machine = domain2_train_data_balanced[domain2_train_data_balanced['label'] == 0]

print(human)
print(machine)

                                                   text  label    id
0     [12, 920, 7, 1266, 28, 9884, 1640, 116, 11, 13...      1  5000
1     [783, 397, 253, 5797, 9379, 22, 793, 11838, 10...      1  5001
2     [888, 14851, 323, 9, 27, 1377, 584, 195, 3, 13...      1  5002
3     [228, 1161, 5815, 379, 9, 941, 10, 2, 316, 4, ...      1  5003
4     [736, 19, 37, 813, 45, 6723, 27, 626, 8, 2, 34...      1  5004
...                                                 ...    ...   ...
161   [1800, 7841, 8, 7, 4159, 4480, 2361, 87, 186, ...      1  5161
1403  [1291, 117, 100, 26, 344, 5, 600, 7925, 620, 6...      1  6403
86    [181, 117, 100, 14, 6369, 13, 22, 472, 8, 110,...      1  5086
54    [2, 136, 401, 6, 496, 1136, 8, 486, 117, 100, ...      1  5054
1499  [8, 15, 71, 12, 155, 27, 9189, 117, 67, 10, 10...      1  6499

[2800 rows x 3 columns]
                                                   text  label     id
7056  [2295, 96, 237, 2557, 3, 57, 518, 3, 2097, 307...      0  12056
3070  [

In [5]:
# tf-idf向量化
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
# vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train_str)
X_val_vec = vectorizer.transform(X_val_str)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Initialize models
log_reg_model = LogisticRegression(max_iter=1000)
# kernel = rbf
svm_model = SVC(kernel="rbf")
rf_model = RandomForestClassifier(n_estimators=350)
nn_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000)

models = {
    "Logistic Regression": log_reg_model,
    "Support Vector Machine": svm_model,
    "Random Forest": rf_model,
    "Neural Network": nn_model
}

# Train and evaluate each model
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_vec, y_train)
    y_val_pred = model.predict(X_val_vec)
    print(f"{name} Performance:")
    print(classification_report(y_val, y_val_pred))


Training Logistic Regression...
Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.33      1.00      0.50         1
           1       1.00      0.80      0.89        10

    accuracy                           0.82        11
   macro avg       0.67      0.90      0.69        11
weighted avg       0.94      0.82      0.85        11

Training Support Vector Machine...


In [None]:
# # 读取测试数据
# test_data = pd.read_json('test_data.json', lines=True)

# # json文件是一个字典，其中包含一个键为'text'的项，其值是一个包含所有测试样本的列表
# test_texts = [' '.join(map(str, lst)) for lst in test_data['text']]

# # 使用相同的向量化器来转换测试数据
# X_test = vectorizer.transform(test_texts)

# # 使用模型进行预测
# predictions = rf_model.predict(X_test)

# # 创建一个数据框，其中'id'列是样本的ID（通常是一个从0开始的整数序列），'class'列是预测结果
# submission = pd.DataFrame({
#     'id': range(len(predictions)),
#     'class': predictions
# })

# submission.to_csv('new_results/RF24_3.csv', index=False)