In [68]:
import pandas as pd
import ast
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.utils import resample
from sklearn.neural_network import MLPClassifier
import random
from nltk.tokenize import word_tokenize

In [69]:
#  Load the data
domain1_train_data = pd.read_json("domain1_train_data.json", lines=True)
domain2_train_data = pd.read_json("domain2_train_data.json", lines=True)

In [70]:
# get machine and human data
machine = domain2_train_data[domain2_train_data['label'] == 0]
human = domain2_train_data[domain2_train_data['label'] == 1]

# count the number of samples in each class
n_machine = len(machine)
n_human = len(human)

# if the number of samples in 'machine' is greater than the number of samples in 'human'
if n_machine > n_human:
    machine = machine.sample(n_human)

# combine the balanced data
domain2_train_data_balanced = pd.concat([machine, human])


In [71]:
#drop id columns
domain2_train_data_balanced = domain2_train_data_balanced.drop(columns='id')
domain1_train_data = domain1_train_data.drop(columns='id')

In [72]:
# combine the two datasets
combined_data = pd.concat([domain1_train_data, domain2_train_data_balanced])


# get the features and labels
X = combined_data['text']
y = combined_data['label']

#Upsampled domain 2
domain2_train_data_balanced_upsampled = resample(domain2_train_data_balanced,
                replace=True,
                n_samples=len(domain1_train_data),
                random_state=42)

#New upsampled data
combined_data2 = pd.concat([domain2_train_data_balanced_downsampled, domain1_train_data])

X_2 = combined_data2['text']
y_2 = combined_data2['label']


#separating domains in differents datasets
X_A = domain1_train_data["text"]
y_A = domain1_train_data["label"]

X_B = domain2_train_data_balanced_upsampled["text"]
y_B = domain2_train_data_balanced_upsampled["label"]


In [73]:
#Tf-Idf function 
vectorizer = TfidfVectorizer()

## Meta Model 1: Stacking Model Trained with Each Domain Separately

In [74]:
test_data = pd.read_json('test_data.json', lines=True)

X_test_str = [' '.join(map(str, lst)) for lst in test_data['text']]

# Split the data into training and testing sets for each domain
X_train_A, X_val_A, y_train_A, y_val_A = train_test_split(X_A, y_A, test_size=0.2, shuffle = True, random_state=5)
X_train_B, X_val_B, y_train_B, y_val_B = train_test_split(X_B, y_B, test_size=0.2, shuffle = True, random_state=5)


# Train base models on each domain
X_train_str_A = [' '.join(map(str, lst)) for lst in X_train_A]
X_val_str_A = [' '.join(map(str, lst)) for lst in X_val_A]
X_train_str_B = [' '.join(map(str, lst)) for lst in X_train_B]
X_val_str_B = [' '.join(map(str, lst)) for lst in X_val_B]

# tf-idf向量化
X_train_vec_A = vectorizer.fit_transform(X_train_str_A)
X_val_vec_A = vectorizer.transform(X_val_str_A)
X_train_vec_B = vectorizer.transform(X_train_str_B)
X_val_vec_B = vectorizer.transform(X_val_str_B)
X_test_vec = vectorizer.transform(X_test_str)

            # Train base models on each domain
rf_model_A = SVC()
rf_model_B = SVC()

rf_model_A.fit(X_train_vec_A, y_train_A)
rf_model_B.fit(X_train_vec_B, y_train_B)

# Make predictions using base models
preds_A = rf_model_A.predict(X_val_vec_A)
preds_B = rf_model_B.predict(X_val_vec_B)

# Combine predictions as features
X_meta_train = np.column_stack((preds_A, preds_B))

# Train a meta-model on the combined predictions
meta_model = MLPClassifier(random_state=1, max_iter=500, solver="sgd", activation= "tanh", hidden_layer_sizes=(15,))

# Fit the meta-model on the combined predictions
meta_model.fit(X_meta_train, y_val_A)  # Using y_test_A, but you can use either domain A or B labels
#final_preds = meta_model.predict(X_meta_test)
#outputaccuracy = accuracy_score(y_val_A, final_preds)
#print(outputaccuracy)

test_preds_A = rf_model_A.predict(X_val_vec_A)
test_preds_B = rf_model_B.predict(X_val_vec_B)

X_meta_test = np.column_stack((test_preds_A, test_preds_B))

pred_val = meta_model.predict(X_meta_test)
class_report = classification_report(y_val_A, pred_val)
print(class_report)
final_accuracy = accuracy_score(y_val_A, pred_val)

#Test
test_preds_A = rf_model_A.predict(X_test_vec)
test_preds_B = rf_model_B.predict(X_test_vec)
X_meta_test = np.column_stack((test_preds_A, test_preds_B))
final = meta_model.predict(X_meta_test)

# Evaluate the final model
submission = pd.DataFrame({
    'id': range(len(final)),
    'class': final
})

submission.to_csv('results/SVC_SVC_metamodel.csv', index=False)

              precision    recall  f1-score   support

           0       0.78      0.73      0.75       495
           1       0.75      0.79      0.77       505

    accuracy                           0.76      1000
   macro avg       0.76      0.76      0.76      1000
weighted avg       0.76      0.76      0.76      1000



## Meta Model 2: Stacking Model Trained with Concatenated Domains

In [75]:
# Split the data into training and testing sets for each domain
X_train, X_val, y_train, y_val = train_test_split(X_2, y_2, test_size=0.2, shuffle = True, random_state=2)
test_data = pd.read_json('test_data.json', lines=True)

X_train_str_entire = [' '.join(map(str, lst)) for lst in X_train]
X_val_str = [' '.join(map(str, lst)) for lst in X_val]
X_test_str = [' '.join(map(str, lst)) for lst in test_data['text']]

X_train_vec_entire = vectorizer.fit_transform(X_train_str_entire)
X_val_vec = vectorizer.transform(X_val_str)
X_test_vec = vectorizer.transform(X_test_str)

#X_train_vec_scaled_entire = scaler.fit_transform(X_train_vec_entire)
#X_test_vec_scaled = scaler.transform(X_test_vec)

# Train base models on each domain
rf_model_A = KNeighborsClassifier()
rf_model_B = GaussianNB()
rf_model_C = RandomForestClassifier(random_state=1)
rf_model_D = LogisticRegression(random_state=0)

rf_model_A.fit(X_train_vec_entire, y_train)
rf_model_B.fit(X_train_vec_entire.toarray(), y_train)
rf_model_C.fit(X_train_vec_entire, y_train)
rf_model_D.fit(X_train_vec_entire, y_train)

# Make predictions using base models
preds_A = rf_model_A.predict(X_val_vec)
preds_B = rf_model_B.predict(X_val_vec.toarray())
preds_C = rf_model_C.predict(X_val_vec)
preds_D = rf_model_D.predict(X_val_vec)

# Combine predictions as features
X_meta_train = np.column_stack((preds_A, preds_B,preds_C, preds_D))

# Train a meta-model on the combined predictions
meta_model = MLPClassifier(random_state=1, max_iter=500, solver='sgd', activation= "tanh", hidden_layer_sizes=(15,))

# Fit the meta-model on the combined predictions
meta_model.fit(X_meta_train, y_val)  # Using y_test_A, but you can use either domain A or B labels

# Make predictions on the testing set
# Combine predictions of base models for testing set as well
val_preds_A = rf_model_A.predict(X_val_vec)
val_preds_B = rf_model_B.predict(X_val_vec.toarray())
val_preds_C = rf_model_C.predict(X_val_vec)
val_preds_D = rf_model_D.predict(X_val_vec)

X_meta_test = np.column_stack((val_preds_A, val_preds_B, val_preds_C, val_preds_D))
# Make final predictions using the meta-model
pred_val = meta_model.predict(X_meta_test)
# Evaluate the final model
final_accuracy = accuracy_score(y_val, pred_val)  # Using y_test_A, but you can use either domain A or B labels
class_report = classification_report(y_val, pred_val)
print(class_report)

test_preds_A = rf_model_A.predict(X_test_vec)
test_preds_B = rf_model_B.predict(X_test_vec.toarray())
test_preds_C = rf_model_C.predict(X_test_vec)
test_preds_D = rf_model_D.predict(X_test_vec)

X_meta_test = np.column_stack((test_preds_A, test_preds_B, test_preds_C,test_preds_D))
# Make final predictions using the meta-model
final = meta_model.predict(X_meta_test)

# Evaluate the final model
submission = pd.DataFrame({
    'id': range(len(final)),
    'class': final
})

submission.to_csv('results/KNN_GNB_RN_LR_MM_MLP_metamodel.csv', index=False)

              precision    recall  f1-score   support

           0       0.88      0.86      0.87      1018
           1       0.86      0.87      0.87       982

    accuracy                           0.87      2000
   macro avg       0.87      0.87      0.87      2000
weighted avg       0.87      0.87      0.87      2000

