## LightGBM-Based Fake Document Classification for ESA Texts using Feature Engineering and Multi-Pass Grok Inference

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import os
from pathlib import Path
import re

In [None]:
# Locate the train/test CSVs under <parent of the working directory>/data
root = os.path.dirname(os.getcwd())
train_path, test_path = (
    os.path.join(root, "data", csv_name) for csv_name in ("train.csv", "test.csv")
)

In [None]:
# Load the training data and encode the 'grok_fake' column numerically: 'A' -> 1, 'B' -> 2.
# `Series.map` is used instead of `Series.replace` because `replace` on an object
# column relies on a deprecated silent downcast (it emits a FutureWarning, and once
# the downcast is removed the column would stay `object` dtype, which LightGBM
# cannot consume). With `map`, any value that is not 'A' or 'B' (including missing
# entries) becomes NaN, so the column is always numeric.
df = pd.read_csv(train_path)
df['grok_fake'] = df['grok_fake'].map({'A': 1, 'B': 2})

print("Preview:")
display(df.head())

Preview:


  df['grok_fake'] = df['grok_fake'].replace({'A': 1, 'B': 2})


Unnamed: 0,id,text_A,text_B,real_text_id,text_A_length_chars,text_A_length_words,text_A_num_sentences,text_A_num_citations,text_A_avg_word_length,text_A_flesch_reading,...,grok_fake_2,grok_a_score_2,grok_b_score_2,grok_fake_3,grok_a_score_3,grok_b_score_3,grok_decision,grok_fake,grok_a_score,grok_b_score
0,0,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...,1,2196,295,10,0,6.308475,-13.815975,...,B,0.15,0.85,,,,B,2.0,0.15,0.85
1,1,China\nThe goal of this project involves achie...,The project aims to achieve an accuracy level ...,2,3124,460,9,0,5.704348,-16.449734,...,A,0.92,0.08,,,,A,1.0,0.92,0.08
2,2,Scientists can learn about how galaxies form a...,Dinosaur eggshells offer clues about what dino...,1,1139,158,7,0,6.094937,10.976899,...,B,0.05,0.95,,,,B,2.0,0.05,0.95
3,3,China\nThe study suggests that multiple star s...,The importance for understanding how stars evo...,2,1774,264,8,0,5.530303,13.433182,...,A,0.95,0.05,,,,A,1.0,0.92,0.08
4,4,Dinosaur Rex was excited about his new toy set...,Analyzing how fast stars rotate within a galax...,2,195,34,6,0,4.588235,61.742157,...,A,0.95,0.05,,,,A,1.0,0.99,0.01


In [None]:
# Binary label derived from real_text_id (1 = text A is real, 2 = text B is real):
#   target = 1 -> B is the fake text
#   target = 0 -> A is the fake text
is_a_real = df["real_text_id"].eq(1)
df["target"] = is_a_real.astype(int)
print(df[["real_text_id", "target"]].head())


In [None]:
# List every column currently present in the training frame
columns_list = df.columns.tolist()
print(columns_list)

In [None]:
# Columns fed to the model. The ORDER is kept exactly as originally specified,
# because the same list is used to select columns for both fitting and prediction.
feature_cols = [
    # Grok scores carrying the `_2` suffix
    'grok_a_score_2', 'grok_b_score_2',
    # Basic statistics of text A
    'text_A_length_chars', 'text_A_length_words', 'text_A_num_sentences',
    'text_A_num_citations', 'text_A_avg_word_length', 'text_A_flesch_reading',
    # Basic statistics of text B
    'text_B_length_chars', 'text_B_length_words', 'text_B_num_sentences',
    'text_B_num_citations', 'text_B_avg_word_length', 'text_B_flesch_reading',
    # Differences of the basic statistics
    'length_chars_diff', 'length_words_diff', 'num_sentences_diff',
    'num_citations_diff', 'avg_word_length_diff', 'flesch_reading_diff',
    # Similarity between the two texts
    'cosine_sim_A_B',
    # Un-suffixed Grok scores
    'grok_a_score', 'grok_b_score',
    # Additional statistics of text A
    'text_A_ends_with_punct', 'text_A_num_entities', 'text_A_num_numbers',
    'text_A_longest_sentence', 'text_A_max_repeat_word',
    # Additional statistics of text B
    'text_B_ends_with_punct', 'text_B_num_entities', 'text_B_num_numbers',
    'text_B_longest_sentence', 'text_B_max_repeat_word',
    # Differences of the additional statistics
    'ends_with_punct_diff', 'num_numbers_diff', 'longest_sentence_diff',
    'max_repeat_word_diff',
    # Second similarity measure
    'jaccard_sim_A_B',
    # Numerically encoded Grok verdict
    'grok_fake',
]

print("Available Features:", feature_cols)
print("Check Null-Values:")

# Per-feature count of missing values
null_counts = df[feature_cols].isnull().sum()
display(null_counts)

In [None]:
# Hold out 20% of the rows for validation, keeping the class balance of the target
X, y = df[feature_cols], df["target"]

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=41,
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}")

In [None]:
# Fit a LightGBM classifier on the training split
lgbm_params = {
    "n_estimators": 200,
    "learning_rate": 0.1,
    "random_state": 42,
}
model = lgb.LGBMClassifier(**lgbm_params)
model.fit(X_train, y_train)


In [None]:
# Score the fitted model on the held-out validation split
y_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
val_report = classification_report(y_val, y_pred)

print("\nValidation Accuracy:", val_accuracy)
print("\nClassification Report:")
print(val_report)

In [None]:
# Plot the 15 most important features of the fitted model
importance_ax = lgb.plot_importance(model, max_num_features=15)
importance_ax.figure.tight_layout()
plt.show()

In [None]:
# Load the test set from the path defined alongside `train_path`, and encode the
# 'grok_fake' column the same way as for training ('A' -> 1, 'B' -> 2).
# `map` avoids the deprecated silent downcast that `replace` performs on object
# columns; values other than 'A'/'B' (including missing ones) become NaN.
test_df = pd.read_csv(test_path)
test_df['grok_fake'] = test_df['grok_fake'].map({'A': 1, 'B': 2})

# Missing values are deliberately left as NaN. The model was fitted on the
# un-imputed training features, so filling with a sentinel such as -1 only at
# inference time would present the model with values it never saw during training
# and route those rows differently from how missing values were handled in fitting.
preds = model.predict(test_df[feature_cols]).astype(int)

# Build submission
submission = pd.DataFrame({
    "id": test_df["id"],
    "fake_label": preds
})

# Map: 0 -> 2 (A is fake), 1 -> 1 (B is fake)
submission["real_text_id"] = submission["fake_label"].map({0: 2, 1: 1})
submission = submission[["id", "real_text_id"]]

# Determine the next submission filename in ../data/
def next_submission_filename(prefix="submission", ext=".csv", directory="../data"):
    """Return the next unused numbered submission path inside ``directory``.

    Existing entries named ``{prefix}_{N}{ext}`` (``N`` being one or more digits)
    are scanned, and the returned path uses ``max(N) + 1``; numbering starts at 1
    when no such entry exists. The directory is created if it is missing. The
    file itself is NOT created.

    Args:
        prefix: Leading part of the file name, taken literally.
        ext: File extension including the dot, taken literally.
        directory: Directory that holds the submissions.

    Returns:
        pathlib.Path: ``directory / f"{prefix}_{next_index}{ext}"``.
    """
    dir_path = Path(directory)
    dir_path.mkdir(parents=True, exist_ok=True)

    # Entries are listed with iterdir() and filtered by the escaped regex alone.
    # Interpolating prefix/ext into a glob pattern would treat characters such as
    # '[' or '*' as wildcards and could miss existing files.
    name_pattern = re.compile(rf"{re.escape(prefix)}_(\d+){re.escape(ext)}")
    matches = (name_pattern.fullmatch(entry.name) for entry in dir_path.iterdir())
    used_indices = [int(m.group(1)) for m in matches if m]

    next_index = max(used_indices, default=0) + 1
    return dir_path / f"{prefix}_{next_index}{ext}"

# Write the submission to the next free numbered file under ../data and preview it
out_path = next_submission_filename(
    prefix="submission",
    ext=".csv",
    directory="../data",
)
submission.to_csv(out_path, index=False)

print(f"Submission saved to: {out_path}")
display(submission.head())