# Fake Text Detection - Training and Analysis

Competition page: https://www.kaggle.com/competitions/fake-or-real-the-impostor-hunt

## 1. Setup and Imports

In [None]:
import os
import logging
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import learning_curve

from detector import FakeTextDetector
from utils import DatasetPaths

# Notebook-wide runtime configuration: warnings, logging, and plot styling.

# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")

# INFO-level logging with a timestamped "time - level - message" layout.
_LOG_SETTINGS = {
    "level": logging.INFO,
    "format": '%(asctime)s - %(levelname)s - %(message)s',
    "datefmt": '%Y-%m-%d %H:%M:%S',
}
logging.basicConfig(**_LOG_SETTINGS)
logger = logging.getLogger(__name__)

# Apply the Matplotlib style first, then the seaborn palette, so the palette
# is the last thing to touch the colour cycle.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Setup complete!")

## 2. Data Loading and Basic Analysis

In [None]:
# Resolve the dataset locations relative to the current working directory.
base_dir = os.getcwd()
paths = DatasetPaths(
    train_dir=os.path.join(base_dir, "train"),
    test_dir=os.path.join(base_dir, "test"),
    train_csv_path=os.path.join(base_dir, "train.csv")
)

# Fail fast and name the missing locations. Merely printing a message would
# let execution continue into read_csv (or later cells) and fail there with a
# less helpful error.
missing_paths = [
    candidate
    for candidate in (paths.train_dir, paths.test_dir, paths.train_csv_path)
    if not os.path.exists(candidate)
]
if missing_paths:
    raise FileNotFoundError(
        "DS not found. Missing: " + ", ".join(map(str, missing_paths))
    )

df_train = pd.read_csv(paths.train_csv_path)
print(f"Training data shape: {df_train.shape}")
print("\nFirst 5 rows:")
# Kept as the trailing expression so the notebook renders the preview table.
df_train.head()

In [None]:
# Summarise the training labels: sample count, unique ids, and per-class share.
print("Training Data Analysis")
total_samples = len(df_train)
print(f"Total samples: {total_samples}")
print(f"Unique article IDs: {df_train['id'].nunique()}")
print("\nClass distribution:")
class_counts = df_train['real_text_id'].value_counts().sort_index()
print(class_counts)

# Use .get() so a label that never occurs yields a 0 share instead of a
# KeyError, and guard the division so an empty frame reports 0% rather than
# failing. This mirrors the guarded lookups used for the prediction ratios.
class_1_ratio = class_counts.get(1, 0) / total_samples if total_samples else 0.0
class_2_ratio = class_counts.get(2, 0) / total_samples if total_samples else 0.0
print(f"\nClass 1 (real_text_id=1): {class_1_ratio:.1%}")
print(f"Class 2 (real_text_id=2): {class_2_ratio:.1%}")

In [None]:
# Draw the training label counts twice: as a bar chart and as a pie chart.
class_palette = ['skyblue', 'lightcoral']
fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: absolute counts per label.
class_counts.plot(kind='bar', ax=bar_ax, color=class_palette)
bar_ax.set_title('Class Distribution', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Real Text ID')
bar_ax.set_ylabel('Count')
bar_ax.tick_params(axis='x', rotation=0)

# Right panel: the same counts as percentage wedges.
wedge_labels = [f'Class {label}' for label in class_counts.index]
pie_ax.pie(
    class_counts.values,
    labels=wedge_labels,
    autopct='%1.1f%%',
    colors=class_palette,
)
pie_ax.set_title('Class Distribution (Percentage)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# A class-1 share within 0.1 of an even split is reported as balanced.
if abs(class_1_ratio - 0.5) < 0.1:
    balance_label = 'balanced'
else:
    balance_label = 'imbalanced'
print(f"Dataset is {balance_label}")

## 3. Text Data Analysis

In [None]:
from utils import read_text_pair

detector = FakeTextDetector()

PREVIEW_CHARS = 200


def _preview(text):
    """Return ``text`` unchanged, or its first 200 characters plus '...' if longer."""
    if len(text) > PREVIEW_CHARS:
        return text[:PREVIEW_CHARS] + '...'
    return text


# Per-article previews and size statistics for the first five training rows.
sample_stats = []
sample_texts = []

for _, record in df_train.head(5).iterrows():
    article_id = int(record['id'])
    first_text, second_text = read_text_pair(paths.train_dir, article_id)

    # real_text_id says which file of the pair is the real one (1 -> first).
    if record['real_text_id'] == 1:
        real_text, fake_text = first_text, second_text
    else:
        real_text, fake_text = second_text, first_text

    real_chars = len(real_text)
    fake_chars = len(fake_text)

    sample_texts.append({
        'article_id': article_id,
        'real_text': _preview(real_text),
        'fake_text': _preview(fake_text),
    })

    sample_stats.append({
        'article_id': article_id,
        'real_text_length': real_chars,
        'fake_text_length': fake_chars,
        'real_word_count': len(real_text.split()),
        'fake_word_count': len(fake_text.split()),
        # An empty fake text gets ratio 0 rather than dividing by zero.
        'length_ratio': real_chars / fake_chars if fake_chars > 0 else 0,
    })

sample_df = pd.DataFrame(sample_stats)
print(sample_df.to_string(index=False))

In [None]:
# Print the truncated real/fake previews for the first two sampled articles.
print("\nSample Text Pairs")

separator = "-" * 60
for pair in sample_texts[:2]:
    print(f"\nArticle {pair['article_id']}:")
    print("\nReal text:")
    print(pair['real_text'])
    print("\nFake text:")
    print(pair['fake_text'])
    print(separator)

In [None]:
# Character and word statistics for every real/fake pair in the training set.
all_stats = []
total_rows = len(df_train)

for row_label, record in df_train.iterrows():
    # Progress line whenever the row's index label is a multiple of 20.
    if row_label % 20 == 0:
        print(f"Processed {row_label}/{total_rows} samples")

    article_id = int(record['id'])
    first_text, second_text = read_text_pair(paths.train_dir, article_id)

    # real_text_id says which file of the pair is the real one (1 -> first).
    if record['real_text_id'] == 1:
        real_text, fake_text = first_text, second_text
    else:
        real_text, fake_text = second_text, first_text

    # Measure each text once and reuse the numbers for the difference columns.
    real_chars, fake_chars = len(real_text), len(fake_text)
    real_word_total = len(real_text.split())
    fake_word_total = len(fake_text.split())

    all_stats.append({
        'article_id': article_id,
        'real_length': real_chars,
        'fake_length': fake_chars,
        'real_words': real_word_total,
        'fake_words': fake_word_total,
        'length_diff': abs(real_chars - fake_chars),
        'word_diff': abs(real_word_total - fake_word_total),
    })

all_stats_df = pd.DataFrame(all_stats)
print("\n✅ Analysis complete!")

In [None]:
# 2x3 dashboard comparing real and fake texts by character and word counts,
# followed by a printed numeric summary.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

real_lengths = all_stats_df['real_length']
fake_lengths = all_stats_df['fake_length']

# Row 1, col 1: character-length histograms for both classes.
ax = axes[0, 0]
ax.hist([real_lengths, fake_lengths],
        bins=30, alpha=0.7, label=['Real', 'Fake'], color=['blue', 'red'])
ax.set_title('Character Length Distribution', fontweight='bold')
ax.set_xlabel('Characters')
ax.set_ylabel('Frequency')
ax.legend()

# Row 1, col 2: word-count histograms for both classes.
ax = axes[0, 1]
ax.hist([all_stats_df['real_words'], all_stats_df['fake_words']],
        bins=30, alpha=0.7, label=['Real', 'Fake'], color=['blue', 'red'])
ax.set_title('Word Count Distribution', fontweight='bold')
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
ax.legend()

# Row 1, col 3: absolute character-length difference within each pair.
ax = axes[0, 2]
ax.hist(all_stats_df['length_diff'], bins=30, alpha=0.7, color='green')
ax.set_title('Character Length Difference', fontweight='bold')
ax.set_xlabel('Absolute Difference')
ax.set_ylabel('Frequency')

# Row 2, col 1: absolute word-count difference within each pair.
ax = axes[1, 0]
ax.hist(all_stats_df['word_diff'], bins=30, alpha=0.7, color='orange')
ax.set_title('Word Count Difference', fontweight='bold')
ax.set_xlabel('Absolute Difference')
ax.set_ylabel('Frequency')

# Row 2, col 2: real vs fake length scatter with a y = x reference line.
# The axis extent is computed once and reused for both endpoints.
ax = axes[1, 1]
longest_text = all_stats_df[['real_length', 'fake_length']].max().max()
ax.scatter(real_lengths, fake_lengths, alpha=0.6)
ax.plot([0, longest_text], [0, longest_text], 'r--', alpha=0.8)
ax.set_title('Real vs Fake Text Length', fontweight='bold')
ax.set_xlabel('Real Text Length')
ax.set_ylabel('Fake Text Length')

# Row 2, col 3: box plots of both length distributions.
# boxplot's `labels=` keyword is deprecated since Matplotlib 3.9 (renamed to
# `tick_labels`). Setting the tick labels afterwards works on old and new
# releases alike.
ax = axes[1, 2]
data_to_plot = [real_lengths, fake_lengths]
ax.boxplot(data_to_plot)
ax.set_xticklabels(['Real', 'Fake'])
ax.set_title('Length Distribution Comparison', fontweight='bold')
ax.set_ylabel('Characters')

plt.tight_layout()
plt.show()

print("Summary Statistics")
print(f"Real text - Mean length: {real_lengths.mean():.0f}, Std: {real_lengths.std():.0f}")
print(f"Fake text - Mean length: {fake_lengths.mean():.0f}, Std: {fake_lengths.std():.0f}")
print(f"Mean length difference: {all_stats_df['length_diff'].mean():.0f}")
print(f"Mean word difference: {all_stats_df['word_diff'].mean():.0f}")

## 4. Model Training

In [None]:
# Build the detector used for training and echo its configuration.
print("Initializing Fake Text Detector")

detector_settings = {"n_splits": 7, "n_repeats": 3, "seed": 42}
detector = FakeTextDetector(**detector_settings)

print("Detector initialized with:")
print(f"   - Cross-validation folds: {detector.n_splits}")
print(f"   - Repetitions: {detector.n_repeats}")
# Total runs = folds x repetitions, read back from the detector itself.
run_count = detector.n_splits * detector.n_repeats
print(f"   - Total training runs: {run_count}")
print(f"   - Random seed: {detector.seed}")

In [None]:
# Run the detector's train-and-predict step on the dataset locations held in
# `paths`. The two results are unpacked into `submission_df` (summarised,
# plotted and written to CSV further down) and `cv_scores` (iterated below as
# a mapping of metric name -> scores).
submission_df, cv_scores = detector.train_and_predict(paths)

## 5. Results Analysis and Visualization

In [None]:
# One box plot per cross-validation metric that actually has scores; when
# there is nothing to draw, print a notice instead.
metrics_to_plot = (
    [metric for metric, scores in cv_scores.items() if scores]
    if cv_scores
    else []
)

if metrics_to_plot:
    n_metrics = len(metrics_to_plot)
    # squeeze=False always returns a 2-D axes array, so a single metric needs
    # no special-casing.
    fig, axes = plt.subplots(
        1, n_metrics, figsize=(5 * n_metrics, 5), squeeze=False
    )

    for ax, metric in zip(axes[0], metrics_to_plot):
        scores = cv_scores[metric]

        # Box plot. The tick label is set separately because boxplot's
        # `labels=` keyword is deprecated since Matplotlib 3.9 (renamed to
        # `tick_labels`); set_xticklabels works on old and new releases.
        ax.boxplot([scores])
        ax.set_xticklabels([metric.upper()])
        ax.set_title(f'{metric.upper()} Distribution', fontweight='bold')
        ax.set_ylabel('Score')
        ax.grid(True, alpha=0.3)

        # Dashed line at the mean score, labelled in the legend.
        mean_score = np.mean(scores)
        ax.axhline(y=mean_score, color='red', linestyle='--', alpha=0.7,
                   label=f'Mean: {mean_score:.4f}')
        ax.legend()

    plt.tight_layout()
    plt.show()
else:
    print("No cross-validation scores available for visualization")

In [None]:
# Report how the test-set predictions split between the two labels.
print(f"Submission shape: {submission_df.shape}")
print("\nPrediction distribution:")
pred_counts = submission_df['real_text_id'].value_counts().sort_index()
print(pred_counts)

# A label that was never predicted gets a share of 0.
n_predictions = len(submission_df)
pred_ratio_1 = pred_counts[1] / n_predictions if 1 in pred_counts else 0
pred_ratio_2 = pred_counts[2] / n_predictions if 2 in pred_counts else 0

print("\nPrediction ratios:")
print(f"Class 1 predictions: {pred_ratio_1:.1%}")
print(f"Class 2 predictions: {pred_ratio_2:.1%}")

# Classify by how far the class-1 share sits from an even split; the first
# threshold the distance falls under wins, otherwise the default applies.
balance_diff = abs(pred_ratio_1 - 0.5)
balance_status = "Highly imbalanced"
for upper_bound, status in ((0.1, "Well balanced"), (0.2, "Slightly imbalanced")):
    if balance_diff < upper_bound:
        balance_status = status
        break

print(f"Balance status: {balance_status}")

In [None]:
# Draw the predicted label counts twice: as a bar chart and as a pie chart.
prediction_palette = ['lightblue', 'lightgreen']
fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: absolute counts per predicted label.
pred_counts.plot(kind='bar', ax=bar_ax, color=prediction_palette)
bar_ax.set_title('Test Set Predictions', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Predicted Real Text ID')
bar_ax.set_ylabel('Count')
bar_ax.tick_params(axis='x', rotation=0)

# Right panel: the same counts as percentage wedges.
wedge_labels = [f'Pred {label}' for label in pred_counts.index]
pie_ax.pie(
    pred_counts.values,
    labels=wedge_labels,
    autopct='%1.1f%%',
    colors=prediction_palette,
)
pie_ax.set_title('Test Predictions Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Compare the label shares of the training set (left) with those of the
# test-set predictions (right), then print both side by side.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
train_ratios = df_train['real_text_id'].value_counts().sort_index() / len(df_train)
train_ratios.plot(kind='bar', ax=ax1, color=['skyblue', 'lightcoral'], alpha=0.8)
ax1.set_title('Training Set Distribution', fontweight='bold')
ax1.set_ylabel('Ratio')
ax1.tick_params(axis='x', rotation=0)

pred_ratios = submission_df['real_text_id'].value_counts().sort_index() / len(submission_df)
pred_ratios.plot(kind='bar', ax=ax2, color=['lightblue', 'lightgreen'], alpha=0.8)
ax2.set_title('Test Predictions Distribution', fontweight='bold')
ax2.set_ylabel('Ratio')
ax2.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

# .get() reports 0% for a label that is absent (e.g. the model predicted only
# one class) instead of raising KeyError on the direct lookup.
print("Distribution Comparison:")
print(
    f"Training - Class 1: {train_ratios.get(1, 0.0):.1%}, "
    f"Class 2: {train_ratios.get(2, 0.0):.1%}"
)
print(
    f"Predictions - Class 1: {pred_ratios.get(1, 0.0):.1%}, "
    f"Class 2: {pred_ratios.get(2, 0.0):.1%}"
)

## 6. Save Results

In [None]:
# Write the predictions next to the dataset and print a short recap.
output_path = os.path.join(base_dir, "submission.csv")
submission_df.to_csv(output_path, index=False)

print(f"Submission saved to: {output_path}")
print("Submission preview:")
print(submission_df.head(10))

# The plotting cell already treats cv_scores as possibly empty/None, so guard
# here too: a membership test on None would raise TypeError.
accuracy_scores = (
    cv_scores['accuracy'] if cv_scores and 'accuracy' in cv_scores else None
)
if accuracy_scores:
    print(f"🎯 Final CV Accuracy: {np.mean(accuracy_scores):.1%}")
print(f"Class balance: {pred_ratio_1:.1%} vs {pred_ratio_2:.1%}")