# Fake-or-Real Detection — Rebuilt Notebook

This notebook reconstructs the original `fake_real_detection_bert.ipynb` using the project `process.md` as the source of truth. It contains environment checks, safe imports, a lightweight fallback predictor (TF-IDF + LogisticRegression) and hooks to run a Hugging Face transformer model if available. The goal is to provide a runnable, robust notebook for both quick smoke tests and full training/prediction when the environment has the required ML libraries.

Sections:
1. Environment and dependency checks
2. Safe imports (installs if missing)
3. Data loading helpers (reads `data/train` and `data/test` pairs)
4. Simple baseline predictor (TF-IDF + LogisticRegression) for smoke tests
5. Optional Hugging Face transformer prediction utilities (used only if `transformers` and `torch` are installed)
6. Submission helper

Note: This reconstructed notebook intentionally avoids long training runs. It includes code paths that gracefully fall back to lightweight methods if heavy packages or GPUs are unavailable.

In [None]:
# Section: Environment check
import sys, os, platform

# sys.version may contain a line break; flatten it so the report stays on one
# line. The newline must be written as the escape sequence '\n' -- a literal
# line break inside the quotes is a syntax error.
print('Python:', sys.version.replace('\n', ' '))
print('Platform:', platform.platform())
print('CWD:', os.getcwd())
# __file__ may be undefined (e.g. when this runs as a notebook cell), hence the fallback
print('Notebook file:', __file__ if '__file__' in globals() else 'n/a')

In [None]:
# Section: Safe imports with optional pip install fallback
import importlib, subprocess, sys
def safe_import(pkg, import_name=None, install_name=None):
    """Import a module, installing it with pip first if it is missing.

    Args:
        pkg: Base package name; used as the default for both other names and
            echoed in the success message.
        import_name: Dotted module name handed to ``importlib.import_module``.
            Defaults to ``pkg``.
        install_name: Distribution name handed to ``pip install`` when the
            import fails. Defaults to ``pkg``.

    Returns:
        The imported module object.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits non-zero.
        ImportError: If the module still cannot be imported after installing.
    """
    import_name = import_name or pkg
    install_name = install_name or pkg
    try:
        module = importlib.import_module(import_name)
    except ImportError as e:
        # Only a failed import is worth a pip install; any other error raised
        # while importing would simply reappear after reinstalling.
        print(f'Module {import_name} not found: {e}. Attempting to pip install {install_name}...')
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', install_name])
        # The import system caches directory listings; without this the package
        # that pip just wrote to disk may not be visible to the running process.
        importlib.invalidate_caches()
        module = importlib.import_module(import_name)
        print(f'Installed and imported {import_name}')
        return module
    print(f'Imported {import_name} (from {pkg})')
    return module

# Lightweight dependencies go through safe_import, so a missing one is
# pip-installed on the spot.
np = safe_import('numpy')
pd = safe_import('pandas')
sklearn = safe_import('sklearn', 'sklearn', 'scikit-learn')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# transformers and torch are optional: they are only probed here (never
# installed), and HAS_TRANSFORMERS records whether both imports succeeded.
try:
    transformers = importlib.import_module('transformers')
    torch = importlib.import_module('torch')
except Exception:
    HAS_TRANSFORMERS = False
    print('Transformers/torch not available locally.')
else:
    HAS_TRANSFORMERS = True
    print('Transformers and torch available')

# Report the versions of the core numeric stack
import numpy as _np
import pandas as _pd
for _label, _module in (('numpy', _np), ('pandas', _pd)):
    print(_label, _module.__version__)

In [None]:
# Section: Data helpers - locate data folder and read text pairs
from pathlib import Path
# Every dataset location hangs off one root folder, relative to the working directory
DATA_ROOT = Path('data')
TRAIN_CSV = DATA_ROOT.joinpath('train.csv')  # read for the training labels
TRAIN_DIR = DATA_ROOT.joinpath('train')      # one sub-folder per training article
TEST_DIR = DATA_ROOT.joinpath('test')        # one sub-folder per test article

def read_pair_from_dir(article_dir):
    """Read the two candidate texts stored in one article folder.

    Args:
        article_dir: Folder (``str`` or ``Path``) expected to contain
            ``file_1.txt`` and ``file_2.txt``.

    Returns:
        A ``(text1, text2)`` tuple of strings decoded as UTF-8. A text is
        ``''`` when its file is missing or is not a regular file.
    """
    article_dir = Path(article_dir)  # tolerate plain string paths

    def _read(name):
        path = article_dir / name
        # is_file() rather than exists(): a directory that happens to carry
        # the expected name would otherwise make read_text() raise.
        return path.read_text(encoding='utf-8') if path.is_file() else ''

    return _read('file_1.txt'), _read('file_2.txt')

def preview_data(n=5):
    """Return a small preview of the first ``n`` training articles.

    Args:
        n: Maximum number of article folders (in sorted name order) to show.

    Returns:
        A DataFrame with columns ``id``, ``text1`` and ``text2`` (each text
        truncated to 400 characters). When ``TRAIN_DIR`` does not exist, a
        notice is printed and an empty list is returned instead.
    """
    import pandas as pd

    if not TRAIN_DIR.exists():
        print('No train directory found at', TRAIN_DIR)
        return []
    # Only sub-folders are articles; stray files in the train folder would
    # otherwise show up as bogus rows with two empty texts.
    article_dirs = sorted(p for p in TRAIN_DIR.iterdir() if p.is_dir())[:n]
    rows = []
    for article_dir in article_dirs:
        t1, t2 = read_pair_from_dir(article_dir)
        rows.append({'id': article_dir.name, 'text1': t1[:400], 'text2': t2[:400]})
    # Explicit columns keep the header present even when no article was found
    return pd.DataFrame(rows, columns=['id', 'text1', 'text2'])

# Show preview (if data exists); when the train directory is missing this
# prints a notice and evaluates to an empty list.
preview_data(3)

In [None]:
# Section: Baseline predictor (TF-IDF + LogisticRegression) - suitable for smoke tests
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

def build_baseline_model():
    """Create the unfitted TF-IDF + logistic-regression pipeline.

    Returns:
        A scikit-learn pipeline: a ``TfidfVectorizer`` (capped at 20000
        features, using 1- and 2-grams) followed by a ``LogisticRegression``
        allowed up to 1000 iterations.
    """
    vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
    classifier = LogisticRegression(max_iter=1000)
    return make_pipeline(vectorizer, classifier)

def prepare_baseline_dataset(max_samples=2000):
    """Build the ``(texts, labels)`` training set for the baseline classifier.

    When ``TRAIN_CSV`` exists, every row contributes two samples: the text of
    ``file_1.txt`` and of ``file_2.txt`` from ``TRAIN_DIR/<id>``, each labelled
    1 if ``real_text_id`` points at it and 0 otherwise. Without the CSV a
    four-sentence synthetic dataset is returned so smoke tests still run.

    Args:
        max_samples: Upper bound on the number of samples taken from the real
            data; larger datasets are randomly subsampled with a fixed seed.
            The synthetic fallback is not affected.

    Returns:
        A pair of pandas Series ``(X, y)``: the texts and their 0/1 labels.
    """
    if not TRAIN_CSV.exists():
        # Synthetic tiny dataset for smoke testing
        texts = ['This is real news about finance and the economy.',
                 'Clickbait fake news with false claims and sensational language.',
                 'Official press release with verified quotes and data.',
                 'Unverified rumors and fabricated stories.']
        labels = [1, 0, 1, 0]
        return pd.Series(texts), pd.Series(labels)

    df = pd.read_csv(TRAIN_CSV)
    rows = []
    # Walk the two columns directly. DataFrame.iterrows() builds one Series per
    # row, which upcasts integer ids to float as soon as the CSV also holds a
    # float column -- str(id) would then read e.g. '12.0' and miss the folder.
    for aid, real_id in zip(df['id'], df['real_text_id']):
        label = int(real_id)
        t1, t2 = read_pair_from_dir(TRAIN_DIR / str(aid))
        # Create two examples per article where each sample is a text labeled real/not
        rows.append({'text': t1, 'label': 1 if label == 1 else 0})
        rows.append({'text': t2, 'label': 1 if label == 2 else 0})
    # Explicit columns keep the lookups below valid even for an empty CSV
    dataset = pd.DataFrame(rows, columns=['text', 'label'])
    if len(dataset) > max_samples:
        dataset = dataset.sample(max_samples, random_state=0)
    X = dataset['text'].fillna('')
    y = dataset['label']
    return X, y

# Fit the baseline once when this cell runs, on at most 500 samples; `model`
# stays available to the cells below.
X, y = prepare_baseline_dataset(500)
model = build_baseline_model().fit(X, y)
print('Baseline model trained on', len(X), 'samples')

# Example: predict for a pair
def baseline_predict_pair(pipe, text1, text2):
    """Decide which of two texts the fitted pipeline considers more real.

    Args:
        pipe: Fitted classifier exposing ``predict_proba``; column 1 of its
            output is read as the probability of the "real" class.
        text1: First candidate text.
        text2: Second candidate text.

    Returns:
        ``(choice, confidence)`` where ``choice`` is 1 if ``text1`` scores
        strictly higher and 2 otherwise (ties go to 2), and ``confidence`` is
        the larger of the two scores.
    """
    real_scores = pipe.predict_proba([text1, text2])[:, 1]
    first, second = real_scores[0], real_scores[1]
    choice = 1 if first > second else 2
    return choice, max(first, second)

# Smoke prediction on two hand-written sentences; prints (chosen index, winning probability)
print(baseline_predict_pair(model, 'Official report with numbers and quotes.', 'Sensational rumor with no source.'))

In [None]:
# Section: Optional Transformers-based utilities (only if available)
# HAS_HF stays False unless every Hugging Face / torch import below succeeds
HAS_HF = False
try:
    import transformers
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    import torch
except Exception as e:
    print('Transformers not available or failed to import:', e)
else:
    HAS_HF = True
    print('Hugging Face transformers available:', transformers.__version__)

def load_transformer_model(model_path_or_name='bert-base-uncased', device=None):
    """Load a sequence-classification model and its tokenizer.

    Args:
        model_path_or_name: Value passed to ``from_pretrained`` for both the
            tokenizer and the model.
        device: Target device string. When falsy, ``'cuda'`` is chosen if
            available, otherwise ``'mps'`` if available, otherwise ``'cpu'``.

    Returns:
        ``(model, tokenizer, device)`` with the model already moved to
        ``device``.

    Raises:
        RuntimeError: If the Hugging Face imports failed (``HAS_HF`` is False).
    """
    if not HAS_HF:
        raise RuntimeError('Transformers not available in this environment')
    if not device:
        if torch.cuda.is_available():
            device = 'cuda'
        else:
            # torch.has_mps is deprecated and only says whether torch was built
            # with MPS; is_available() checks that the backend is usable now.
            # getattr keeps torch builds without an MPS backend module working.
            mps_backend = getattr(torch.backends, 'mps', None)
            mps_ok = mps_backend is not None and mps_backend.is_available()
            device = 'mps' if mps_ok else 'cpu'
    tokenizer = AutoTokenizer.from_pretrained(model_path_or_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_path_or_name)
    model.to(device)
    return model, tokenizer, device

def hf_predict_pair(model, tokenizer, text1, text2, device='cpu'):
    """Decide which of two texts a transformer classifier considers more real.

    Args:
        model: Callable model whose output exposes ``.logits``; class index 1
            is read as "real" per the training convention.
        tokenizer: Callable producing a dict of tensors for the two texts.
        text1: First candidate text.
        text2: Second candidate text.
        device: Device the encoded tensors are moved to; it should match the
            device the model lives on.

    Returns:
        ``(choice, confidence)`` where ``choice`` is 1 if ``text1`` scores
        strictly higher and 2 otherwise, and ``confidence`` is the larger of
        the two class-1 probabilities.
    """
    texts = [text1, text2]
    enc = tokenizer(texts, truncation=True, padding='max_length', max_length=256, return_tensors='pt')
    enc = {k: v.to(device) for k, v in enc.items()}
    # A model left in training mode keeps dropout active, which makes the
    # scores random. Switch to eval for the forward pass and restore the
    # caller's mode afterwards; getattr tolerates plain callables.
    was_training = bool(getattr(model, 'training', False))
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            outputs = model(**enc)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    finally:
        if was_training:
            model.train()
    # probs is indexed [sample, class]
    p1, p2 = probs[0, 1].item(), probs[1, 1].item()
    return (1 if p1 > p2 else 2), max(p1, p2)

# Status line only: it is printed whether or not the Hugging Face imports succeeded
print('Transformer utilities ready (if available)')

In [None]:
# Section: Submission helper (writes a CSV with id, real_text_id)
def create_submission_from_df(predictions, out_path='bert_submission.csv'):
    """Write predictions to a two-column submission CSV.

    Args:
        predictions: List of dicts (or anything ``pd.DataFrame`` accepts) with
            at least the keys ``id`` and ``real_text_id``; extra keys such as
            ``confidence`` are dropped.
        out_path: Destination CSV path.

    Raises:
        KeyError: If non-empty predictions lack ``id`` or ``real_text_id``.
    """
    columns = ['id', 'real_text_id']
    out = pd.DataFrame(predictions)
    if out.empty:
        # No predictions (e.g. the test folder was missing) produces a frame
        # without columns; add them so a header-only file is written instead
        # of failing with KeyError on the selection below.
        out = out.reindex(columns=columns)
    out = out[columns].copy()
    out.to_csv(out_path, index=False)
    print('Wrote submission to', out_path)

# Example building predictions using baseline model on test dir (if exists)
def predict_test_with_baseline(pipe, test_dir=TEST_DIR, limit=None):
    """Run the baseline pipeline over every article folder in a test directory.

    Args:
        pipe: Fitted pipeline passed on to ``baseline_predict_pair``.
        test_dir: Folder (``str`` or ``Path``) holding one sub-folder per
            article.
        limit: If truthy, only the first ``limit`` folders (sorted by name)
            are processed.

    Returns:
        A list of dicts with keys ``id`` (folder name), ``real_text_id`` and
        ``confidence``; empty when ``test_dir`` does not exist.
    """
    preds = []
    test_dir = Path(test_dir)  # tolerate plain string paths
    if not test_dir.exists():
        print('No test dir at', test_dir)
        return preds
    # Only sub-folders are articles; a stray file in the test folder would
    # otherwise become a bogus submission row scored on two empty texts.
    article_dirs = sorted(p for p in test_dir.iterdir() if p.is_dir())
    if limit:
        article_dirs = article_dirs[:limit]
    for article_dir in article_dirs:
        t1, t2 = read_pair_from_dir(article_dir)
        best, conf = baseline_predict_pair(pipe, t1, t2)
        preds.append({'id': article_dir.name, 'real_text_id': best, 'confidence': float(conf)})
    return preds

# Status line only: the helpers above are defined here but not invoked in this cell
print('Submission helper ready')

## How to run full training or transformer predictions

- To run full Hugging Face training or prediction: ensure `transformers` and `torch` are installed, place model checkpoints under `results/BERT-Base-Uncased-Best` (or pass a model name like `google/electra-base-discriminator`) and use `load_transformer_model()`.
- For quick smoke tests, use the baseline TF-IDF + LogisticRegression pipeline included above.

Suggested next step: run the notebook's key cells (environment check, imports, baseline training, and a smoke prediction) to validate this rebuilt notebook.