In [1]:
import pandas as pd
import os
import csv

# ==========================================
# CONFIGURATION
# ==========================================
DATA_PATH = "/home/delaunan/code/delaunan/clintrialpredict/data"

# Define the two configurations we want to test
CONFIGS = {
    "RISKY (Original)": {
        "sep": "|",
        "quoting": csv.QUOTE_MINIMAL,  # The setting that caused the crash
        "on_bad_lines": "warn",        # Default behavior (might be 'error' in old pandas)
        "low_memory": False
    },
    "ROBUST (New)": {
        "sep": "|",
        "quoting": 3,                  # csv.QUOTE_NONE (The Fix)
        "on_bad_lines": "warn",
        "low_memory": False
    }
}

def count_rows(filepath, params, name):
    """Attempts to load a file and counts rows. Returns count or 'CRASHED'."""
    try:
        # We only load 1 column to make it fast, just to check parsing
        # We read the first column usually 'nct_id'
        df = pd.read_csv(filepath, usecols=[0], **params)
        return len(df)
    except Exception as e:
        return f"CRASHED ({str(e)[:50]}...)"

def run_diagnostic():
    print(f"--- DIAGNOSTIC REPORT ---")
    print(f"Data Source: {DATA_PATH}\n")

    files_to_test = ['studies.txt', 'interventions.txt']

    # Header for the table
    print(f"{'FILE':<20} | {'METHOD':<18} | {'STATUS':<15} | {'ROW COUNT'}")
    print("-" * 70)

    for filename in files_to_test:
        filepath = os.path.join(DATA_PATH, filename)

        if not os.path.exists(filepath):
            print(f"{filename:<20} | FILE NOT FOUND")
            continue

        # Test 1: Risky Method
        res_risky = count_rows(filepath, CONFIGS["RISKY (Original)"], "RISKY")
        print(f"{filename:<20} | {'RISKY (Original)':<18} | {'Check':<15} | {res_risky}")

        # Test 2: Robust Method
        res_robust = count_rows(filepath, CONFIGS["ROBUST (New)"], "ROBUST")
        print(f"{filename:<20} | {'ROBUST (New)':<18} | {'Check':<15} | {res_robust}")

        # Calculate Difference if both succeeded
        if isinstance(res_risky, int) and isinstance(res_robust, int):
            diff = res_robust - res_risky
            print(f"{' ':<20} | {'DIFFERENCE':<18} | {'LOSS':<15} | -{diff} rows")
        elif isinstance(res_risky, str) and "CRASH" in res_risky:
             print(f"{' ':<20} | {'DIFFERENCE':<18} | {'LOSS':<15} | TOTAL LOSS (Crash)")

        print("-" * 70)

if __name__ == "__main__":
    run_diagnostic()

--- DIAGNOSTIC REPORT ---
Data Source: /home/delaunan/code/delaunan/clintrialpredict/data

FILE                 | METHOD             | STATUS          | ROW COUNT
----------------------------------------------------------------------
studies.txt          | RISKY (Original)   | Check           | 558973
studies.txt          | ROBUST (New)       | Check           | 558973
                     | DIFFERENCE         | LOSS            | -0 rows
----------------------------------------------------------------------
interventions.txt    | RISKY (Original)   | Check           | 945857
interventions.txt    | ROBUST (New)       | Check           | 945857
                     | DIFFERENCE         | LOSS            | -0 rows
----------------------------------------------------------------------


In [2]:
import pandas as pd
import numpy as np
import os
import csv

# ==========================================
# 1. SETUP
# ==========================================
DATA_PATH = "/home/delaunan/code/delaunan/clintrialpredict/data"
TEST_FILENAME = "test_project_data_integrity.csv"
FULL_PATH = os.path.join(DATA_PATH, TEST_FILENAME)

# ==========================================
# 2. CREATE "TORTURE" DATA
# ==========================================
# We create 4 rows with specific "dangerous" text content
data = {
    'nct_id': ['NCT001', 'NCT002', 'NCT003', 'NCT004'],
    'target': [0, 1, 0, 1],

    # 1. Safe Row
    'official_title': ['Simple Study of Aspirin'],

    # 2. The "Quote" Trap (Contains " inside text)
    # Risk: Can confuse standard parsers
    'criteria': ['Inclusion: Patients with "severe" headaches.'],

    # 3. The "Pipe" Trap (Contains | inside text)
    # Risk: CRITICAL. If we use | as separator, this looks like a new column.
    'txt_tags': ['Drug A | Drug B | Drug C'],

    # 4. The "Newline" Trap (Contains \n inside text)
    # Risk: Can be interpreted as a new row
    'txt_int_names': ['Drug A\nDrug B\nDrug C']
}

# Fill missing spots to make dataframe complete
max_len = 4
for key in data:
    if len(data[key]) < max_len:
        data[key] = data[key] + ['Safe Text'] * (max_len - len(data[key]))

df_original = pd.DataFrame(data)

print(f":: Original Data Shape: {df_original.shape}")

# ==========================================
# 3. SIMULATE SAVE (Exactly as you should do it)
# ==========================================
print(f"\n>> Saving to {TEST_FILENAME} using sep='|'...")
# Note: We do NOT turn off quoting here. We let Pandas handle the saving safely.
df_original.to_csv(FULL_PATH, index=False, sep='|')

# ==========================================
# 4. SIMULATE LOAD (The Acid Test)
# ==========================================
# We test 2 Loading Methods

# Method A: The "Robust" Method (What you use for raw files)
# params: sep='|', quoting=3 (Ignore quotes)
print("\n[TEST A] Loading with 'Robust' method (quoting=3)...")
try:
    df_robust = pd.read_csv(
        FULL_PATH,
        sep='|',
        quoting=3,  # <--- The setting we are testing
        on_bad_lines='warn'
    )
    print(f"   Rows loaded: {len(df_robust)} / 4")
    if len(df_robust) < 4:
        print("   :warning: LOSS DETECTED! The 'Robust' method failed on processed data.")
    else:
        # Check if data is corrupted (columns shifted)
        # If the pipe row was split, the last column might be NaN or wrong
        if df_robust.iloc[2].isnull().any():
             print("   :x: CORRUPTION DETECTED: Data shifted due to pipes in text.")
        else:
             print("   :white_check_mark: Success.")
except Exception as e:
    print(f"   :x: CRASHED: {e}")

# Method B: The "Standard" Method
# params: sep='|', Standard Quoting (Respect quotes)
print("\n[TEST B] Loading with 'Standard' method (Default Quoting)...")
try:
    df_standard = pd.read_csv(
        FULL_PATH,
        sep='|',
        # We do NOT use quoting=3 here. We trust Pandas' default CSV behavior.
        on_bad_lines='warn'
    )
    print(f"   Rows loaded: {len(df_standard)} / 4")
    if len(df_standard) == 4:
        print("   :white_check_mark: Perfect Match.")
except Exception as e:
    print(f"   :x: CRASHED: {e}")

# ==========================================
# 5. CLEANUP
# ==========================================
# os.remove(FULL_PATH) # Uncomment to delete the test file

:: Original Data Shape: (4, 6)

>> Saving to test_project_data_integrity.csv using sep='|'...

[TEST A] Loading with 'Robust' method (quoting=3)...
   Rows loaded: 6 / 4
   :x: CORRUPTION DETECTED: Data shifted due to pipes in text.

[TEST B] Loading with 'Standard' method (Default Quoting)...
   Rows loaded: 4 / 4
   :white_check_mark: Perfect Match.


In [3]:
import pandas as pd
import os
import csv
import io

# ==========================================
# 1. CONFIGURATION
# ==========================================
DATA_PATH = "/home/delaunan/code/delaunan/clintrialpredict/data"
OUTPUT_CSV = "audit_project_data.csv"

# All files used in your ClinicalTrialLoader
FILES_TO_TEST = [
    'studies.txt',
    'interventions.txt',
    'countries.txt',
    'sponsors.txt',
    'designs.txt',
    'eligibilities.txt',
    'calculated_values.txt',
    'keywords.txt'
]

# ==========================================
# 2. DEFINING THE METHODS
# ==========================================
def get_row_count(filepath, method_name):
    """
    Attempts to count rows using a specific loading strategy.
    """
    try:
        if method_name == "RISKY":
            # The method that crashes on unbalanced quotes
            # We read only 1 column to make it fast
            df = pd.read_csv(
                filepath,
                sep='|',
                usecols=[0],
                quotechar='"',       # Standard quoting
                on_bad_lines='warn', # Skip bad lines
                low_memory=False
            )
        elif method_name == "ROBUST":
            # The method that ignores quotes (Your Fix)
            df = pd.read_csv(
                filepath,
                sep='|',
                usecols=[0],
                quoting=3,           # csv.QUOTE_NONE
                on_bad_lines='warn',
                low_memory=False
            )
        return len(df)
    except Exception as e:
        return "CRASHED"

# ==========================================
# 3. EXECUTION: STAGE 1 (RAW FILES)
# ==========================================
print("\n" + "="*80)
print(f"{'STAGE 1: RAW TXT LOAD AUDIT':^80}")
print("="*80)
print(f"{'FILENAME':<25} | {'RISKY (Original)':<18} | {'ROBUST (New)':<18} | {'DIFFERENCE'}")
print("-" * 80)

results = {}

for filename in FILES_TO_TEST:
    full_path = os.path.join(DATA_PATH, filename)

    if not os.path.exists(full_path):
        print(f"{filename:<25} | {'FILE NOT FOUND':<18} | {'-':<18} | -")
        continue

    # 1. Run Risky
    count_risky = get_row_count(full_path, "RISKY")

    # 2. Run Robust
    count_robust = get_row_count(full_path, "ROBUST")

    # 3. Compare
    diff_msg = ""
    if count_risky == "CRASHED":
        diff_msg = "!!! RISKY CRASHED"
    else:
        diff = count_robust - count_risky
        if diff == 0:
            diff_msg = "No Loss"
        elif diff > 0:
            diff_msg = f"Risky lost {diff} rows"
        else:
            diff_msg = f"Robust lost {abs(diff)} rows" # Unlikely

    print(f"{filename:<25} | {str(count_risky):<18} | {str(count_robust):<18} | {diff_msg}")

    # Store robust count for Stage 2 checks
    if isinstance(count_robust, int):
        results[filename] = count_robust

# ==========================================
# 4. EXECUTION: STAGE 2 (CSV CYCLE)
# ==========================================
print("\n" + "="*80)
print(f"{'STAGE 2: SAVE -> RELOAD STRESS TEST':^80}")
print("="*80)
print("Testing with 'studies.txt' (contains complex text)...\n")

studies_path = os.path.join(DATA_PATH, 'studies.txt')
save_path = os.path.join(DATA_PATH, OUTPUT_CSV)

if 'studies.txt' in results:
    try:
        # A. LOAD (Robust)
        # We load ALL columns this time to test text processing
        print("1. Loading studies.txt (Robust)...")
        df = pd.read_csv(studies_path, sep='|', quoting=3, on_bad_lines='warn', low_memory=False)
        original_count = len(df)
        print(f"   -> Loaded {original_count} rows.")

        # B. CLEAN (Simulating the _prepare_text step)
        print("2. Cleaning Data (Removing pipes '|' and newlines from text)...")
        # Identify object (text) columns
        text_cols = df.select_dtypes(include=['object']).columns
        for col in text_cols:
            # Fast vectorized replacement
            df[col] = df[col].astype(str).str.replace('|', ' ', regex=False).str.replace('\n', ' ', regex=False).str.replace('\r', ' ', regex=False)

        # C. SAVE
        print("3. Saving to CSV (sep='|')...")
        df.to_csv(save_path, sep='|', index=False)

        # D. RELOAD (Robust)
        print("4. Reloading from CSV (Robust)...")
        df_reloaded = pd.read_csv(save_path, sep='|', quoting=3, on_bad_lines='warn', low_memory=False)
        new_count = len(df_reloaded)
        print(f"   -> Reloaded {new_count} rows.")

        # E. VERDICT
        print("-" * 40)
        if original_count == new_count:
             print(f"VERDICT: :white_check_mark: 0 ROWS LOST. The pipeline is safe.")
        else:
             print(f"VERDICT: :x: LOST {original_count - new_count} ROWS.")

    except Exception as e:
        print(f"STAGE 2 FAILED: {e}")

else:
    print("Skipping Stage 2 (studies.txt failed to load in Stage 1).")

# Cleanup
if os.path.exists(save_path):
    os.remove(save_path)


                          STAGE 1: RAW TXT LOAD AUDIT                           
FILENAME                  | RISKY (Original)   | ROBUST (New)       | DIFFERENCE
--------------------------------------------------------------------------------
studies.txt               | 558973             | 558973             | No Loss
interventions.txt         | 945857             | 945857             | No Loss
countries.txt             | 763540             | 763540             | No Loss
sponsors.txt              | 894384             | 894384             | No Loss
designs.txt               | 554264             | 554264             | No Loss
eligibilities.txt         | 558028             | 558028             | No Loss
calculated_values.txt     | 558973             | 558973             | No Loss
keywords.txt              | 1466970            | 1466970            | No Loss

                      STAGE 2: SAVE -> RELOAD STRESS TEST                       
Testing with 'studies.txt' (contains complex text)

Skipping line 26917: expected 71 fields, saw 72
Skipping line 71531: expected 71 fields, saw 73
Skipping line 72332: expected 71 fields, saw 73
Skipping line 91121: expected 71 fields, saw 72
Skipping line 91997: expected 71 fields, saw 72
Skipping line 96504: expected 71 fields, saw 72
Skipping line 98819: expected 71 fields, saw 72
Skipping line 99685: expected 71 fields, saw 73
Skipping line 112093: expected 71 fields, saw 73
Skipping line 141292: expected 71 fields, saw 73
Skipping line 145008: expected 71 fields, saw 73
Skipping line 147915: expected 71 fields, saw 73
Skipping line 200367: expected 71 fields, saw 76
Skipping line 207631: expected 71 fields, saw 72
Skipping line 207734: expected 71 fields, saw 73
Skipping line 207742: expected 71 fields, saw 73
Skipping line 210198: expected 71 fields, saw 73
Skipping line 210646: expected 71 fields, saw 73
Skipping line 237825: expected 71 fields, saw 73
Skipping line 242137: expected 71 fields, saw 72
Skipping line 242709: expect

   -> Loaded 558918 rows.
2. Cleaning Data (Removing pipes '|' and newlines from text)...
3. Saving to CSV (sep='|')...
4. Reloading from CSV (Robust)...
   -> Reloaded 558918 rows.
----------------------------------------
VERDICT: :white_check_mark: 0 ROWS LOST. The pipeline is safe.


In [4]:
import pandas as pd
import os

# 1. SETUP
DATA_PATH = "/home/delaunan/code/delaunan/clintrialpredict/data"
FILE_NAME = "project_data.csv" # The file you just saved

# 2. THE ROBUST LOADER
# We use the exact same logic as your class to ensure 1-to-1 consistency.
def get_training_data():
    full_path = os.path.join(DATA_PATH, FILE_NAME)

    if not os.path.exists(full_path):
        print(f":x: Error: {full_path} not found.")
        return None

    print(f":hourglass_flowing_sand: Loading {FILE_NAME}...")

    df = pd.read_csv(
        full_path,
        sep='|',             # Matches the separator used in save()
        quoting=3,           # csv.QUOTE_NONE: Ignores all quotes (Safe because we removed pipes from text)
        on_bad_lines='warn', # Just in case, but shouldn't trigger if sanitation worked
        low_memory=False,    # Prevents dtypes warnings
        dtype=str            # Optional: Load as string first to be 100% safe, or let Pandas infer
    )

    # Optional: Convert numeric columns back to numbers if you used dtype=str
    # Example: df['target'] = pd.to_numeric(df['target'], errors='coerce')

    print(f":white_check_mark: Success! Loaded {len(df)} rows.")
    return df

# 3. EXECUTE
df = get_training_data()

# 4. QUICK CHECK (The "One-to-One" Verification)
if df is not None:
    print(f"\nShape: {df.shape}")
    print("Columns:", df.columns.tolist())

    # Check if text columns look right (not shifted)
    if 'txt_tags' in df.columns:
        print("\n--- Sample Text Check ---")
        print(df[['nct_id', 'txt_tags']].head(3))

:hourglass_flowing_sand: Loading project_data.csv...


Skipping line 17580: expected 1 fields, saw 2
Skipping line 28911: expected 1 fields, saw 2
Skipping line 44602: expected 1 fields, saw 7
Skipping line 69163: expected 1 fields, saw 2
Skipping line 70183: expected 1 fields, saw 4
Skipping line 79666: expected 1 fields, saw 3
Skipping line 103210: expected 1 fields, saw 3

  df = pd.read_csv(


:white_check_mark: Success! Loaded 105328 rows.

Shape: (105328, 1)
Columns: ['nct_id,start_date_type,start_date,study_type,overall_status,phase,number_of_arms,why_stopped,target,start_year,phase_ordinal,covid_exposure,includes_us,is_international,agency_class,allocation,intervention_model,primary_purpose,masking,gender,healthy_volunteers,adult,child,older_adult,num_primary_endpoints,best_pathology,therapeutic_area,therapeutic_subgroup_name,competition_broad,competition_niche,txt_tags,txt_criteria']


In [5]:
import pandas as pd
import os

# 1. SETUP
DATA_PATH = "/home/delaunan/code/delaunan/clintrialpredict/data"
FILE_NAME = "project_data.csv" # The file you just saved

# 2. THE ROBUST LOADER
# We use the exact same logic as your class to ensure 1-to-1 consistency.
def get_training_data():
    full_path = os.path.join(DATA_PATH, FILE_NAME)

    if not os.path.exists(full_path):
        print(f":x: Error: {full_path} not found.")
        return None

    print(f":hourglass_flowing_sand: Loading {FILE_NAME}...")

    df = pd.read_csv(
        full_path,
        sep='|',             # Matches the separator used in save()
        quoting=3,           # csv.QUOTE_NONE: Ignores all quotes (Safe because we removed pipes from text)
        on_bad_lines='warn', # Just in case, but shouldn't trigger if sanitation worked
        low_memory=False,    # Prevents dtypes warnings
        dtype=str            # Optional: Load as string first to be 100% safe, or let Pandas infer
    )

    # Optional: Convert numeric columns back to numbers if you used dtype=str
    # Example: df['target'] = pd.to_numeric(df['target'], errors='coerce')

    print(f":white_check_mark: Success! Loaded {len(df)} rows.")
    return df

# 3. EXECUTE
df = get_training_data()

# 4. QUICK CHECK (The "One-to-One" Verification)
if df is not None:
    print(f"\nShape: {df.shape}")
    print("Columns:", df.columns.tolist())

    # Check if text columns look right (not shifted)
    if 'txt_tags' in df.columns:
        print("\n--- Sample Text Check ---")
        print(df[['nct_id', 'txt_tags']].head(3))

:hourglass_flowing_sand: Loading project_data.csv...
:white_check_mark: Success! Loaded 105328 rows.

Shape: (105328, 1)
Columns: ['nct_id,start_date_type,start_date,study_type,overall_status,phase,number_of_arms,why_stopped,target,start_year,phase_ordinal,covid_exposure,includes_us,is_international,agency_class,allocation,intervention_model,primary_purpose,masking,gender,healthy_volunteers,adult,child,older_adult,num_primary_endpoints,best_pathology,therapeutic_area,therapeutic_subgroup_name,competition_broad,competition_niche,txt_tags,txt_criteria']


Skipping line 17580: expected 1 fields, saw 2
Skipping line 28911: expected 1 fields, saw 2
Skipping line 44602: expected 1 fields, saw 7
Skipping line 69163: expected 1 fields, saw 2
Skipping line 70183: expected 1 fields, saw 4
Skipping line 79666: expected 1 fields, saw 3
Skipping line 103210: expected 1 fields, saw 3

  df = pd.read_csv(


In [6]:
! pip install xgboost


Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.28.9-py3-none-manylinux_2_18_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.1.2-py3-none-manylinux_2_28_x86_64.whl (115.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.9/115.9 MB[0m [31m8.8 MB/s[0m  [33m0:00:13[0mm0:00:01[0m00:01[0m
[?25hDownloading nvidia_nccl_cu12-2.28.9-py3-none-manylinux_2_18_x86_64.whl (296.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.8/296.8 MB[0m [31m9.1 MB/s[0m  [33m0:00:31[0mm0:00:01[0m00:01[0mm
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [xgboost]m1/2[0m [xgboost]
[1A[2KSuccessfully installed nvidia-nccl-cu12-2.28.9 xgboost-3.1.2
