In [1]:
import pandas as pd
import os
import csv

# ==========================================
# CONFIGURATION
# ==========================================
DATA_PATH = "/home/delaunan/code/delaunan/clintrialpredict/data"

# Define the two configurations we want to test
CONFIGS = {
    "RISKY (Original)": {
        "sep": "|",
        "quoting": csv.QUOTE_MINIMAL,  # The setting that caused the crash
        "on_bad_lines": "warn",        # Default behavior (might be 'error' in old pandas)
        "low_memory": False
    },
    "ROBUST (New)": {
        "sep": "|",
        "quoting": 3,                  # csv.QUOTE_NONE (The Fix)
        "on_bad_lines": "warn",
        "low_memory": False
    }
}

def count_rows(filepath, params, name):
    """Attempts to load a file and counts rows. Returns count or 'CRASHED'."""
    try:
        # We only load 1 column to make it fast, just to check parsing
        # We read the first column usually 'nct_id'
        df = pd.read_csv(filepath, usecols=[0], **params)
        return len(df)
    except Exception as e:
        return f"CRASHED ({str(e)[:50]}...)"

def run_diagnostic():
    print(f"--- DIAGNOSTIC REPORT ---")
    print(f"Data Source: {DATA_PATH}\n")

    files_to_test = ['studies.txt', 'interventions.txt']

    # Header for the table
    print(f"{'FILE':<20} | {'METHOD':<18} | {'STATUS':<15} | {'ROW COUNT'}")
    print("-" * 70)

    for filename in files_to_test:
        filepath = os.path.join(DATA_PATH, filename)

        if not os.path.exists(filepath):
            print(f"{filename:<20} | FILE NOT FOUND")
            continue

        # Test 1: Risky Method
        res_risky = count_rows(filepath, CONFIGS["RISKY (Original)"], "RISKY")
        print(f"{filename:<20} | {'RISKY (Original)':<18} | {'Check':<15} | {res_risky}")

        # Test 2: Robust Method
        res_robust = count_rows(filepath, CONFIGS["ROBUST (New)"], "ROBUST")
        print(f"{filename:<20} | {'ROBUST (New)':<18} | {'Check':<15} | {res_robust}")

        # Calculate Difference if both succeeded
        if isinstance(res_risky, int) and isinstance(res_robust, int):
            diff = res_robust - res_risky
            print(f"{' ':<20} | {'DIFFERENCE':<18} | {'LOSS':<15} | -{diff} rows")
        elif isinstance(res_risky, str) and "CRASH" in res_risky:
             print(f"{' ':<20} | {'DIFFERENCE':<18} | {'LOSS':<15} | TOTAL LOSS (Crash)")

        print("-" * 70)

if __name__ == "__main__":
    run_diagnostic()

--- DIAGNOSTIC REPORT ---
Data Source: /home/delaunan/code/delaunan/clintrialpredict/data

FILE                 | METHOD             | STATUS          | ROW COUNT
----------------------------------------------------------------------
studies.txt          | RISKY (Original)   | Check           | 558973
studies.txt          | ROBUST (New)       | Check           | 558973
                     | DIFFERENCE         | LOSS            | -0 rows
----------------------------------------------------------------------
interventions.txt    | RISKY (Original)   | Check           | 945857
interventions.txt    | ROBUST (New)       | Check           | 945857
                     | DIFFERENCE         | LOSS            | -0 rows
----------------------------------------------------------------------


In [2]:
import pandas as pd
import numpy as np
import os
import csv

# ==========================================
# 1. SETUP
# ==========================================
DATA_PATH = "/home/delaunan/code/delaunan/clintrialpredict/data"
TEST_FILENAME = "test_project_data_integrity.csv"
FULL_PATH = os.path.join(DATA_PATH, TEST_FILENAME)

# ==========================================
# 2. CREATE "TORTURE" DATA
# ==========================================
# We create 4 rows with specific "dangerous" text content
data = {
    'nct_id': ['NCT001', 'NCT002', 'NCT003', 'NCT004'],
    'target': [0, 1, 0, 1],

    # 1. Safe Row
    'official_title': ['Simple Study of Aspirin'],

    # 2. The "Quote" Trap (Contains " inside text)
    # Risk: Can confuse standard parsers
    'criteria': ['Inclusion: Patients with "severe" headaches.'],

    # 3. The "Pipe" Trap (Contains | inside text)
    # Risk: CRITICAL. If we use | as separator, this looks like a new column.
    'txt_tags': ['Drug A | Drug B | Drug C'],

    # 4. The "Newline" Trap (Contains \n inside text)
    # Risk: Can be interpreted as a new row
    'txt_int_names': ['Drug A\nDrug B\nDrug C']
}

# Fill missing spots to make dataframe complete
max_len = 4
for key in data:
    if len(data[key]) < max_len:
        data[key] = data[key] + ['Safe Text'] * (max_len - len(data[key]))

df_original = pd.DataFrame(data)

print(f":: Original Data Shape: {df_original.shape}")

# ==========================================
# 3. SIMULATE SAVE (Exactly as you should do it)
# ==========================================
print(f"\n>> Saving to {TEST_FILENAME} using sep='|'...")
# Note: We do NOT turn off quoting here. We let Pandas handle the saving safely.
df_original.to_csv(FULL_PATH, index=False, sep='|')

# ==========================================
# 4. SIMULATE LOAD (The Acid Test)
# ==========================================
# We test 2 Loading Methods

# Method A: The "Robust" Method (What you use for raw files)
# params: sep='|', quoting=3 (Ignore quotes)
print("\n[TEST A] Loading with 'Robust' method (quoting=3)...")
try:
    df_robust = pd.read_csv(
        FULL_PATH,
        sep='|',
        quoting=3,  # <--- The setting we are testing
        on_bad_lines='warn'
    )
    print(f"   Rows loaded: {len(df_robust)} / 4")
    if len(df_robust) < 4:
        print("   :warning: LOSS DETECTED! The 'Robust' method failed on processed data.")
    else:
        # Check if data is corrupted (columns shifted)
        # If the pipe row was split, the last column might be NaN or wrong
        if df_robust.iloc[2].isnull().any():
             print("   :x: CORRUPTION DETECTED: Data shifted due to pipes in text.")
        else:
             print("   :white_check_mark: Success.")
except Exception as e:
    print(f"   :x: CRASHED: {e}")

# Method B: The "Standard" Method
# params: sep='|', Standard Quoting (Respect quotes)
print("\n[TEST B] Loading with 'Standard' method (Default Quoting)...")
try:
    df_standard = pd.read_csv(
        FULL_PATH,
        sep='|',
        # We do NOT use quoting=3 here. We trust Pandas' default CSV behavior.
        on_bad_lines='warn'
    )
    print(f"   Rows loaded: {len(df_standard)} / 4")
    if len(df_standard) == 4:
        print("   :white_check_mark: Perfect Match.")
except Exception as e:
    print(f"   :x: CRASHED: {e}")

# ==========================================
# 5. CLEANUP
# ==========================================
# os.remove(FULL_PATH) # Uncomment to delete the test file

:: Original Data Shape: (4, 6)

>> Saving to test_project_data_integrity.csv using sep='|'...

[TEST A] Loading with 'Robust' method (quoting=3)...
   Rows loaded: 6 / 4
   :x: CORRUPTION DETECTED: Data shifted due to pipes in text.

[TEST B] Loading with 'Standard' method (Default Quoting)...
   Rows loaded: 4 / 4
   :white_check_mark: Perfect Match.


In [3]:
import pandas as pd
import os
import csv
import io

# ==========================================
# 1. CONFIGURATION
# ==========================================
DATA_PATH = "/home/delaunan/code/delaunan/clintrialpredict/data"
OUTPUT_CSV = "audit_project_data.csv"

# All files used in your ClinicalTrialLoader
FILES_TO_TEST = [
    'studies.txt',
    'interventions.txt',
    'countries.txt',
    'sponsors.txt',
    'designs.txt',
    'eligibilities.txt',
    'calculated_values.txt',
    'keywords.txt'
]

# ==========================================
# 2. DEFINING THE METHODS
# ==========================================
def get_row_count(filepath, method_name):
    """
    Attempts to count rows using a specific loading strategy.
    """
    try:
        if method_name == "RISKY":
            # The method that crashes on unbalanced quotes
            # We read only 1 column to make it fast
            df = pd.read_csv(
                filepath,
                sep='|',
                usecols=[0],
                quotechar='"',       # Standard quoting
                on_bad_lines='warn', # Skip bad lines
                low_memory=False
            )
        elif method_name == "ROBUST":
            # The method that ignores quotes (Your Fix)
            df = pd.read_csv(
                filepath,
                sep='|',
                usecols=[0],
                quoting=3,           # csv.QUOTE_NONE
                on_bad_lines='warn',
                low_memory=False
            )
        return len(df)
    except Exception as e:
        return "CRASHED"

# ==========================================
# 3. EXECUTION: STAGE 1 (RAW FILES)
# ==========================================
print("\n" + "="*80)
print(f"{'STAGE 1: RAW TXT LOAD AUDIT':^80}")
print("="*80)
print(f"{'FILENAME':<25} | {'RISKY (Original)':<18} | {'ROBUST (New)':<18} | {'DIFFERENCE'}")
print("-" * 80)

results = {}

for filename in FILES_TO_TEST:
    full_path = os.path.join(DATA_PATH, filename)

    if not os.path.exists(full_path):
        print(f"{filename:<25} | {'FILE NOT FOUND':<18} | {'-':<18} | -")
        continue

    # 1. Run Risky
    count_risky = get_row_count(full_path, "RISKY")

    # 2. Run Robust
    count_robust = get_row_count(full_path, "ROBUST")

    # 3. Compare
    diff_msg = ""
    if count_risky == "CRASHED":
        diff_msg = "!!! RISKY CRASHED"
    else:
        diff = count_robust - count_risky
        if diff == 0:
            diff_msg = "No Loss"
        elif diff > 0:
            diff_msg = f"Risky lost {diff} rows"
        else:
            diff_msg = f"Robust lost {abs(diff)} rows" # Unlikely

    print(f"{filename:<25} | {str(count_risky):<18} | {str(count_robust):<18} | {diff_msg}")

    # Store robust count for Stage 2 checks
    if isinstance(count_robust, int):
        results[filename] = count_robust

# ==========================================
# 4. EXECUTION: STAGE 2 (CSV CYCLE)
# ==========================================
print("\n" + "="*80)
print(f"{'STAGE 2: SAVE -> RELOAD STRESS TEST':^80}")
print("="*80)
print("Testing with 'studies.txt' (contains complex text)...\n")

studies_path = os.path.join(DATA_PATH, 'studies.txt')
save_path = os.path.join(DATA_PATH, OUTPUT_CSV)

if 'studies.txt' in results:
    try:
        # A. LOAD (Robust)
        # We load ALL columns this time to test text processing
        print("1. Loading studies.txt (Robust)...")
        df = pd.read_csv(studies_path, sep='|', quoting=3, on_bad_lines='warn', low_memory=False)
        original_count = len(df)
        print(f"   -> Loaded {original_count} rows.")

        # B. CLEAN (Simulating the _prepare_text step)
        print("2. Cleaning Data (Removing pipes '|' and newlines from text)...")
        # Identify object (text) columns
        text_cols = df.select_dtypes(include=['object']).columns
        for col in text_cols:
            # Fast vectorized replacement
            df[col] = df[col].astype(str).str.replace('|', ' ', regex=False).str.replace('\n', ' ', regex=False).str.replace('\r', ' ', regex=False)

        # C. SAVE
        print("3. Saving to CSV (sep='|')...")
        df.to_csv(save_path, sep='|', index=False)

        # D. RELOAD (Robust)
        print("4. Reloading from CSV (Robust)...")
        df_reloaded = pd.read_csv(save_path, sep='|', quoting=3, on_bad_lines='warn', low_memory=False)
        new_count = len(df_reloaded)
        print(f"   -> Reloaded {new_count} rows.")

        # E. VERDICT
        print("-" * 40)
        if original_count == new_count:
             print(f"VERDICT: :white_check_mark: 0 ROWS LOST. The pipeline is safe.")
        else:
             print(f"VERDICT: :x: LOST {original_count - new_count} ROWS.")

    except Exception as e:
        print(f"STAGE 2 FAILED: {e}")

else:
    print("Skipping Stage 2 (studies.txt failed to load in Stage 1).")

# Cleanup
if os.path.exists(save_path):
    os.remove(save_path)


                          STAGE 1: RAW TXT LOAD AUDIT                           
FILENAME                  | RISKY (Original)   | ROBUST (New)       | DIFFERENCE
--------------------------------------------------------------------------------
studies.txt               | 558973             | 558973             | No Loss
interventions.txt         | 945857             | 945857             | No Loss
countries.txt             | 763540             | 763540             | No Loss
sponsors.txt              | 894384             | 894384             | No Loss
designs.txt               | 554264             | 554264             | No Loss
eligibilities.txt         | 558028             | 558028             | No Loss
calculated_values.txt     | 558973             | 558973             | No Loss
keywords.txt              | 1466970            | 1466970            | No Loss

                      STAGE 2: SAVE -> RELOAD STRESS TEST                       
Testing with 'studies.txt' (contains complex text)

Skipping line 26917: expected 71 fields, saw 72
Skipping line 71531: expected 71 fields, saw 73
Skipping line 72332: expected 71 fields, saw 73
Skipping line 91121: expected 71 fields, saw 72
Skipping line 91997: expected 71 fields, saw 72
Skipping line 96504: expected 71 fields, saw 72
Skipping line 98819: expected 71 fields, saw 72
Skipping line 99685: expected 71 fields, saw 73
Skipping line 112093: expected 71 fields, saw 73
Skipping line 141292: expected 71 fields, saw 73
Skipping line 145008: expected 71 fields, saw 73
Skipping line 147915: expected 71 fields, saw 73
Skipping line 200367: expected 71 fields, saw 76
Skipping line 207631: expected 71 fields, saw 72
Skipping line 207734: expected 71 fields, saw 73
Skipping line 207742: expected 71 fields, saw 73
Skipping line 210198: expected 71 fields, saw 73
Skipping line 210646: expected 71 fields, saw 73
Skipping line 237825: expected 71 fields, saw 73
Skipping line 242137: expected 71 fields, saw 72
Skipping line 242709: expect

   -> Loaded 558918 rows.
2. Cleaning Data (Removing pipes '|' and newlines from text)...
3. Saving to CSV (sep='|')...
4. Reloading from CSV (Robust)...
   -> Reloaded 558918 rows.
----------------------------------------
VERDICT: :white_check_mark: 0 ROWS LOST. The pipeline is safe.


In [1]:
import pandas as pd
import os
import csv

# ==========================================
# 1. CONFIGURATION
# ==========================================
DATA_PATH = "/home/delaunan/code/delaunan/clintrialpredict/data"
FILE_NAME = "studies.txt"
FULL_PATH = os.path.join(DATA_PATH, FILE_NAME)

def test_strategy(name, params):
    print(f"\nRunning {name}...")
    try:
        # Load ALL columns (no usecols) to trigger full parsing logic
        df = pd.read_csv(FULL_PATH, **params)
        count = len(df)
        print(f"   -> :white_check_mark: Success. Loaded {count:,} rows.")
        return count
    except Exception as e:
        print(f"   -> :x: CRASHED: {e}")
        return "CRASHED"

def run_test():
    print(f"--- FINAL LOADING SHOWDOWN: {FILE_NAME} ---")

    if not os.path.exists(FULL_PATH):
        print(f"Error: File not found at {FULL_PATH}")
        return

    # ---------------------------------------------------------
    # STRATEGY A: YOUR "OLD WAY" (Polite / Trust Quotes)
    # ---------------------------------------------------------
    # This tries to respect quotes. If it works, it handles pipes inside quotes correctly.
    params_old = {
        "sep": "|",
        "dtype": str,
        "header": 0,
        "quotechar": '"',
        "quoting": csv.QUOTE_MINIMAL,  # <--- The key difference
        "low_memory": False,
        "on_bad_lines": "warn"
    }
    count_old = test_strategy("OLD WAY (QUOTE_MINIMAL)", params_old)

    # ---------------------------------------------------------
    # STRATEGY B: MY "NEW WAY" (Robust / Ignore Quotes)
    # ---------------------------------------------------------
    # This ignores quotes to prevent EOF crashes, but might skip rows with pipes in text.
    params_new = {
        "sep": "|",
        "dtype": str,
        "header": 0,
        "quotechar": '"',
        "quoting": 3,                  # <--- csv.QUOTE_NONE
        "low_memory": False,
        "on_bad_lines": "warn"
    }
    count_new = test_strategy("NEW WAY (QUOTE_NONE / 3)", params_new)

    # ---------------------------------------------------------
    # THE VERDICT
    # ---------------------------------------------------------
    print("\n" + "="*40)
    print("VERDICT")
    print("="*40)

    if count_old == "CRASHED":
        print("Winner: NEW WAY.")
        print("Reason: The Old Way crashed on this file.")

    elif isinstance(count_old, int) and isinstance(count_new, int):
        if count_old > count_new:
            diff = count_old - count_new
            print(f"Winner: OLD WAY (+{diff} rows).")
            print("Reason: Your data had pipes inside quotes. The Old Way correctly kept them.")
            print("Recommendation: REVERT to your original code.")

        elif count_new > count_old:
            diff = count_new - count_old
            print(f"Winner: NEW WAY (+{diff} rows).")
            print("Reason: The Old Way likely dropped lines due to bad quoting.")

        else:
            print("Result: TIE.")
            print("Reason: Both methods loaded the exact same number of rows.")
            print("Recommendation: Use NEW WAY (quoting=3) as it is safer against future 'unexpected end of data' crashes.")

if __name__ == "__main__":
    run_test()

--- FINAL LOADING SHOWDOWN: studies.txt ---

Running OLD WAY (QUOTE_MINIMAL)...
   -> :white_check_mark: Success. Loaded 558,973 rows.

Running NEW WAY (QUOTE_NONE / 3)...


Skipping line 26917: expected 71 fields, saw 72
Skipping line 71531: expected 71 fields, saw 73
Skipping line 72332: expected 71 fields, saw 73
Skipping line 91121: expected 71 fields, saw 72
Skipping line 91997: expected 71 fields, saw 72
Skipping line 96504: expected 71 fields, saw 72
Skipping line 98819: expected 71 fields, saw 72
Skipping line 99685: expected 71 fields, saw 73
Skipping line 112093: expected 71 fields, saw 73
Skipping line 141292: expected 71 fields, saw 73
Skipping line 145008: expected 71 fields, saw 73
Skipping line 147915: expected 71 fields, saw 73
Skipping line 200367: expected 71 fields, saw 76
Skipping line 207631: expected 71 fields, saw 72
Skipping line 207734: expected 71 fields, saw 73
Skipping line 207742: expected 71 fields, saw 73
Skipping line 210198: expected 71 fields, saw 73
Skipping line 210646: expected 71 fields, saw 73
Skipping line 237825: expected 71 fields, saw 73
Skipping line 242137: expected 71 fields, saw 72
Skipping line 242709: expect

   -> :white_check_mark: Success. Loaded 558,918 rows.

VERDICT
Winner: OLD WAY (+55 rows).
Reason: Your data had pipes inside quotes. The Old Way correctly kept them.
Recommendation: REVERT to your original code.


In [2]:
import pandas as pd
import os

# CONFIGURATION
DATA_PATH = "/home/delaunan/code/delaunan/clintrialpredict/data"
FILE_NAME = "project_data.csv"
FULL_PATH = os.path.join(DATA_PATH, FILE_NAME)

def compare_csv_loading():
    print(f"--- CSV LOADING AUDIT: {FILE_NAME} ---")

    if not os.path.exists(FULL_PATH):
        print(f":x: Error: File not found at {FULL_PATH}")
        return

    # ---------------------------------------------------------
    # METHOD A: DEFAULT (Fast C Engine)
    # ---------------------------------------------------------
    print("\n1. Running Method A (Default C Engine)...")
    try:
        # Default behavior of pandas
        df_a = pd.read_csv(FULL_PATH)
        count_a = len(df_a)
        print(f"   -> Success. Rows: {count_a}")
    except Exception as e:
        count_a = "CRASHED"
        print(f"   -> :x: CRASHED: {e}")

    # ---------------------------------------------------------
    # METHOD B: SMART (Python Engine - Recommended)
    # ---------------------------------------------------------
    print("\n2. Running Method B (Python Engine, Robust Quotes)...")
    try:
        # Explicitly handling complex text fields
        df_b = pd.read_csv(
            FULL_PATH,
            sep=',',
            quotechar='"',
            engine='python',    # Handles newlines in text better
            on_bad_lines='warn' # Safety net
        )
        count_b = len(df_b)
        print(f"   -> Success. Rows: {count_b}")
    except Exception as e:
        count_b = "CRASHED"
        print(f"   -> :x: CRASHED: {e}")

    # ---------------------------------------------------------
    # VERDICT
    # ---------------------------------------------------------
    print("\n" + "="*40)

    if count_a == count_b:
        print("VERDICT: :white_check_mark: PERFECT MATCH.")
        print("Both methods retrieve exactly the same data.")
        print("Recommendation: Use Method B (engine='python') just to be safe with multiline text.")
    else:
        print("VERDICT: :warning: DIFFERENCE DETECTED.")
        print(f"Method A: {count_a}")
        print(f"Method B: {count_b}")
        print("Recommendation: Use Method B. Method A likely split rows incorrectly due to newlines.")

if __name__ == "__main__":
    compare_csv_loading()

--- CSV LOADING AUDIT: project_data.csv ---

1. Running Method A (Default C Engine)...
   -> Success. Rows: 105336

2. Running Method B (Python Engine, Robust Quotes)...
   -> Success. Rows: 105336

VERDICT: :white_check_mark: PERFECT MATCH.
Both methods retrieve exactly the same data.
Recommendation: Use Method B (engine='python') just to be safe with multiline text.


In [4]:
import pandas as pd
import os
import csv

# CONFIGURATION
DATA_PATH = "/home/delaunan/code/delaunan/clintrialpredict/data"
FILE_NAME = "project_data.csv"
FULL_PATH = os.path.join(DATA_PATH, FILE_NAME)

def test_strategies():
    print(f"--- CSV LOADING SHOWDOWN: {FILE_NAME} ---")

    if not os.path.exists(FULL_PATH):
        print(f":x: Error: File not found at {FULL_PATH}")
        return

    # ---------------------------------------------------------
    # CONFIG 1: THE SMART WAY (Respects Quotes)
    # This keeps "Cancer, Lung" as 1 column.
    # This is what you WANT if the file is well-formed.
    # ---------------------------------------------------------
    print("\n1. Running Config 1 (Smart/Strict Quotes)...")
    try:
        df_smart = pd.read_csv(
            FULL_PATH,
            sep=',',
            quotechar='"',
            escapechar='\\',
            engine='python',
            on_bad_lines='warn'
        )
        count_smart = len(df_smart)
        cols_smart = len(df_smart.columns)
        print(f"   -> Success. Rows: {count_smart:,} | Columns: {cols_smart}")
    except Exception as e:
        count_smart = "CRASHED"
        cols_smart = 0
        print(f"   -> :x: CRASHED: {e}")

    # ---------------------------------------------------------
    # CONFIG 2: THE BRUTE WAY (Ignores Quotes)
    # This splits "Cancer, Lung" into 2 columns.
    # If this loads MORE rows than Config 1, your file has broken quotes.
    # ---------------------------------------------------------
    print("\n2. Running Config 2 (Brute/Ignore Quotes)...")
    try:
        df_brute = pd.read_csv(
            FULL_PATH,
            sep=',',
            quoting=3, # csv.QUOTE_NONE
            engine='python',
            on_bad_lines='warn'
        )
        count_brute = len(df_brute)
        cols_brute = len(df_brute.columns)
        print(f"   -> Success. Rows: {count_brute:,} | Columns: {cols_brute}")
    except Exception as e:
        count_brute = "CRASHED"
        print(f"   -> :x: CRASHED: {e}")

    # ---------------------------------------------------------
    # THE VERDICT
    # ---------------------------------------------------------
    print("\n" + "="*40)
    print("VERDICT")
    print("="*40)

    if count_smart == "CRASHED":
        print("Winner: CONFIG 2 (Brute Force).")
        print("Reason: Your CSV has fatal quote errors that crash the standard loader.")
        print("Advice: Use the Brute Force loader, but beware of split text columns.")

    elif isinstance(count_smart, int) and isinstance(count_brute, int):
        if count_smart >= count_brute:
            print(f"Winner: CONFIG 1 (Smart/Strict).")
            print("Reason: It loaded the same (or more) data and correctly handled commas inside text.")
            print("Advice: This is the correct, scientific way to load your data.")

        elif count_brute > count_smart:
            diff = count_brute - count_smart
            print(f"Winner: CONFIG 2 (Brute Force) +{diff} rows.")
            print("Reason: Config 1 dropped rows due to unbalanced quotes. Config 2 rescued them.")
            print("Advice: Use Config 2 if you absolutely need those rows, but check your text columns!")

if __name__ == "__main__":
    test_strategies()

--- CSV LOADING SHOWDOWN: project_data.csv ---

1. Running Config 1 (Smart/Strict Quotes)...
   -> Success. Rows: 105,336 | Columns: 32

2. Running Config 2 (Brute/Ignore Quotes)...



  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(

  df_brute = pd.read_csv(



   -> Success. Rows: 51,979 | Columns: 32

VERDICT
Winner: CONFIG 1 (Smart/Strict).
Reason: It loaded the same (or more) data and correctly handled commas inside text.
Advice: This is the correct, scientific way to load your data.
