In [1]:
import pandas as pd
import os
import csv

# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------
# Folder that holds the pipe-delimited data files (studies.txt, ...).
# Set the CLINTRIALPREDICT_DATA_PATH environment variable to point at your own
# data folder; when it is unset, the original hard-coded location is used.
DATA_PATH = os.environ.get(
    "CLINTRIALPREDICT_DATA_PATH",
    "/home/delaunan/code/delaunan/clintrialpredict/data",
)

# Keyword arguments forwarded to the pd.read_csv calls below.
LOAD_PARAMS = {
    "sep": "|",                     # fields are pipe-delimited
    "dtype": str,                   # read every selected column as text
    "header": 0,                    # first line holds the column names
    "quotechar": '"',
    "quoting": csv.QUOTE_MINIMAL,
    "low_memory": False,
    "on_bad_lines": "warn"          # This is the key: bad lines are reported and dropped, not raised
}

def get_studies(include_problem_col=True, data_path=None, load_params=None):
    """
    Load studies.txt and keep only the interventional studies.

    Args:
        include_problem_col: If True, 'start_date_type' is added to the
            selected columns; this mimics the Notebook ("Strict") load.
            If False, the column is left out, which mimics the Old Python
            Script ("Lenient") load.
        data_path: Folder containing studies.txt. Defaults to the
            module-level DATA_PATH when None.
        load_params: Keyword arguments forwarded to pd.read_csv. Defaults to
            the module-level LOAD_PARAMS when None.

    Returns:
        A DataFrame (independent copy) holding the selected columns for the
        rows whose 'study_type' equals 'INTERVENTIONAL'.

    NOTE(review): both variants use the same reader options; whether adding a
    column to `usecols` changes which lines pandas treats as bad should be
    confirmed against the pandas read_csv documentation.
    """
    cols = [
        'nct_id', 'overall_status', 'study_type', 'phase',
        'start_date', 'number_of_arms', 'official_title', 'why_stopped'
    ]

    if include_problem_col:
        cols.append('start_date_type') # The "Bad Line" Trigger
        print(">>> Loading STRICT version (with start_date_type)...")
    else:
        print(">>> Loading LENIENT version (without start_date_type)...")

    # Fall back to the notebook-level configuration only when the caller did
    # not supply an override, so existing calls behave exactly as before.
    base_dir = DATA_PATH if data_path is None else data_path
    read_kwargs = LOAD_PARAMS if load_params is None else load_params

    path = os.path.join(base_dir, 'studies.txt')
    df = pd.read_csv(path, usecols=cols, **read_kwargs)

    # Apply Basic Filter (Interventional) to match your pipeline scope.
    # .copy() detaches the result from the full frame so callers can add or
    # overwrite columns without chained-assignment warnings.
    return df[df['study_type'] == 'INTERVENTIONAL'].copy()

# -----------------------------------------------------------------------------
# EXECUTION
# -----------------------------------------------------------------------------

# 1. Load Both Versions
df_strict = get_studies(include_problem_col=True)
df_lenient = get_studies(include_problem_col=False)

print(f"\nStrict Count (Notebook): {len(df_strict)}")
print(f"Lenient Count (Old Python): {len(df_lenient)}")

# 2. Identify the "Lost" Rows
# These are rows that exist in Lenient but were dropped in Strict
strict_ids = set(df_strict['nct_id'])
lost_rows = df_lenient[~df_lenient['nct_id'].isin(strict_ids)].copy()

# Reverse direction: rows whose nct_id appears only in Strict. Without this
# check, the "identical" verdict below would rest on a one-sided comparison.
extra_rows = df_strict[~df_strict['nct_id'].isin(df_lenient['nct_id'])]

print(f"\n>>> Found {len(lost_rows)} rows that are being dropped in the Notebook/Strict version.")
if len(extra_rows) > 0:
    print(f">>> Found {len(extra_rows)} rows that exist only in the Notebook/Strict version.")

# 3. Analyze the Lost Rows
if len(lost_rows) > 0:
    print("\n--- ANALYSIS OF LOST ROWS ---")

    # A. Status Distribution
    print("\n1. Status Distribution of Lost Rows:")
    print(lost_rows['overall_status'].value_counts().head())

    # B. Date Distribution
    # Unparseable dates become NaT (errors='coerce') and drop out of the counts.
    lost_rows['start_date'] = pd.to_datetime(lost_rows['start_date'], errors='coerce')
    start_year_counts = lost_rows['start_date'].dt.year.value_counts().sort_index()
    print("\n2. Start Year Distribution:")
    print(start_year_counts.head(5))
    print("...")
    print(start_year_counts.tail(5))

    # The wider column display is scoped to the two title printouts below so
    # the global pandas display option is restored for later cells.
    with pd.option_context('display.max_colwidth', 100):
        # C. Visual Inspection (The "Smell Test")
        print("\n3. Sample of Lost Rows (Check 'official_title' for corruption):")
        print(lost_rows[['nct_id', 'start_date', 'overall_status', 'official_title']].head(10))

        # D. Check for obvious corruption
        # Often bad lines have titles that look like they were cut off or merged
        print("\n4. Checking for Suspicious Titles (containing pipes '|' or quotes):")
        suspicious = lost_rows[lost_rows['official_title'].str.contains(r'\||"', regex=True, na=False)]
        print(f"   Found {len(suspicious)} rows with suspicious characters in the title.")
        if not suspicious.empty:
            print(suspicious[['nct_id', 'official_title']].head(3))

elif len(extra_rows) == 0:
    # Neither version contains an nct_id that the other one lacks.
    print("No difference found! The datasets are identical.")

>>> Loading STRICT version (with start_date_type)...
>>> Loading LENIENT version (without start_date_type)...

Strict Count (Notebook): 426907
Lenient Count (Old Python): 426907

>>> Found 0 rows that are being dropped in the Notebook/Strict version.
No difference found! The datasets are identical.


In [2]:
import pandas as pd
import os
import csv

# Folder that holds the data files. Set the CLINTRIALPREDICT_DATA_PATH
# environment variable to point at your own data folder; when it is unset,
# the original hard-coded location is used.
DATA_PATH = os.environ.get(
    "CLINTRIALPREDICT_DATA_PATH",
    "/home/delaunan/code/delaunan/clintrialpredict/data",
)

# Keyword arguments forwarded to the pd.read_csv calls in this cell.
LOAD_PARAMS = {
    "sep": "|",
    "dtype": str,
    "header": 0,
    "quotechar": '"',
    "quoting": csv.QUOTE_MINIMAL,
    "low_memory": False,
    "on_bad_lines": "warn",
}

# interventions.txt is parsed twice with the same reader options; the only
# difference between the two loads is whether the 'name' column is selected.
interventions_file = os.path.join(DATA_PATH, 'interventions.txt')

# 1. Load With Name (Old Python Style)
df_strict = pd.read_csv(
    interventions_file,
    usecols=['nct_id', 'intervention_type', 'name'],
    **LOAD_PARAMS,
)

# 2. Load Without Name (Notebook/New Python Style)
df_lenient = pd.read_csv(
    interventions_file,
    usecols=['nct_id', 'intervention_type'],
    **LOAD_PARAMS,
)

rows_with_name = len(df_strict)
rows_without_name = len(df_lenient)

print(f"With Name:    {rows_with_name}")
print(f"Without Name: {rows_without_name}")

# A row-count mismatch means the two column selections did not yield the same rows.
if rows_with_name != rows_without_name:
    print(f">>> FOUND DIFFERENCE: {rows_without_name - rows_with_name} rows dropped due to bad names.")
else:
    print(">>> CONFIRMED: No rows lost in interventions.txt. The difference was definitely the Year Filter.")

With Name:    945857
Without Name: 945857
>>> CONFIRMED: No rows lost in interventions.txt. The difference was definitely the Year Filter.
