In [1]:
import pandas as pd
import csv
import os
# ---------------------------------------------------------
# 1. CONFIGURATION
# ---------------------------------------------------------
# Folder that holds the AACT .txt exports (studies.txt, interventions.txt).
# Set the AACT_DATA_PATH environment variable to point at your own copy so
# the notebook runs on other machines without editing this cell; when the
# variable is unset, the fallback path below is used.
# Windows example: r"C:\Users\Name\Documents\AACT"
# Mac/Linux example: "/Users/name/data/AACT"
DATA_PATH = os.environ.get(
    "AACT_DATA_PATH",
    "/home/delaunan/code/delaunan/clintrialpredict/data",
)
# ---------------------------------------------------------
# 2. ROBUST LOADER FUNCTION
# ---------------------------------------------------------
def load_aact_file(filename, folder_path):
    """
    Load one pipe-delimited AACT export (.txt) into a DataFrame.

    Quote characters are treated as ordinary text (csv.QUOTE_NONE), so
    unbalanced quotes in free-text fields cannot raise parser errors.
    Lines with more fields than expected are skipped; pandas emits a
    warning for each skipped line.

    Parameters
    ----------
    filename : str
        Name of the file inside ``folder_path`` (e.g. 'studies.txt').
    folder_path : str
        Directory that contains the AACT files.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or None when the file does not exist or could
        not be read. Progress and failures are reported via print().
    """
    full_path = os.path.join(folder_path, filename)
    if not os.path.exists(full_path):
        print(f":x: Error: File not found at {full_path}")
        return None
    print(f":hourglass_flowing_sand: Loading {filename}...")
    try:
        df = pd.read_csv(
            full_path,
            sep='|',                 # AACT uses pipes, not commas
            quoting=csv.QUOTE_NONE,  # Named constant (== 3): quotes are plain characters
            low_memory=False,        # Read in one pass to avoid mixed-type warnings on large files
            on_bad_lines='warn'      # Warn about and skip lines with too many fields
        )
        print(f":white_check_mark: Success: {filename} loaded with {df.shape[0]:,} rows and {df.shape[1]} columns.")
        return df
    except Exception as e:
        # Deliberate best-effort: report the failure and hand back None so
        # the caller can decide how to proceed instead of the cell crashing.
        print(f":x: Failed to load {filename}: {e}")
        return None
# ---------------------------------------------------------
# 3. EXECUTION
# ---------------------------------------------------------
# Studies table: holds the target variable, 'overall_status'.
studies_df = load_aact_file(filename='studies.txt', folder_path=DATA_PATH)
# Interventions table: holds 'intervention_type', the column used to
# filter for 'Drug'.
interventions_df = load_aact_file(filename='interventions.txt', folder_path=DATA_PATH)
# ---------------------------------------------------------
# 4. QUICK INSPECTION
# ---------------------------------------------------------
# Each table is inspected only when it is not None, i.e. when its load
# above produced a DataFrame.
if studies_df is not None:
    print("\n--- Studies Sample ---")
    # Confirm the target column exists before anything relies on it.
    if 'overall_status' in studies_df.columns:
        print("Target column 'overall_status' found.")
    else:
        print("WARNING: 'overall_status' column missing. Check column names.")
if interventions_df is not None:
    print("\n--- Interventions Sample ---")
    # Show up to five of the distinct intervention types.
    if 'intervention_type' in interventions_df.columns:
        print(f"Intervention types present: {interventions_df['intervention_type'].unique()[:5]}")
    else:
        # Same warning as the studies check, so a missing column is never silent.
        print("WARNING: 'intervention_type' column missing. Check column names.")


:hourglass_flowing_sand: Loading studies.txt...


Skipping line 26917: expected 71 fields, saw 72
Skipping line 71531: expected 71 fields, saw 73
Skipping line 72332: expected 71 fields, saw 73
Skipping line 91121: expected 71 fields, saw 72
Skipping line 91997: expected 71 fields, saw 72
Skipping line 96504: expected 71 fields, saw 72
Skipping line 98819: expected 71 fields, saw 72
Skipping line 99685: expected 71 fields, saw 73
Skipping line 112093: expected 71 fields, saw 73
Skipping line 141292: expected 71 fields, saw 73
Skipping line 145008: expected 71 fields, saw 73
Skipping line 147915: expected 71 fields, saw 73
Skipping line 200367: expected 71 fields, saw 76
Skipping line 207631: expected 71 fields, saw 72
Skipping line 207734: expected 71 fields, saw 73
Skipping line 207742: expected 71 fields, saw 73
Skipping line 210198: expected 71 fields, saw 73
Skipping line 210646: expected 71 fields, saw 73
Skipping line 237825: expected 71 fields, saw 73
Skipping line 242137: expected 71 fields, saw 72
Skipping line 242709: expect

:white_check_mark: Success: studies.txt loaded with 558,918 rows and 71 columns.
:hourglass_flowing_sand: Loading interventions.txt...


Skipping line 37859: expected 5 fields, saw 6
Skipping line 37860: expected 5 fields, saw 6
Skipping line 76854: expected 5 fields, saw 6
Skipping line 81562: expected 5 fields, saw 7
Skipping line 92264: expected 5 fields, saw 6
Skipping line 141052: expected 5 fields, saw 6
Skipping line 146303: expected 5 fields, saw 6
Skipping line 155634: expected 5 fields, saw 7
Skipping line 187184: expected 5 fields, saw 6
Skipping line 187185: expected 5 fields, saw 6
Skipping line 189208: expected 5 fields, saw 6
Skipping line 203075: expected 5 fields, saw 7
Skipping line 205875: expected 5 fields, saw 10
Skipping line 205876: expected 5 fields, saw 10
Skipping line 230863: expected 5 fields, saw 6
Skipping line 230864: expected 5 fields, saw 6
Skipping line 230865: expected 5 fields, saw 6
Skipping line 238661: expected 5 fields, saw 6
Skipping line 273087: expected 5 fields, saw 6
Skipping line 288959: expected 5 fields, saw 7
Skipping line 288960: expected 5 fields, saw 7
Skipping line 28

:white_check_mark: Success: interventions.txt loaded with 945,797 rows and 5 columns.

--- Studies Sample ---
Target column 'overall_status' found.

--- Interventions Sample ---
Intervention types present: ['OTHER' 'DIAGNOSTIC_TEST' 'BEHAVIORAL' 'PROCEDURE' 'DRUG']


In [2]:
# 1. Check Studies Data
# studies_df is None when the load failed; guard so this cell reports that
# clearly instead of raising "'NoneType' object is not subscriptable".
print("--- Studies Data Check ---")
if studies_df is None:
    print("studies_df is not loaded; skipping studies check.")
else:
    print(studies_df[['nct_id', 'overall_status', 'phase']].head())
    print("\nUnique Statuses found:", studies_df['overall_status'].unique()[:5])

# 2. Check Interventions Data
# Same guard for the interventions table.
print("\n--- Interventions Data Check ---")
if interventions_df is None:
    print("interventions_df is not loaded; skipping interventions check.")
else:
    print(interventions_df[['nct_id', 'intervention_type', 'name']].head())

--- Studies Data Check ---
        nct_id overall_status   phase
0  NCT02113878      COMPLETED  PHASE1
1  NCT02531386      COMPLETED     NaN
2  NCT00385515      COMPLETED  PHASE2
3  NCT00000845      COMPLETED  PHASE1
4  NCT05233956     RECRUITING     NaN

Unique Statuses found: ['COMPLETED' 'RECRUITING' 'WITHDRAWN' 'TERMINATED' 'UNKNOWN']

--- Interventions Data Check ---
        nct_id intervention_type  \
0  NCT02782156             OTHER   
1  NCT04038827   DIAGNOSTIC_TEST   
2  NCT02413840        BEHAVIORAL   
3  NCT00230035         PROCEDURE   
4  NCT00230035         PROCEDURE   

                                                name  
0                                 Process evaluation  
1                                     D-CEC counting  
2                                   Baduanjin qigong  
3                                      Leukapheresis  
4  Non-myeloablative high dose immunosuppressive ...  
