# DOD Prohibited Substances Database Analysis

This notebook provides tools to load and analyze the DOD prohibited substances database. We'll explore the data structure, perform basic analysis, and create visualizations to understand the substance database better.

## 1. Import Required Libraries

First, let's import all the necessary libraries for data manipulation, analysis, and visualization.

In [None]:
# Import standard libraries for data manipulation and analysis
import pandas as pd
import json
import sqlite3
import os
import warnings
warnings.filterwarnings('ignore')

# Import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set up plotting style
# Use matplotlib's 'default' style sheet and seaborn's "husl" palette for
# every figure drawn later in the notebook.
plt.style.use('default')
sns.set_palette("husl")

# Visible confirmation that this setup cell ran to completion.
print("Libraries imported successfully!")

## 2. Load the Substance Database

Let's load the substance database from the available data sources. We'll try multiple approaches to load the data.

In [None]:
# Load the substance database from the first available source (JSON file, SQLite database, or the SubstanceDatabase class)
def load_substance_database():
    """Load the substance database from the first available source.

    Sources are tried in this order:

    1. ``docs/data.json`` - decoded with ``json.load`` and wrapped in a
       DataFrame.
    2. ``substances.db`` - the ``substances`` table is read via ``sqlite3``.
    3. ``generate_docs.SubstanceDatabase`` - only initializes the database;
       no rows are read on this path.

    Returns:
        A ``pandas.DataFrame`` when source 1 or 2 is used, otherwise ``None``
        (database initialized but not read, or the import failed).
    """
    # Try loading from docs/data.json first
    json_path = "docs/data.json"
    if os.path.exists(json_path):
        print(f"Loading data from {json_path}...")
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        df = pd.DataFrame(data)
        print(f"✓ Successfully loaded {len(df)} substances from JSON file")
        return df

    # Try loading from SQLite database if exists
    db_path = "substances.db"
    if os.path.exists(db_path):
        print(f"Loading data from {db_path}...")
        conn = sqlite3.connect(db_path)
        try:
            df = pd.read_sql_query("SELECT * FROM substances", conn)
        finally:
            # Release the handle even when the query fails (e.g. the
            # "substances" table is missing).
            conn.close()
        print(f"✓ Successfully loaded {len(df)} substances from SQLite database")
        return df

    # Try using the SubstanceDatabase class from generate_docs.py
    try:
        from generate_docs import SubstanceDatabase, Settings
        print("Loading data using SubstanceDatabase class...")
        settings = Settings()
        # Instantiated only for its side effect:
        # this will create the database if it doesn't exist
        SubstanceDatabase(settings.db_file)
        print(f"✓ Database initialized at {settings.db_file}")
        return None  # Return None to indicate database exists but may be empty
    except ImportError as e:
        print(f"✗ Could not import database classes: {e}")
        return None

# Load the database and report its shape, or explain why nothing was loaded
df = load_substance_database()
if df is None:
    print("No data loaded - database may need to be populated first")
else:
    print(f"\nDataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

In [None]:
# Build an upper-cased name column used as the join key in a later cell.
# The loading cell may leave `df` as None, and the data may lack a "Name"
# column, so guard instead of raising.
if df is not None and "Name" in df.columns:
    df["DISPLAY_NAME"] = df["Name"].str.upper()
df

In [None]:

# UniiDataClient is imported from the project-local unii_client module.
from unii_client import UniiDataClient
client = UniiDataClient()
# Optional call, left disabled; its output is not shown in this notebook.
# client.get_data_info()

In [None]:
# Load the tab-separated UNII records export through the project client.
# NOTE(review): the path is relative, so this presumably expects the file in
# the working directory - confirm against UniiDataClient.load_csv_data.
unii_df = client.load_csv_data('UNII_Records_18Aug2025.txt', sep='\t')

In [None]:
# Join UNII records to the substance list on the DISPLAY_NAME column.
# The result is not assigned; as the cell's last expression it is only displayed.
# NOTE(review): assumes unii_df already has a DISPLAY_NAME column - confirm.
unii_df.merge(df, left_on='DISPLAY_NAME', right_on='DISPLAY_NAME')

In [None]:
def clean_substance_data(df):
    """Clean and preprocess the substance data.

    Works on a copy of ``df`` and:

    * copies the first of ``Name`` / ``name`` / ``substance_name`` into a
      ``name`` column,
    * adds ``<field>_parsed`` columns with JSON-decoded values for the known
      JSON fields (cells that are not valid JSON are kept unchanged),
    * strips whitespace from the known text fields and turns ``'nan'``,
      ``'None'`` and empty strings into missing values,
    * adds ``reason_category`` derived from the reason column.

    Args:
        df: DataFrame of substances, or ``None``.

    Returns:
        The cleaned copy, or ``None`` when ``df`` is ``None`` or empty.
    """
    if df is None or df.empty:
        print("No data to clean")
        return None

    print("=== DATA CLEANING ===")
    original_shape = df.shape

    # Create a copy to work with
    df_clean = df.copy()

    # Standardize column names (find main name column)
    name_column = None
    for col in ['Name', 'name', 'substance_name']:
        if col in df_clean.columns:
            name_column = col
            break

    if name_column and name_column != 'name':
        df_clean['name'] = df_clean[name_column]
        print(f"✓ Standardized name column from '{name_column}' to 'name'")

    # Parse JSON fields
    json_fields = ['other_names', 'classifications', 'Reasons', 'References']
    for field in json_fields:
        if field in df_clean.columns:
            try:
                # Decode cell by cell so one malformed value cannot discard
                # the parsed column for every other row.
                df_clean[field + '_parsed'] = df_clean[field].apply(_parse_json_cell)
                print(f"✓ Parsed JSON field: {field}")
            except Exception as e:
                print(f"⚠ Could not parse {field}: {e}")

    # Clean text fields
    text_fields = ['name', 'Reason', 'reason', 'searchable_name', 'Searchable_name']
    for field in text_fields:
        if field in df_clean.columns:
            # Remove extra whitespace and handle nulls
            stripped = df_clean[field].astype(str).str.strip()
            # astype(str) can turn missing values into the text 'nan'/'None';
            # map those, and empty strings, back to missing. An explicit mask
            # avoids replace(..., None), whose meaning differs between pandas
            # versions.
            is_blank = stripped.isin(['nan', 'None', ''])
            df_clean[field] = stripped.where(~is_blank)

    # Extract simplified reason categories
    reason_col = 'Reason' if 'Reason' in df_clean.columns else 'reason'
    if reason_col in df_clean.columns:
        df_clean['reason_category'] = df_clean[reason_col].apply(extract_reason_category)
        print("✓ Extracted reason categories")

    print(f"✓ Cleaning complete. Shape: {original_shape} → {df_clean.shape}")
    return df_clean


def _parse_json_cell(value):
    """Decode ``value`` from JSON when it is a string starting with ``[`` or ``{``.

    Any other value, and any string that fails to decode, is returned
    unchanged.
    """
    if isinstance(value, str) and value.strip().startswith(('[', '{')):
        try:
            return json.loads(value)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return value
    return value

def extract_reason_category(reason_text):
    """Extract the main prohibition category from free-text reason.

    Schedule numbers are matched as whole tokens, so "Schedule II" is not
    reported as "Schedule I". Categories are checked in a fixed priority
    order: Schedule I, II, III, then WADA, DoD policy, unapproved drug.
    Schedules other than I-III have no category of their own here and fall
    through to the keyword checks.

    Args:
        reason_text: Reason string; may be ``None``, empty or NaN.

    Returns:
        One of ``'Unknown'``, ``'Schedule I'``, ``'Schedule II'``,
        ``'Schedule III'``, ``'WADA Prohibited'``, ``'DoD Policy'``,
        ``'Unapproved Drug'`` or ``'Other'``.
    """
    import re  # local import: keeps this notebook cell self-contained

    if not reason_text or pd.isna(reason_text):
        return 'Unknown'

    reason_lower = str(reason_text).lower()

    # \b after the numeral stops "i" from matching inside "ii", "iii" or "iv".
    schedule_patterns = (
        ('Schedule I', r'\bschedule\s+(?:i|1)\b'),
        ('Schedule II', r'\bschedule\s+(?:ii|2)\b'),
        ('Schedule III', r'\bschedule\s+(?:iii|3)\b'),
    )
    for label, pattern in schedule_patterns:
        if re.search(pattern, reason_lower):
            return label

    keyword_labels = (
        ('wada', 'WADA Prohibited'),
        ('dodi', 'DoD Policy'),
        ('unapproved', 'Unapproved Drug'),
    )
    for keyword, label in keyword_labels:
        if keyword in reason_lower:
            return label

    return 'Other'

# Clean the data (only possible when the loading cell produced a DataFrame)
if df is None:
    print("No data loaded to clean. Please run the data loading cell first.")
else:
    df_clean = clean_substance_data(df)

## 5. Basic Data Analysis

Let's perform some basic exploratory data analysis to understand the substance database.

In [None]:
# Basic analysis of the substance database
if 'df_clean' in globals() and df_clean is not None:
    print("=== BASIC STATISTICS ===")
    print(f"Total substances: {len(df_clean):,}")

    # Analyze reason categories
    if 'reason_category' in df_clean.columns:
        print("\n=== PROHIBITION CATEGORIES ===")
        reason_counts = df_clean['reason_category'].value_counts()
        for category, count in reason_counts.items():
            percentage = (count / len(df_clean)) * 100
            print(f"{category:<20}: {count:>5} substances ({percentage:5.1f}%)")

        # Create a simple visualization
        plt.figure(figsize=(10, 6))
        reason_counts.plot(kind='bar', color='steelblue', alpha=0.7)
        plt.title('Distribution of Substances by Prohibition Category')
        plt.xlabel('Category')
        plt.ylabel('Number of Substances')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    # Analyze missing data patterns: keep only columns with at least one
    # null, most incomplete first
    print("\n=== MISSING DATA ANALYSIS ===")
    missing_data = df_clean.isnull().sum()
    missing_data = missing_data[missing_data > 0].sort_values(ascending=False)

    if len(missing_data) > 0:
        print("Fields with missing data:")
        for field, count in missing_data.head(10).items():
            percentage = (count / len(df_clean)) * 100
            print(f"{field:<25}: {count:>5} missing ({percentage:5.1f}%)")
    else:
        print("No missing data found!")

    # Sample some interesting substances
    print("\n=== SAMPLE SUBSTANCES ===")
    if 'name' in df_clean.columns:
        # Size the sample from the non-null names: dropna() can leave fewer
        # rows than len(df_clean), and sample() raises when asked for more
        # rows than exist.
        available_names = df_clean['name'].dropna()
        sample_substances = available_names.sample(min(5, len(available_names))).tolist()
        for i, substance in enumerate(sample_substances, 1):
            print(f"{i}. {substance}")

else:
    print("No cleaned data available. Please run the data cleaning cell first.")

## 6. Filter and Search Substances

Let's create some useful functions to filter and search through the substance database.

In [None]:
# Helper functions for searching and filtering the substance DataFrame
def search_substances_by_name(df, search_term, limit=10):
    """Search for substances whose name contains ``search_term``.

    The match is a case-insensitive literal substring test, so terms with
    regex metacharacters (e.g. ``"(+)-"``) are safe to pass.

    Args:
        df: DataFrame with a ``Name`` or ``name`` column, or ``None``.
        search_term: Text to look for inside the name.
        limit: Maximum number of rows to return.

    Returns:
        DataFrame with at most ``limit`` matching rows; an empty DataFrame
        when ``df`` is ``None``/empty or has no name column.
    """
    if df is None or df.empty:
        return pd.DataFrame()

    name_col = 'Name' if 'Name' in df.columns else 'name'
    if name_col not in df.columns:
        print("No name column found")
        return pd.DataFrame()

    # regex=False: treat the term literally instead of as a pattern;
    # na=False: rows with a missing name never match.
    mask = df[name_col].str.contains(search_term, case=False, na=False, regex=False)
    results = df[mask].head(limit)

    print(f"Found {len(results)} substances matching '{search_term}':")
    return results

def filter_by_category(df, category):
    """Return the rows whose ``reason_category`` equals ``category``.

    An empty DataFrame is returned when ``df`` is ``None`` or has no
    ``reason_category`` column; otherwise the match count is printed.
    """
    if df is None:
        return pd.DataFrame()
    if 'reason_category' not in df.columns:
        return pd.DataFrame()

    in_category = df['reason_category'] == category
    matches = df[in_category]
    print(f"Found {len(matches)} substances in category '{category}'")
    return matches

def get_substance_details(df, substance_name):
    """Get detailed information about a specific substance.

    The name is matched as a case-insensitive literal substring, so names
    with regex metacharacters (e.g. ``"(+)-"``) are safe to pass.

    Args:
        df: DataFrame with a ``Name`` or ``name`` column, or ``None``.
        substance_name: Text to look for inside the name.

    Returns:
        ``None`` when ``df`` is ``None``, has no name column, or nothing
        matches; a ``pandas.Series`` for exactly one match; a DataFrame of
        all candidates when several rows match.
    """
    if df is None:
        return None

    name_col = 'Name' if 'Name' in df.columns else 'name'
    if name_col not in df.columns:
        # Same message search_substances_by_name uses for this condition
        print("No name column found")
        return None

    # regex=False: treat the name literally; na=False: missing names never match
    matches = df[name_col].str.contains(substance_name, case=False, na=False, regex=False)
    substance = df[matches]

    if substance.empty:
        print(f"No substance found matching '{substance_name}'")
        return None

    if len(substance) > 1:
        # Ambiguous: hand back every candidate so the caller can narrow it down
        print(f"Multiple substances found matching '{substance_name}':")
        print(substance[name_col].tolist())
        return substance

    return substance.iloc[0]

# Example usage with the loaded data: one name search and one category filter
if 'df_clean' not in globals() or df_clean is None:
    print("No data available for search examples")
else:
    # Search for testosterone-related substances
    print("=== SEARCH EXAMPLE: Testosterone ===")
    testosterone_results = search_substances_by_name(df_clean, "testosterone", limit=5)
    if not testosterone_results.empty:
        # Prefer the original "Name" column, fall back to the standardized "name"
        name_col = 'name'
        if 'Name' in testosterone_results.columns:
            name_col = 'Name'
        for matched_name in testosterone_results[name_col]:
            print(f"- {matched_name}")

    # Filter by Schedule I substances
    print("\n=== FILTER EXAMPLE: Schedule I ===")
    schedule_i = filter_by_category(df_clean, "Schedule I")
    if not schedule_i.empty:
        print("Sample Schedule I substances:")
        name_col = 'name'
        if 'Name' in schedule_i.columns:
            name_col = 'Name'
        # Only the first three names are shown
        for substance in schedule_i[name_col].head(3):
            print(f"- {substance}")

## 7. Using the Substance and SubstanceDatabase Classes

Let's use the existing classes we've already written to work with the database more effectively.

In [None]:
# Import and use the existing classes
try:
    from generate_docs import Substance, SubstanceDatabase, Settings
    # The changelog names are imported here so later cells can use them.
    from changelog import ChangeType, SubstanceChange, DateChanges

    print("✓ Successfully imported existing classes!")

    # Initialize the database using our existing infrastructure
    settings = Settings()
    db = SubstanceDatabase(settings.db_file)

    print(f"✓ Connected to database: {settings.db_file}")

    # Get substances using the database class
    def get_substances_from_db():
        """Return all substances from ``db``; an empty list if retrieval fails."""
        try:
            substances = db.get_all_substances()
            print(f"✓ Retrieved {len(substances)} substances from database")
            return substances
        except Exception as e:
            print(f"⚠ Error retrieving substances: {e}")
            return []

    def analyze_substance_object(substance: Substance):
        """Print name, key, dates, reason and up to three other names.

        Returns the same ``substance`` object that was passed in.
        """
        print("\n=== SUBSTANCE ANALYSIS ===")
        print(f"Name: {substance.name}")
        print(f"Key: {substance.key}")
        print(f"Added: {substance.added_date}")
        print(f"Updated: {substance.updated_date}")

        # Get reason information (either key capitalization is accepted)
        reason = substance.data.get('Reason') or substance.data.get('reason', 'Not specified')
        print(f"Reason: {reason}")

        # Get other names if available
        other_names = substance.data.get('other_names')
        if other_names:
            try:
                import ast
                # Strings are parsed as Python literals; other values are used as-is
                names_list = ast.literal_eval(other_names) if isinstance(other_names, str) else other_names
                if names_list:
                    print(f"Other names: {', '.join(names_list[:3])}{'...' if len(names_list) > 3 else ''}")
            except Exception:
                # Best-effort display: skip values that cannot be parsed or
                # joined, but let KeyboardInterrupt/SystemExit propagate
                # (a bare ``except:`` would swallow those too).
                pass

        return substance

    def compare_substances_demo(substances_list):
        """Compare the first two substances and print the fields that differ.

        Returns the result of ``compare_with``, or ``None`` when fewer than
        two substances are supplied.
        """
        if len(substances_list) < 2:
            print("Need at least 2 substances for comparison demo")
            return

        print("\n=== SUBSTANCE COMPARISON DEMO ===")
        substance1 = substances_list[0]
        substance2 = substances_list[1]

        # Compare the substances
        changed_fields = substance1.compare_with(substance2)

        print(f"Comparing '{substance1.name}' with '{substance2.name}'")
        print(f"Changed fields: {changed_fields if changed_fields else 'None'}")

        return changed_fields

    # Get substances from database
    substances = get_substances_from_db()

    if substances:
        print("\n=== SAMPLE SUBSTANCE ANALYSIS ===")
        # Analyze first few substances
        for i, substance in enumerate(substances[:3], 1):
            print(f"\n--- Substance {i} ---")
            analyze_substance_object(substance)

        # Demo comparison
        if len(substances) >= 2:
            compare_substances_demo(substances)
    else:
        print("No substances found in database. You may need to run generate_docs.py first.")

except ImportError as e:
    print(f"⚠ Could not import classes: {e}")
    print("Make sure you're running this from the project directory")

## 8. Advanced Analysis with Substance Objects

Let's perform more advanced analysis using our Substance dataclass methods.

In [None]:
# Advanced analysis using Substance methods
def analyze_substance_timestamps(substances_list):
    """Print a summary of last-modified timestamps for the given substances.

    Each substance must provide ``get_last_modified_timestamp()``. Substances
    whose call fails, or whose timestamp is not greater than zero, are
    skipped. Prints the earliest and latest modification and a per-year
    count.

    Args:
        substances_list: Iterable of substance objects; nothing is printed
            when it is empty or ``None``.

    Returns:
        None. Output is printed.
    """
    from datetime import datetime  # local import: cell can run on its own

    if not substances_list:
        return

    print("=== TIMESTAMP ANALYSIS ===")

    # Get timestamps for all substances
    timestamps = []
    for substance in substances_list:
        try:
            timestamp = substance.get_last_modified_timestamp()
            if timestamp > 0:
                timestamps.append(timestamp)
        except Exception:
            # Best-effort: skip substances whose timestamp cannot be read or
            # compared, but let KeyboardInterrupt/SystemExit propagate.
            continue

    if not timestamps:
        print("No timestamp data available")
        return

    print(f"Substances with timestamps: {len(timestamps)}")
    print(f"Earliest modification: {datetime.fromtimestamp(min(timestamps))}")
    print(f"Latest modification: {datetime.fromtimestamp(max(timestamps))}")

    # Show modification frequency by year
    years = [datetime.fromtimestamp(ts).year for ts in timestamps]
    year_counts = pd.Series(years).value_counts().sort_index()

    print("\nModifications by year:")
    for year, count in year_counts.items():
        print(f"  {year}: {count} substances")

def find_recently_modified_substances(substances_list, days_ago=30):
    """Return the substances modified during the last ``days_ago`` days.

    The cutoff is "now minus ``days_ago`` days" as an integer Unix timestamp;
    each substance decides membership through ``was_modified_since``. A short
    report listing up to ten hits is printed. An empty list is returned when
    no substances are supplied.
    """
    from datetime import timedelta

    if not substances_list:
        return []

    cutoff = datetime.now() - timedelta(days=days_ago)
    threshold_timestamp = int(cutoff.timestamp())

    recent_substances = [
        candidate
        for candidate in substances_list
        if candidate.was_modified_since(threshold_timestamp)
    ]

    print(f"=== RECENTLY MODIFIED SUBSTANCES (last {days_ago} days) ===")
    print(f"Found {len(recent_substances)} recently modified substances")

    # Only the first ten hits are listed
    for hit in recent_substances[:10]:
        modified_at = datetime.fromtimestamp(hit.get_last_modified_timestamp())
        print(f"- {hit.name} (modified: {modified_at.strftime('%Y-%m-%d')})")

    return recent_substances

def analyze_substance_sources(substances_list):
    """Print a summary of the source dates reported by the substances.

    Each substance must provide ``get_source_date()``; falsy results are
    ignored. With ten or fewer distinct dates every date is listed with its
    count, otherwise only the first and last date (in sorted order) are shown.

    Args:
        substances_list: Iterable of substance objects. ``None`` is treated
            like an empty list, matching the other analysis helpers.

    Returns:
        None. Output is printed.
    """
    from collections import Counter  # local import: cell can run on its own

    print("=== SOURCE ANALYSIS ===")

    source_dates = []
    for substance in substances_list or []:
        source_date = substance.get_source_date()
        if source_date:
            source_dates.append(source_date)

    if not source_dates:
        print("No source date information available")
        return

    # Tally once instead of calling list.count() for every distinct date
    date_counts = Counter(source_dates)
    unique_dates = sorted(date_counts)

    print(f"Substances with source dates: {len(source_dates)}")
    print(f"Unique source dates: {len(unique_dates)}")

    if len(unique_dates) <= 10:
        print("Source dates:")
        for date in unique_dates:
            print(f"  {date}: {date_counts[date]} substances")
    else:
        print(f"Date range: {unique_dates[0]} to {unique_dates[-1]}")
# Run the three advanced analyses, separated by rulers, when substances exist
if 'substances' not in globals() or not substances:
    print("No substances available for advanced analysis")
else:
    analyze_substance_timestamps(substances)
    print("\n" + "=" * 50 + "\n")
    # Look back 90 days for recent modifications
    recent = find_recently_modified_substances(substances, days_ago=90)
    print("\n" + "=" * 50 + "\n")
    analyze_substance_sources(substances)

## 9. Working with Changelog Data

Let's explore the changelog functionality using our existing changelog classes.

In [None]:
# Work with changelog data using our existing classes
def demonstrate_changelog_functionality():
    """Parse ``docs/changelog.md`` and print summary statistics.

    Uses ``parse_existing_changelog_entries`` and ``ChangeType`` from the
    project's ``changelog`` module. Prints the totals of added, modified and
    removed entries and the per-date totals for the last five dates in
    sorted order.

    Returns:
        The parsed changelog object, or ``None`` when the file is missing or
        any error (including a failed import) occurs.
    """
    try:
        from changelog import parse_existing_changelog_entries, ChangeType

        print("=== CHANGELOG FUNCTIONALITY DEMO ===")

        # Check if changelog file exists
        changelog_path = "docs/changelog.md"
        if not os.path.exists(changelog_path):
            print(f"⚠ Changelog file not found: {changelog_path}")
            return None

        print(f"✓ Found changelog file: {changelog_path}")

        # Parse existing changelog entries
        parsed_changes = parse_existing_changelog_entries(changelog_path)
        by_date = parsed_changes.dates

        print(f"✓ Parsed {len(by_date)} dates from changelog")

        # Tally entries per change type across every date
        tallies = dict.fromkeys(ChangeType, 0)
        for entry in by_date.values():
            tallies[ChangeType.ADDED] += len(entry.added)
            tallies[ChangeType.MODIFIED] += len(entry.modified)
            tallies[ChangeType.REMOVED] += len(entry.removed)
        grand_total = (
            tallies[ChangeType.ADDED]
            + tallies[ChangeType.MODIFIED]
            + tallies[ChangeType.REMOVED]
        )

        print("\n=== CHANGELOG STATISTICS ===")
        print(f"Total changes recorded: {grand_total}")
        print(f"Added substances: {tallies[ChangeType.ADDED]}")
        print(f"Modified substances: {tallies[ChangeType.MODIFIED]}")
        print(f"Removed substances: {tallies[ChangeType.REMOVED]}")

        # Show recent dates (last five keys in sorted order)
        print("\nMost recent changelog dates:")
        for date in sorted(by_date)[-5:]:
            changes = by_date[date]
            total = sum(
                len(bucket)
                for bucket in (changes.added, changes.modified, changes.removed)
            )
            print(f"  {date}: {total} changes")

        return parsed_changes

    except Exception as e:
        print(f"⚠ Error working with changelog: {e}")
        return None

def create_sample_substance_change():
    """Build a demonstration ``SubstanceChange`` and print its fields.

    ``SubstanceChange`` and ``ChangeType`` must already be defined at module
    level (an earlier cell imports them). Any failure, including those names
    being undefined, is reported and ``None`` is returned.
    """
    try:
        demo_change = SubstanceChange(
            change_type=ChangeType.MODIFIED,
            substance_name="Sample Substance",
            changed_fields=["Reason", "Classifications"],
            details="Updated classification and reason"
        )

        print("=== SAMPLE SUBSTANCE CHANGE ===")
        # Each value is read lazily, right before its line is printed
        field_readers = (
            ("Type", lambda change: change.change_type.value),
            ("Substance", lambda change: change.substance_name),
            ("Changed fields", lambda change: change.changed_fields),
            ("Details", lambda change: change.details),
        )
        for label, read_value in field_readers:
            print(f"{label}: {read_value(demo_change)}")

        return demo_change
    except Exception as e:
        print(f"⚠ Error creating sample change: {e}")
        return None

# Run changelog analysis, then the sample-change demo, separated by a ruler
changelog_data = demonstrate_changelog_functionality()
print(f"\n{'=' * 50}\n")
sample_change = create_sample_substance_change()

## 10. Create Custom Analysis Functions

Let's create some custom analysis functions that leverage all our existing infrastructure.

In [None]:
# Custom analysis functions using our existing infrastructure
class SubstanceAnalyzer:
    """A helper class for analyzing substances using our existing infrastructure.

    Substances are loaded lazily from ``generate_docs.SubstanceDatabase`` the
    first time a query method needs them. If the database cannot be
    initialized the analyzer stays usable and behaves as if it held no
    substances.
    """

    def __init__(self, db_file="substances.db"):
        """Initialize the analyzer with a database connection.

        Args:
            db_file: Path handed to ``SubstanceDatabase``.
        """
        # Set every attribute up front so the other methods can rely on them
        # even when initialization below fails.
        self.settings = None
        self.db = None
        self.substances = []
        try:
            from generate_docs import SubstanceDatabase, Settings
            self.settings = Settings()
            self.db = SubstanceDatabase(db_file)
            print(f"✓ Initialized SubstanceAnalyzer with {db_file}")
        except Exception as e:
            print(f"⚠ Error initializing analyzer: {e}")
            self.db = None

    def load_substances(self):
        """Load all substances from the database.

        Returns:
            The loaded list, or ``[]`` when there is no database or loading
            fails (previously loaded substances are kept in that case).
        """
        if not self.db:
            return []

        try:
            self.substances = self.db.get_all_substances()
            print(f"✓ Loaded {len(self.substances)} substances")
            return self.substances
        except Exception as e:
            print(f"⚠ Error loading substances: {e}")
            return []

    def search_by_criteria(self, name_contains=None, reason_contains=None, limit=10):
        """Search substances by case-insensitive name and/or reason substrings.

        Args:
            name_contains: Text the substance name must contain, or ``None``.
            reason_contains: Text the ``Reason``/``reason`` data value must
                contain, or ``None``.
            limit: Stop collecting once this many matches were found.

        Returns:
            List of matching substances in stored order.
        """
        if not self.substances:
            self.load_substances()

        name_needle = name_contains.lower() if name_contains else None
        reason_needle = reason_contains.lower() if reason_contains else None

        results = []
        for substance in self.substances:
            if name_needle is not None:
                if not substance.name or name_needle not in substance.name.lower():
                    continue

            if reason_needle is not None:
                reason = substance.data.get('Reason', '') or substance.data.get('reason', '')
                if reason_needle not in str(reason).lower():
                    continue

            results.append(substance)
            if len(results) >= limit:
                break

        return results

    def analyze_field_usage(self, field_name):
        """Print how often ``field_name`` is populated across substances.

        A value counts as populated when it is truthy, not blank, and not the
        text ``'null'`` or ``'None'``.

        Returns:
            List of the populated values as strings (empty when no substances
            are available).
        """
        if not self.substances:
            self.load_substances()

        print(f"=== FIELD ANALYSIS: {field_name} ===")
        total = len(self.substances)
        print(f"Total substances: {total}")
        if total == 0:
            # Nothing to analyze; also avoids dividing by zero below
            return []

        field_values = []
        for substance in self.substances:
            value = substance.data.get(field_name)
            if value and str(value).strip() and str(value) not in ['null', 'None']:
                field_values.append(str(value))
        empty_count = total - len(field_values)

        print(f"Field populated: {len(field_values)} ({len(field_values)/total*100:.1f}%)")
        print(f"Field empty: {empty_count} ({empty_count/total*100:.1f}%)")

        if field_values:
            distinct_values = sorted(set(field_values))
            print(f"Unique values: {len(distinct_values)}")

            # Show sample values (sorted so the output is reproducible)
            print(f"Sample values: {distinct_values[:5]}")

        return field_values

    def compare_two_substances(self, name1, name2):
        """Compare two substances looked up by partial name.

        Returns:
            The result of ``compare_with`` for the two substances, or ``None``
            when either one is not found.
        """
        substance1 = self.find_by_name(name1)
        substance2 = self.find_by_name(name2)

        if not substance1 or not substance2:
            print("One or both substances not found")
            return None

        changed_fields = substance1.compare_with(substance2)

        print(f"=== COMPARISON: {substance1.name} vs {substance2.name} ===")
        if changed_fields:
            print(f"Different fields: {changed_fields}")
            for field in changed_fields:
                val1 = substance1.data.get(field, 'Not set')
                val2 = substance2.data.get(field, 'Not set')
                print(f"  {field}: '{val1}' vs '{val2}'")
        else:
            print("No differences found")

        return changed_fields

    def find_by_name(self, name_search):
        """Return the first substance whose name contains ``name_search``.

        The match is case-insensitive; ``None`` is returned when nothing
        matches.
        """
        # Lazy-load like the other query methods
        if not self.substances:
            self.load_substances()

        needle = name_search.lower()
        for substance in self.substances:
            if substance.name and needle in substance.name.lower():
                return substance
        return None

# Create and use the analyzer
analyzer = SubstanceAnalyzer()
substances = analyzer.load_substances()

if substances:
    print("\n=== CUSTOM ANALYSIS EXAMPLES ===")

    # Example 1: Search for steroid-related substances
    print("\n--- Search Example ---")
    steroid_results = analyzer.search_by_criteria(name_contains="steroid", limit=5)
    for substance in steroid_results:
        print(f"- {substance.name}")

    # Example 2: Analyze label_terms field usage
    print("\n--- Field Analysis Example ---")
    analyzer.analyze_field_usage("label_terms")

    # Example 3: Find substances with specific reasons
    print("\n--- Reason Filter Example ---")
    wada_results = analyzer.search_by_criteria(reason_contains="WADA", limit=3)
    for substance in wada_results:
        # search_by_criteria matches on either 'Reason' or 'reason', so read
        # both here too; otherwise a match on the lowercase key would be
        # shown as 'Not specified'.
        reason = substance.data.get('Reason') or substance.data.get('reason', 'Not specified')
        print(f"- {substance.name}: {reason}")

else:
    print("No substances loaded for custom analysis")