In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

def clean_data(df):
    """Clean the dataset by handling missing values and special codes"""
    print("\n[INFO] Cleaning Dataset...")
    
    # Replace special codes with NaN 
    special_codes = [91, 93, 94, 97, 98, 99]
    df = df.replace(special_codes, np.nan)
    print("[SUCCESS] Replaced special codes with missing values")
    
    # Drop columns with excessive missing values
    missing_threshold = 0.3
    cols_to_drop = df.columns[df.isnull().mean() > missing_threshold].tolist()
    if cols_to_drop:
        print(f"[WARNING] Dropping {len(cols_to_drop)} features with >{missing_threshold*100}% missing values")
        print("Dropped features:", cols_to_drop)
    else:
        print("[SUCCESS] No features exceeded missing value threshold")
    df = df.drop(columns=cols_to_drop)
    
    return df, cols_to_drop

def prepare_data(df, dropped_columns):
    """Prepare the data for different types of analysis"""
    print("\n[INFO] Preparing Data for Analysis...")
    
    # Create binary 
    df['alcohol_use_binary'] = df['IRALCFY'].apply(lambda x: 1 if x in [1, 2, 3] else 0)
    print("[SUCCESS] Created binary target: Use (1) vs. No Use (0)")
    
    # Create multi-class target 
    df['alcohol_use_multi'] = df['IRALCFY'].apply(lambda x: {
        0: 0,  # No use
        1: 1,  # Light use
        2: 2,  # Moderate use
        3: 3   # Heavy use
    }.get(x, 0))
    print("[SUCCESS] Created multi-class target: No Use (0), Light (1), Moderate (2), Heavy (3)")
    
    
    df['alcohol_use_regression'] = df['IRALCFY']
    print("[SUCCESS] Prepared original values for regression analysis")
    
    # Define feature groups 
    feature_groups = {
        'demographic': ['HEALTH2', 'INCOME', 'IRSEX'],
        'family': ['PARHLPHW', 'PRLMTTV2'],
        'school': ['AVGGRADE', 'SCHFELT'],
        'peer': ['FRDMEVR2', 'FRDMJMON'],
        'risk_perception': ['STNDALC', 'STNDSMJ']
    }
    
    # Remove dropped columns
    for group in feature_groups:
        feature_groups[group] = [col for col in feature_groups[group] if col not in dropped_columns]
    
    # Combine all features
    all_features = []
    for group in feature_groups.values():
        all_features.extend(group)
    
    print("\n[FEATURES] Feature Groups:")
    for group_name, features in feature_groups.items():
        print(f"- {group_name.capitalize()}: {len(features)} features")
        if len(features) > 0:
            print(f"  Features: {', '.join(features)}")
    
    # Create feature matrix 
    X = df[all_features]
    
    # Handle missing values
    print("\n[INFO] Handling missing values...")
    # Impute with median
    X = X.fillna(X.median())
    print("[SUCCESS] Imputed missing values with median")
    
    y_binary = df['alcohol_use_binary']
    y_multi = df['alcohol_use_multi']
    y_regression = df['alcohol_use_regression']
    
    return X, y_binary, y_multi, y_regression, all_features

def load_data(filepath):
    """Load the dataset from CSV file"""
    print("\n[INFO] Loading Dataset...")
    df = pd.read_csv(filepath)
    print(f"[SUCCESS] Loaded {len(df)} rows and {len(df.columns)} columns")
    return df

def create_preprocessing_pipeline(numerical_features, categorical_features):
    """Create a preprocessing pipeline for numerical and categorical features"""
    numeric_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    
    return preprocessor

def train_and_evaluate_models(X, y, task_type, preprocessor):
    """Train and evaluate models for a specific task"""
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Define models
    if task_type == 'binary':
        models = {
            'Decision Tree': DecisionTreeClassifier(random_state=42),
            'Random Forest': RandomForestClassifier(random_state=42),
            'Gradient Boosting': GradientBoostingClassifier(random_state=42)
        }
    elif task_type == 'multi':
        models = {
            'Decision Tree': DecisionTreeClassifier(random_state=42),
            'Random Forest': RandomForestClassifier(random_state=42)
        }
    else:  # regression
        models = {
            'Decision Tree': DecisionTreeRegressor(random_state=42),
            'Gradient Boosting': GradientBoostingRegressor(random_state=42)
        }
    
    results = {}
    for name, model in models.items():
        
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])
        
        # Train model
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        
        # Calculate metrics
        if task_type in ['binary', 'multi']:
            results[name] = {
                'accuracy': accuracy_score(y_test, y_pred),
                'balanced_accuracy': balanced_accuracy_score(y_test, y_pred),
                'precision': precision_score(y_test, y_pred, average='weighted'),
                'recall': recall_score(y_test, y_pred, average='weighted'),
                'f1': f1_score(y_test, y_pred, average='weighted'),
                'params': model.get_params()
            }
        else:
            results[name] = {
                'mse': mean_squared_error(y_test, y_pred),
                'r2': r2_score(y_test, y_pred),
                'params': model.get_params()
            }
    
    return results

def plot_results(binary_results, multi_results, regression_results):
    """Create visualizations for model comparisons"""
    # Plot binary 
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 3, 1)
    models = list(binary_results.keys())
    scores = [results['balanced_accuracy'] for results in binary_results.values()]
    plt.bar(models, scores)
    plt.title('Binary Classification\nBalanced Accuracy')
    plt.xticks(rotation=45)
    plt.ylim(0, 1)
    
    # Plot multi-class 
    plt.subplot(1, 3, 2)
    models = list(multi_results.keys())
    scores = [results['balanced_accuracy'] for results in multi_results.values()]
    plt.bar(models, scores)
    plt.title('Multi-class Classification\nBalanced Accuracy')
    plt.xticks(rotation=45)
    plt.ylim(0, 1)
    
    # Plot regression 
    plt.subplot(1, 3, 3)
    models = list(regression_results.keys())
    scores = [results['r2'] for results in regression_results.values()]
    plt.bar(models, scores)
    plt.title('Regression\nR² Score')
    plt.xticks(rotation=45)
    plt.ylim(-1, 1)
    
    plt.tight_layout()
    plt.savefig('model_comparison.png')
    plt.close()

def main():
    """Main function to run the analysis pipeline."""
    print("\n" + "="*50)
    print("Youth Drug Use Analysis")
    print("="*50 + "\n")

    print("[INFO] Loading Dataset...")
    df = load_data('C:/Users/dogak/Downloads/youth_data.csv')
    df, dropped_columns = clean_data(df)
    X, y_binary, y_multi, y_regression, all_features = prepare_data(df, dropped_columns)
    
    # Print missing values
    print("\n[INFO] Missing Value Statistics:")
    missing_stats = X.isnull().sum()
    print("\nFeatures with missing values:")
    for col in missing_stats[missing_stats > 0].index:
        print(f"- {col}: {missing_stats[col]} missing values ({missing_stats[col]/len(X)*100:.1f}%)")
    
    # Identify numerical and categorical features
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X.select_dtypes(include=['object']).columns.tolist()
    
    print("\n[FEATURES] Feature Types:")
    print(f"Numerical features: {len(numerical_features)}")
    print(f"Categorical features: {len(categorical_features)}")
    
   
    preprocessor = create_preprocessing_pipeline(numerical_features, categorical_features)
    
    # Train and evaluate models 
    print("\n" + "="*50)
    print("Binary Classification Analysis")
    print("="*50)
    binary_results = train_and_evaluate_models(X, y_binary, 'binary', preprocessor)
    
    print("\n" + "="*50)
    print("Multi-class Classification Analysis")
    print("="*50)
    multi_results = train_and_evaluate_models(X, y_multi, 'multi', preprocessor)
    
    print("\n" + "="*50)
    print("Regression Analysis")
    print("="*50)
    regression_results = train_and_evaluate_models(X, y_regression, 'regression', preprocessor)
    
    # Plot 
    plot_results(binary_results, multi_results, regression_results)
    
    # Print final summary
    print("\n" + "="*50)
    print("Final Results Summary")
    print("="*50)
    
    print("\nBinary Classification Results:")
    print("\nModel Comparison:")
    print("-" * 80)
    print(f"{'Model':<20} {'Accuracy':<10} {'Balanced Acc':<12} {'Precision':<10} {'Recall':<10} {'F1 Score':<10}")
    print("-" * 80)
    for name, result in binary_results.items():
        print(f"{name:<20} {result['accuracy']:.3f}      {result['balanced_accuracy']:.3f}       {result['precision']:.3f}      {result['recall']:.3f}      {result['f1']:.3f}")
    print("-" * 80)
    
    print("\nMulti-class Classification Results:")
    print("\nModel Comparison:")
    print("-" * 80)
    print(f"{'Model':<20} {'Accuracy':<10} {'Balanced Acc':<12} {'Precision':<10} {'Recall':<10} {'F1 Score':<10}")
    print("-" * 80)
    for name, result in multi_results.items():
        print(f"{name:<20} {result['accuracy']:.3f}      {result['balanced_accuracy']:.3f}       {result['precision']:.3f}      {result['recall']:.3f}      {result['f1']:.3f}")
    print("-" * 80)
    
    print("\nRegression Results:")
    print("\nModel Comparison:")
    print("-" * 50)
    print(f"{'Model':<20} {'MSE':<10} {'R² Score':<10}")
    print("-" * 50)
    for name, result in regression_results.items():
        print(f"{name:<20} {result['mse']:.3f}      {result['r2']:.3f}")
    print("-" * 50)
    
    # Print best parameters 
    print("\nBest Parameters for Each Model:")
    print("\nBinary Classification:")
    for name, result in binary_results.items():
        print(f"\n{name}:")
        for param, value in result['params'].items():
            print(f"- {param}: {value}")
    
    print("\nMulti-class Classification:")
    for name, result in multi_results.items():
        print(f"\n{name}:")
        for param, value in result['params'].items():
            print(f"- {param}: {value}")
    
    print("\nRegression:")
    for name, result in regression_results.items():
        print(f"\n{name}:")
        for param, value in result['params'].items():
            print(f"- {param}: {value}")
    
    print("\n" + "="*50)
    print("Analysis Complete!")
    print("="*50)

if __name__ == "__main__":
    main() 


Youth Drug Use Analysis

[INFO] Loading Dataset...

[INFO] Loading Dataset...
[SUCCESS] Loaded 10561 rows and 79 columns

[INFO] Cleaning Dataset...
[SUCCESS] Replaced special codes with missing values
Dropped features: ['IRCIGFM', 'IRSMKLSS30N', 'IRALCFM', 'IRMJFM']

[INFO] Preparing Data for Analysis...
[SUCCESS] Created binary target: Use (1) vs. No Use (0)
[SUCCESS] Created multi-class target: No Use (0), Light (1), Moderate (2), Heavy (3)
[SUCCESS] Prepared original values for regression analysis

[FEATURES] Feature Groups:
- Demographic: 3 features
  Features: HEALTH2, INCOME, IRSEX
- Family: 2 features
  Features: PARHLPHW, PRLMTTV2
- School: 2 features
  Features: AVGGRADE, SCHFELT
- Peer: 2 features
  Features: FRDMEVR2, FRDMJMON
- Risk_perception: 2 features
  Features: STNDALC, STNDSMJ

[INFO] Handling missing values...
[SUCCESS] Imputed missing values with median

[INFO] Missing Value Statistics:

Features with missing values:

[FEATURES] Feature Types:
Numerical features:

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Multi-class Classification Analysis

Regression Analysis

Final Results Summary

Binary Classification Results:

Model Comparison:
--------------------------------------------------------------------------------
Model                Accuracy   Balanced Acc Precision  Recall     F1 Score  
--------------------------------------------------------------------------------
Decision Tree        0.916      0.509       0.874      0.916      0.892
Random Forest        0.923      0.503       0.872      0.923      0.894
Gradient Boosting    0.929      0.500       0.863      0.929      0.895
--------------------------------------------------------------------------------

Multi-class Classification Results:

Model Comparison:
--------------------------------------------------------------------------------
Model                Accuracy   Balanced Acc Precision  Recall     F1 Score  
--------------------------------------------------------------------------------
Decision Tree        0.914      0.2