# Brain Tumor Analysis Complete Notebook
This notebook contains the complete analysis pipeline: data loading, data cleaning (missing-value imputation and outlier clipping), feature engineering, and model training and evaluation.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Data Loading and Initial Check

In [None]:
# Read the raw dataset from the CSV file in the working directory
print("Loading data...")
df = pd.read_csv('Brain_Tumor_Prediction_Dataset.csv')

# Summarise the frame: dimensions first, then the column names
print("\nDataset Shape:", df.shape)
print("\nColumns:", list(df.columns))

# Report the number of null entries in each column
missing_counts = df.isna().sum()
print("\nMissing Values:")
print(missing_counts)

## 2. Data Cleaning Functions

In [None]:
def clean_data(df):
    """Return a cleaned copy of ``df`` with missing values filled and outliers clipped.

    Categorical columns (``Treatment_Received``, ``Gender``,
    ``Brain_Tumor_Present``) have missing entries replaced by the column mode.
    Numerical columns (``Age``, ``Tumor_Size``, ``Genetic_Risk``,
    ``Survival_Rate(%)``) have missing entries replaced by the column median
    and are then clipped to ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``.

    Progress and diagnostics are printed to stdout.

    Args:
        df: Input DataFrame containing all of the columns listed above.
            It is not modified.

    Returns:
        A new DataFrame with the imputed and clipped values.

    Raises:
        KeyError: If any of the expected columns is absent from ``df``.
    """
    df = df.copy()

    print("Initial Missing Values:")
    print(df.isnull().sum())

    # Fill missing values in categorical columns.
    # NOTE(review): 'Brain_Tumor_Present' is used as the prediction target by
    # prepare_features; imputing labels with the mode may not be desirable --
    # confirm this is intended.
    categorical_cols = ['Treatment_Received', 'Gender', 'Brain_Tumor_Present']
    for col in categorical_cols:
        if df[col].isnull().any():
            modes = df[col].mode()
            if modes.empty:
                # Column is entirely missing, so there is no mode to impute
                # with; leave it for the final missing-value check to report.
                continue
            mode_value = modes.iloc[0]
            # Assign back instead of using inplace=True on a column selection:
            # chained in-place fills are deprecated and have no effect under
            # pandas Copy-on-Write.
            df[col] = df[col].fillna(mode_value)
            print(f"\nFilled {col} missing values with mode: {mode_value}")

    # Handle missing values in numerical columns
    numeric_cols = ['Age', 'Tumor_Size', 'Genetic_Risk', 'Survival_Rate(%)']

    for col in numeric_cols:
        print(f"\nColumn: {col}")
        print(f"Missing values: {df[col].isnull().sum()}")

        if df[col].isnull().any():
            median_value = df[col].median()
            df[col] = df[col].fillna(median_value)
            print(f"Filled missing values with median: {median_value:.2f}")

        # Handle outliers using IQR
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Count outliers with a boolean mask rather than materialising the
        # filtered frame just to take its length.
        outlier_mask = (df[col] < lower_bound) | (df[col] > upper_bound)
        print(f"Found {int(outlier_mask.sum())} outliers")

        # Clip outliers to the IQR fence
        df[col] = df[col].clip(lower_bound, upper_bound)

    # Final check for missing values
    missing = df.isnull().sum()
    if missing.sum() > 0:
        print("\nWarning: Still have missing values:")
        print(missing[missing > 0])
    else:
        print("\nSuccess: No missing values remain in the dataset!")

    return df

## 3. Feature Engineering

In [None]:
def engineer_features(df):
    """Return a copy of ``df`` with ``Risk_Score`` and ``Medical_Complexity`` added.

    ``Risk_Score`` is the weighted sum
    ``0.4 * Genetic_Risk + 0.3 * (Age clipped to [0, 100]) / 100
    + 0.3 * Tumor_Size / max(Tumor_Size)``.

    ``Medical_Complexity`` is the number (0-4) of the following conditions
    that hold for a row: ``Genetic_Risk > 7``, ``Age > 50``,
    ``Tumor_Size > 3``, ``Survival_Rate(%) < 50``.

    Args:
        df: DataFrame containing ``Genetic_Risk``, ``Age``, ``Tumor_Size``
            and ``Survival_Rate(%)``. It is not modified.

    Returns:
        A new DataFrame with the two engineered columns appended.
    """
    df = df.copy()

    # Normalise tumor size by the largest observed size. When that maximum is
    # 0 the ratio is undefined (0 / 0), so the term contributes nothing
    # instead of turning the whole score into NaN.
    max_tumor_size = df['Tumor_Size'].max()
    if max_tumor_size == 0:
        tumor_size_ratio = 0.0
    else:
        tumor_size_ratio = df['Tumor_Size'] / max_tumor_size

    # Risk Score (weighted combination of risk factors)
    # NOTE(review): Age and Tumor_Size are scaled to at most 1 here but
    # Genetic_Risk is used unscaled -- confirm that is intended.
    df['Risk_Score'] = (
        df['Genetic_Risk'] * 0.4 +
        df['Age'].clip(0, 100) / 100 * 0.3 +
        tumor_size_ratio * 0.3
    )

    # Medical complexity (count of high-risk factors), computed column-wise
    # rather than with a per-row apply so it is fast and also works on an
    # empty frame.
    high_risk_flags = (
        df['Genetic_Risk'] > 7,        # High genetic risk
        df['Age'] > 50,                # Advanced age
        df['Tumor_Size'] > 3,          # Large tumor
        df['Survival_Rate(%)'] < 50,   # Low survival rate
    )
    df['Medical_Complexity'] = sum(
        flag.astype('int64') for flag in high_risk_flags
    )

    return df

## 4. Prepare Features for Machine Learning

In [None]:
def prepare_features(df):
    """Build the model inputs ``(X, y)`` from a processed DataFrame.

    ``Gender`` is encoded as Female=0, Male=1, Other=2 and
    ``Brain_Tumor_Present`` as Yes=1, No=0; any value outside those mappings
    becomes NaN. The input frame is left untouched.

    Returns:
        A tuple ``(X, y)`` where ``X`` holds the seven feature columns and
        ``y`` is the encoded target series.
    """
    # assign() returns a new frame, so the caller's data is never modified
    encoded = df.assign(
        Gender=df['Gender'].map({'Female': 0, 'Male': 1, 'Other': 2})
    )

    # Encoded target labels
    y = encoded['Brain_Tumor_Present'].map({'Yes': 1, 'No': 0})

    # Columns fed to the models, in the order they appear in X
    X = encoded[[
        'Age',
        'Gender',
        'Tumor_Size',
        'Genetic_Risk',
        'Survival_Rate(%)',
        'Risk_Score',
        'Medical_Complexity',
    ]]

    return X, y

## 5. Run Complete Analysis

In [None]:
# Preprocessing pipeline: clean -> engineer -> assemble model inputs
print("Cleaning and preprocessing data...")
df_cleaned = clean_data(df)

print("\nEngineering features...")
df_processed = engineer_features(df_cleaned)

print("\nPreparing features...")
X, y = prepare_features(df_processed)

# Hold out 20% of the rows for evaluation, with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Register the classifiers to compare, keyed by display name
print("\nTraining models...")
models = {}
models['Random Forest'] = RandomForestClassifier(
    n_estimators=100, max_depth=10, random_state=42
)
models['Gradient Boosting'] = GradientBoostingClassifier(
    n_estimators=100, max_depth=5, random_state=42
)

# Fit each classifier and report its performance on the held-out split
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)

    print(f"\nResults for {name}:")
    print(classification_report(y_test, y_pred))

    # Feature importance is reported for the Random Forest only
    if name != 'Random Forest':
        continue

    importance = pd.DataFrame(
        {'Feature': X.columns, 'Importance': model.feature_importances_}
    )
    importance = importance.sort_values(by='Importance', ascending=False)
    print("\nFeature Importance:")
    print(importance)

    # Horizontal bar chart, most important feature first
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=importance)
    plt.title('Feature Importance in Random Forest Model')
    plt.show()