In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder


# ----- Read the patient records -----
df = pd.read_csv('global_cancer_patients_2015_2024.csv')

# Re-express Year as an offset counted back from 2025 (i.e. 2025 - Year)
df['Year'] = 2025 - df['Year']

# Integer-encode Cancer_Stage into a new column; the fitted encoder stays
# available as `label_encoder`
label_encoder = LabelEncoder()
df['Cancer_Stage_Int'] = label_encoder.fit_transform(df['Cancer_Stage'])

# Remember which integer code was assigned to each original label
label_mapping = {label: code for code, label in enumerate(label_encoder.classes_)}

# Expand the categorical features into indicator (dummy) columns
categorical_cols = ['Gender', 'Cancer_Type']
X = pd.get_dummies(df, columns=categorical_cols)

# Y is the Country_Region column; X loses the ID, that column, and the raw
# Cancer_Stage text (its integer encoding Cancer_Stage_Int remains in X)
Y = X['Country_Region']
X = X.drop(columns=['Patient_ID', 'Country_Region', 'Cancer_Stage'])


# ----- Numeric-only view of df, used by the checks below -----
numeric_df = df.select_dtypes(include='number')

# ========================================
# 🔹 1. Filtered Correlation Report
# ========================================
def find_high_correlations(data, threshold=0.8):
    """Return the column pairs of `data` whose absolute correlation exceeds `threshold`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose pairwise column correlations are computed with
        ``data.corr()``.
    threshold : float, default 0.8
        Pairs are kept only when ``|r| > threshold`` (strict inequality).

    Returns
    -------
    pandas.DataFrame
        Columns ``feature_1``, ``feature_2``, ``correlation`` with one row per
        unordered pair (upper triangle only, so no self-pairs and no mirrored
        duplicates), sorted by ``correlation`` in descending order. Empty when
        no pair qualifies.
    """
    corr_matrix = data.corr().abs()

    # Positions strictly above the diagonal: each unordered pair exactly once.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    row_pos, col_pos = np.where(upper_mask)

    # Build the pair table from positions instead of stack()/reset_index():
    # reset_index() only yields 'level_0'/'level_1' when the axes are unnamed,
    # and raises outright when both axes share a name (which corr() produces
    # whenever the input's column Index is named).
    high_corr_pairs = pd.DataFrame({
        'feature_1': corr_matrix.index.to_numpy()[row_pos],
        'feature_2': corr_matrix.columns.to_numpy()[col_pos],
        'correlation': corr_matrix.to_numpy()[row_pos, col_pos],
    })

    # NaN correlations compare False against the threshold, so they drop out here.
    return high_corr_pairs[high_corr_pairs['correlation'] > threshold].sort_values(by='correlation', ascending=False)

# Scan the numeric columns for strongly correlated pairs
high_corrs = find_high_correlations(numeric_df, threshold=0.8)

print("\n🔗 Highly Correlated Feature Pairs (|r| > 0.8):")
if high_corrs.empty:
    print("None found.")
else:
    print(high_corrs)





🔗 Highly Correlated Feature Pairs (|r| > 0.8):
None found.


In [3]:
# ========================================
# 🔹 2. IQR Outlier Detection (with % reporting)
# ========================================
def detect_outliers_iqr(data, whisker=1.5):
    """Flag outliers in every column of `data` using the IQR rule.

    A value is flagged when it lies below ``Q1 - whisker * IQR`` or above
    ``Q3 + whisker * IQR``, where Q1/Q3 are the column's 25th/75th percentiles
    and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns all support ``quantile`` and ordering comparisons.
    whisker : float, default 1.5
        Multiplier applied to the IQR to set the fence distance.

    Returns
    -------
    outlier_flags : pandas.DataFrame
        Boolean frame sharing `data`'s index, with one column named
        ``"<col>_outlier_percentage"`` per input column.
    outlier_percentages : pandas.Series
        Percentage of flagged rows per flag column, sorted in descending order.
    """
    outlier_flags = pd.DataFrame(index=data.index)

    for col in data.columns:
        values = data[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - whisker * iqr
        upper_bound = q3 + whisker * iqr

        # f-string rather than `col + "..."` so non-string column labels
        # (e.g. default integer labels) do not raise TypeError.
        outlier_flags[f"{col}_outlier_percentage"] = (values < lower_bound) | (values > upper_bound)

    # Mean of a boolean column is the flagged fraction; scale to percent.
    outlier_percentages = outlier_flags.mean().sort_values(ascending=False) * 100
    return outlier_flags, outlier_percentages

# Run detection on the numeric features only. This cell adds an 'any_outlier'
# column to numeric_df further down, so on a re-run that derived boolean flag
# must be excluded or it would itself be scanned for outliers.
outlier_features = numeric_df.drop(columns=['any_outlier'], errors='ignore')
outlier_flags, outlier_percentages = detect_outliers_iqr(outlier_features)

print("\n🚨 Percentage of Rows with Outliers (IQR Method):")
print(outlier_percentages[outlier_percentages > 0].round(2).astype(str) + " %")

# Optional: Flag rows with any outlier
any_outlier = outlier_flags.any(axis=1)
numeric_df['any_outlier'] = any_outlier

# Optional: View rows with any outlier (mask shares df's index, so it aligns row-for-row)
outlier_rows = df[any_outlier]
print(f"\nFound {len(outlier_rows)} rows ({(len(outlier_rows) / len(df)) * 100:.2f}%) with at least one outlier.")


🚨 Percentage of Rows with Outliers (IQR Method):
Target_Severity_Score_outlier_percentage    0.3 %
dtype: object

Found 151 rows (0.30%) with at least one outlier.


In [4]:
# ========================================
# 🔹 3. Class Distribution
# ========================================
def display_class_distribution(data):
    """Print each distinct value's share of `data` as a percentage.

    Shares come from ``value_counts(normalize=True)``, are scaled to percent,
    rounded to two decimals, and printed with a trailing " %". Returns None.
    """
    shares = (data.value_counts(normalize=True) * 100).round(2)
    formatted = shares.astype(str) + " %"
    print("\n Class Distribution")
    print(formatted)

# ----- Report the class balance of Y (the Country_Region column) -----
display_class_distribution(Y)


 Class Distribution
Country_Region
Australia    10.18 %
USA          10.12 %
UK           10.12 %
India        10.08 %
Germany      10.05 %
Russia       10.03 %
Brazil       10.01 %
Pakistan      9.85 %
China         9.82 %
Canada        9.73 %
Name: proportion, dtype: object
