# In [None]:
#IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

#LOAD DATASET
# Read the CSV into a DataFrame and print a quick structural overview.
filename = 'titanic.csv'
df = pd.read_csv(filename)
print("✅ Dataset Loaded Successfully!")
print("Shape:", df.shape)
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell; print it so the preview is actually shown.
print(df.head())
# Ten columns with the most missing values, worst first
print("\nMissing Values Summary:\n", df.isnull().sum().sort_values(ascending=False).head(10))
print("\nData Types:\n", df.dtypes)

#DATA CLEANING

# Handle missing values
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# SimpleImputer drops columns that contain no observed values, which would
# make the assignment back into `df` fail on a shape mismatch, and it rejects
# an empty column selection outright. Only impute columns that have at least
# one non-missing value; `num_cols` / `cat_cols` themselves stay unchanged.
num_fillable = [col for col in num_cols if df[col].notna().any()]
cat_fillable = [col for col in cat_cols if df[col].notna().any()]

# Impute numeric columns with median
num_imputer = SimpleImputer(strategy='median')
if num_fillable:
    df[num_fillable] = num_imputer.fit_transform(df[num_fillable])

# Impute categorical columns with most frequent (mode)
cat_imputer = SimpleImputer(strategy='most_frequent')
if cat_fillable:
    df[cat_fillable] = cat_imputer.fit_transform(df[cat_fillable])

# Remove duplicates (counted first so the number can be reported)
duplicates = df.duplicated().sum()
if duplicates > 0:
    df = df.drop_duplicates()
print("\nDuplicates removed:", duplicates)

# Outlier detection and treatment (IQR method)
def cap_outliers(series):
    """Clip values that fall outside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``; values beyond
    a fence are replaced by the fence itself.

    Args:
        series: Numeric pandas Series to cap.

    Returns:
        A new Series with out-of-fence values clipped. When the IQR is not
        positive (zero, or NaN for an empty / all-missing series) the data is
        returned as an unmodified copy.
    """
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    # With a zero IQR both fences coincide with the quartile value, so
    # clipping would flatten the whole column into a single constant.
    if not iqr > 0:
        return series.copy()
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    return series.clip(lower=lower, upper=upper)

# Cap each numeric column independently, then write the results back
capped = {name: cap_outliers(df[name]) for name in num_cols}
for name, values in capped.items():
    df[name] = values

print("✅ Data Cleaning Complete")
# Total count of missing cells left across the whole frame
remaining_missing = df.isnull().sum().sum()
print("Remaining Missing Values:", remaining_missing)


#DATA INTEGRATION

# Normalize the headers step by step: trim surrounding whitespace,
# lowercase, then swap inner spaces for underscores.
trimmed = df.columns.str.strip()
lowered = trimmed.str.lower()
df.columns = lowered.str.replace(' ', '_')
print("✅ Column names standardized")

#DATA TRANSFORMATION (UPDATED & FIXED)

# Re-identify categorical and numeric columns (after cleaning)
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

print("Categorical columns before encoding:", cat_cols)
print("Numeric columns before scaling:", num_cols)

# Label encode simple (low-cardinality) categorical columns
# The encoded copy is stored next to the original column as "<col>_LE";
# the original column is kept.
le = LabelEncoder()

for col in cat_cols:
    try:
        if df[col].nunique() <= 10:
            df[col + "_LE"] = le.fit_transform(df[col].astype(str))
    except Exception as e:
        # Best effort: report the problem and carry on with the other columns
        print(f"⚠️ Skipped {col} due to error: {e}")

# One-hot encode remaining categorical columns safely
# Membership is checked before df[col] is touched so a missing column is
# skipped instead of raising KeyError.
multi_cat_cols = [col for col in cat_cols if col in df.columns and df[col].nunique() > 10]
if multi_cat_cols:
    df = pd.get_dummies(df, columns=multi_cat_cols, drop_first=True)
    print("One-hot encoded columns:", multi_cat_cols)
else:
    print("No high-cardinality categorical columns for one-hot encoding.")

# Log transform skewed numeric columns
# This has to run on the raw values: once standardized, every column is
# centered on zero, so the positivity test below could never pass.
# NOTE(review): log1p also accepts zeros; relax the test to `>= 0` if
# zero-valued columns should be transformed too.
for col in num_cols:
    if (df[col] > 0).all() and abs(df[col].skew()) > 1:
        df[col + "_log"] = np.log1p(df[col])

# Scale numerical columns
# Only the original numeric columns are standardized; the derived "_LE" and
# "_log" columns are left as computed. The guard avoids fitting the scaler
# on an empty column selection.
scaler = StandardScaler()
if num_cols:
    df[num_cols] = scaler.fit_transform(df[num_cols])

print("✅ Data Transformation Completed Successfully!")
print("New shape:", df.shape)


#DATA REDUCTION (FIXED)

# Drop irrelevant ID or text-heavy columns
# NOTE(review): this is a plain substring match, so any column whose name
# merely contains "id" (for example "width") is dropped as well — confirm
# that this is acceptable for the dataset in use.
id_cols = [c for c in df.columns if any(x in c.lower() for x in ['id', 'name', 'ticket', 'cabin'])]
df.drop(columns=id_cols, inplace=True, errors='ignore')

print("Dropped irrelevant columns (if present):", id_cols)

# Keep only numeric columns for correlation
numeric_df = df.select_dtypes(include=[np.number])

# Correlation-based feature reduction
# Only the strict upper triangle is inspected, so each pair is looked at once
# and the later column of a highly correlated pair is the one dropped.
corr_matrix = numeric_df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if (upper[column] > 0.9).any()]
df.drop(columns=to_drop, inplace=True, errors='ignore')
print("Dropped highly correlated features:", to_drop)

# Refresh the numeric view: the earlier one was taken before the correlated
# columns were removed, so PCA would otherwise still run on them.
numeric_df = df.select_dtypes(include=[np.number])

# Optional PCA (retain 95% variance)
# The projection is only reported here; `df` itself is not replaced by it.
try:
    pca = PCA(n_components=0.95, random_state=42)
    pca_transformed = pca.fit_transform(numeric_df)
    print("✅ PCA applied successfully.")
    print("Original numeric shape:", numeric_df.shape)
    print("Reduced shape after PCA:", pca_transformed.shape)
except Exception as e:
    # PCA is optional, so a failure is reported rather than aborting the run
    print("⚠️ PCA skipped due to:", e)

print("✅ Data Reduction Complete")
print("Final dataset shape:", df.shape)