# In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

import warnings
# Silence all library warnings for cleaner notebook output.
# NOTE(review): this also hides scikit-learn convergence warnings.
warnings.filterwarnings("ignore")

# The bundled seaborn styles were renamed to 'seaborn-v0_8*' in matplotlib 3.6;
# fall back to the legacy name so older installations do not raise here.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')


# def preprocess_data_pipeline(df: pd.DataFrame) -> pd.DataFrame:
#     # 1. Define Numerical Columns
#     # Target: 'smoking' is excluded. Binary/Ordinal (hearing, Urine protein, dental caries) are also excluded.
#     numerical_cols = [
#         'age', 'height(cm)', 'weight(kg)', 'waist(cm)',
#         'eyesight(left)', 'eyesight(right)', 'systolic', 'relaxation',
#         'fasting blood sugar', 'Cholesterol', 'triglyceride', 'HDL', 'LDL',
#         'hemoglobin', 'serum creatinine', 'AST', 'ALT', 'Gtp'
#     ]

#     df_processed = df.copy()

#     # 2. Outlier Removal (IQR Capping)
#     for col in numerical_cols:
#         Q1 = df_processed[col].quantile(0.25)
#         Q3 = df_processed[col].quantile(0.75)
#         IQR = Q3 - Q1
#         lower_bound = Q1 - 1.5 * IQR
#         upper_bound = Q3 + 1.5 * IQR

#         # Cap the values to the calculated bounds
#         df_processed[col] = np.where(
#             df_processed[col] < lower_bound,
#             lower_bound,
#             df_processed[col]
#         )
#         df_processed[col] = np.where(
#             df_processed[col] > upper_bound,
#             upper_bound,
#             df_processed[col]
#         )

#     # 3. Skewness Reduction (Log Transformation)
#     # Recalculate skewness after outlier treatment for a precise transformation
#     skewness_after_outliers = df_processed[numerical_cols].skew()
#     # Apply np.log1p to columns with absolute skewness > 0.75
#     highly_skewed_cols = skewness_after_outliers[abs(skewness_after_outliers) > 0.75].index.tolist()

#     for col in highly_skewed_cols:
#         # np.log1p (log(1+x)) is used for positive skewness and to handle values close to zero.
#         df_processed[col] = np.log1p(df_processed[col])

#     return df_processed

def preprocess_data_drop_outliers(
    df: pd.DataFrame,
    numerical_cols=None,
    iqr_multiplier: float = 1.5,
    skew_threshold: float = 0.75,
) -> pd.DataFrame:
    """
    Drop IQR outlier rows, then log-transform the highly skewed columns.

    A row is kept only if every column in ``numerical_cols`` lies within
    ``[Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR]`` (bounds
    inclusive, quartiles computed per column on the input). Rows holding NaN
    in any of those columns fail the comparison and are dropped as well.
    Skewness is then recomputed on the remaining rows, and every column whose
    absolute skewness exceeds ``skew_threshold`` is replaced by ``np.log1p``
    of itself.

    Args:
        df: The input DataFrame. It is never modified.
        numerical_cols: Names of the columns to screen and transform.
            Defaults to the 18 continuous measurements of the smoking dataset.
        iqr_multiplier: Width of the accepted band in multiples of the IQR.
        skew_threshold: Absolute skewness above which a column is
            log-transformed.

    Returns:
        A new DataFrame containing only the non-outlier rows (original index
        labels preserved) with the skewed columns transformed.

    Raises:
        KeyError: If any requested column is missing from ``df``.
    """
    if numerical_cols is None:
        numerical_cols = [
            'age', 'height(cm)', 'weight(kg)', 'waist(cm)',
            'eyesight(left)', 'eyesight(right)', 'systolic', 'relaxation',
            'fasting blood sugar', 'Cholesterol', 'triglyceride', 'HDL', 'LDL',
            'hemoglobin', 'serum creatinine', 'AST', 'ALT', 'Gtp'
        ]
    else:
        numerical_cols = list(numerical_cols)

    # Report every missing column at once instead of failing on the first one.
    missing = [col for col in numerical_cols if col not in df.columns]
    if missing:
        raise KeyError(f"Columns not found in DataFrame: {missing}")

    # True marks rows that are inside the accepted band for every column so far.
    inlier_mask = pd.Series(True, index=df.index)
    for col in numerical_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        # between() is inclusive on both ends and yields False for NaN.
        inlier_mask &= df[col].between(
            q1 - iqr_multiplier * iqr, q3 + iqr_multiplier * iqr
        )

    # Explicit copy: the result is written to below, and must never alias `df`.
    df_processed = df.loc[inlier_mask].copy()

    # Skewness is measured after the outliers are gone, not on the raw input.
    skewness = df_processed[numerical_cols].skew()
    # NOTE(review): abs() also selects left-skewed columns, and log1p yields
    # NaN/-inf for values <= -1 — confirm both are acceptable for this data.
    highly_skewed_cols = skewness[skewness.abs() > skew_threshold].index.tolist()

    for col in highly_skewed_cols:
        df_processed[col] = np.log1p(df_processed[col])

    return df_processed

# Load the training CSV and report its dimensions.
train_df = pd.read_csv("/content/train_dataset.csv")
#test_df = pd.read_csv("/content/test_dataset.csv")

print("Train shape:", train_df.shape)
#print("Test shape:", test_df.shape)

train_df.head()


# Column dtypes / non-null counts, then summary statistics.
train_df.info()
train_df.describe()

# Bar chart of how many rows fall into each 'smoking' class.
class_counts = train_df['smoking'].value_counts()
fig, ax = plt.subplots(figsize=(6, 4))
class_counts.plot(kind='bar', ax=ax)
ax.set_title("Class Balance in Training Data")
ax.set_xlabel("Class")
ax.set_ylabel("Count")
plt.show()


# Pairwise correlations between all columns of the raw frame.
corr_matrix = train_df.corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=False, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


# Replace the raw frame with its outlier-free, skew-reduced version.
train_df = preprocess_data_drop_outliers(train_df)

# Re-inspect the frame now that rows were dropped and columns transformed.
train_df.info()
train_df.describe()

# Class balance after preprocessing.
class_counts = train_df['smoking'].value_counts()
fig, ax = plt.subplots(figsize=(6, 4))
class_counts.plot(kind='bar', ax=ax)
ax.set_title("Class Balance in Training Data")
ax.set_xlabel("Class")
ax.set_ylabel("Count")
plt.show()


# Correlation structure after preprocessing.
corr_matrix = train_df.corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=False, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


# X_train = train_df.drop(columns=['smoking'])
# y_train = train_df['smoking']

# X_test = test_df.drop(columns=['smoking'])
# y_test = test_df['smoking']
# print(X_train.shape, X_test.shape)


# Separate the feature columns from the 'smoking' target.
X = train_df.drop(columns=['smoking'])
y = train_df['smoking']

# Hold out 20% of the rows for evaluation. Stratifying on the target keeps the
# class ratio identical in both splits, so the reported metrics are not skewed
# by an unlucky draw; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(X_train.shape, X_test.shape)



# Learn per-feature mean/std from the training split only, then apply the same
# transformation to both splits so no test-set statistics leak into the scaler.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Scaling complete.")


# Logistic regression with scikit-learn's default hyperparameters.
log_reg = LogisticRegression()

# RBF-kernel SVM; probability=True additionally fits the probability
# calibration so predict_proba is available.
svm_model = SVC(kernel='rbf', probability=True)

# Multi-layer perceptron with three hidden layers (128, 128, 32 units).
# random_state fixes the weight initialisation; max_iter caps training epochs.
mlp = MLPClassifier(
    hidden_layer_sizes=(128, 128, 32),
    max_iter=1000,
    random_state=42,
)


# Fit every estimator on the scaled training split, in declaration order.
for estimator in (log_reg, svm_model, mlp):
    estimator.fit(X_train_scaled, y_train)

print("Training complete.")


# Display name -> fitted estimator; insertion order is the row order below.
models = {
    "Logistic Regression": log_reg,
    "SVM": svm_model,
    "Neural Network (MLP)": mlp
}

# Column label -> scoring function, in the column order of the result table.
metric_fns = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

results = {"Model": [], **{label: [] for label in metric_fns}}

# Predict once per model on the held-out split and score it with every metric.
for name, model in models.items():
    preds = model.predict(X_test_scaled)

    results["Model"].append(name)
    for label, score in metric_fns.items():
        results[label].append(score(y_test, preds))

results_df = pd.DataFrame(results)
results_df


# One bar chart per headline metric, comparing the models side by side.
for metric in ("Accuracy", "F1 Score"):
    plt.figure(figsize=(10,5))
    plt.bar(results_df['Model'], results_df[metric])
    plt.title(f"{metric} Comparison")
    plt.ylabel(metric)
    plt.show()


# Rank the models by held-out accuracy and take the top row.
ranked = results_df.sort_values("Accuracy", ascending=False)
best_model_name = ranked["Model"].iloc[0]
best_model = models[best_model_name]

print("Best Model:", best_model_name)

# Confusion matrix of the winning model on the held-out split
# (rows = actual class, columns = predicted class).
best_preds = best_model.predict(X_test_scaled)
cm = confusion_matrix(y_test, best_preds)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", ax=ax)
ax.set_title(f"Confusion Matrix - {best_model_name}")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


# Load the rows to score.
inference_df = pd.read_csv("/content/test_dataset.csv")

print("Inference input shape:", inference_df.shape)
inference_df.head()


# The scaler must receive exactly the columns it was fitted on, in the same
# order. Drop the target if the file carries it, then select the training
# feature columns; a genuinely missing feature still raises a KeyError.
inference_features = inference_df.drop(columns=['smoking'], errors='ignore')[X_train.columns]

# NOTE(review): the training rows went through preprocess_data_drop_outliers
# (log1p on the skewed columns) before the scaler was fitted, but no equivalent
# transform is applied here, so those features arrive on a different scale.
# The set of transformed columns needs to be persisted and re-applied.
inference_scaled = scaler.transform(inference_features)

inference_preds = best_model.predict(inference_scaled)

# One prediction per input row, keyed by the row's index in the inference file.
output_df = pd.DataFrame({
    "Input_Index": inference_df.index,
    "Predicted_Smoking": inference_preds
})

output_df
