In [None]:
# ============================================
# FAST + CLEAN VERSION (Use in Jupyter Notebook)
# ============================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

# ------------------------------
# 1. Load Dataset
# ------------------------------
# Read the CSV into a DataFrame and report its (rows, columns) shape.
csv_path = "/content/covtype.csv"
df = pd.read_csv(csv_path)
print("Initial Shape:", df.shape)

# ----------------------------------------------------
# 2. OUTLIER REMOVAL (IQR method for continuous cols)
# ----------------------------------------------------
def remove_outliers_iqr(data, cols):
    """Drop rows that fall outside the IQR fences of any listed column.

    For each column in ``cols`` the fences are ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR``. Every fence is computed on the unfiltered input, so
    the result does not depend on the order of ``cols``. Values exactly on a
    fence are kept; a row with NaN in a listed column is dropped because NaN
    fails both comparisons.

    Args:
        data: DataFrame to filter. It is not modified.
        cols: Iterable of column labels to check.

    Returns:
        A new DataFrame holding the rows that lie inside the fences of every
        listed column. Original index labels are preserved.

    Raises:
        KeyError: If a label in ``cols`` is not a column of ``data``.
    """
    keep = None
    for col in cols:
        values = data[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        in_range = (values >= lower) & (values <= upper)
        # Combine per-column masks instead of filtering step by step, so
        # later columns are judged against the same data as earlier ones.
        keep = in_range if keep is None else keep & in_range

    if keep is None:
        # No columns requested: nothing to filter, still hand back a copy.
        return data.copy()

    # Explicit copy so callers can assign columns on the result without it
    # being treated as a view of ``data``.
    return data[keep].copy()

# Columns that are outlier-filtered and whose skewness is reported.
continuous_cols = ["Elevation", "Aspect", "Slope"]

# Replace the frame with its filtered version and show the remaining shape.
df = remove_outliers_iqr(df, continuous_cols)
print("After Outlier Removal:", df.shape)

# ------------------------------
# 3. DATASET STATISTICS
# ------------------------------
print("\n==== BASIC STATISTICS ====")
summary_stats = df.describe()
print(summary_stats)

# Skewness of the selected columns before any transform is applied.
print("\n==== SKEWNESS BEFORE ====")
skew_before = df[continuous_cols].skew()
print(skew_before)

# ----------------------------------------------------
# 4. SKEWNESS REDUCTION (Log1p transform)
# ----------------------------------------------------
# log1p compresses the upper tail, so it helps right-skewed columns; on
# left-skewed data it typically pushes the skewness further from zero, and
# for values <= -1 it produces -inf/NaN. Keep the transform only where it is
# defined and actually reduces the absolute skewness.
log_transformed_cols = []
for col in continuous_cols:
    values = df[col]
    if values.min() <= -1:
        continue  # log1p is not finite for this column; leave it untouched
    candidate = np.log1p(values)
    if abs(candidate.skew()) < abs(values.skew()):
        df[col] = candidate    # apply and save
        log_transformed_cols.append(col)

print("Log1p applied to:", log_transformed_cols)

print("\n==== SKEWNESS AFTER ====")
print(df[continuous_cols].skew())

# ------------------------------
# 5. Correlation Matrix
# ------------------------------
# Heatmap of the pairwise correlations between all columns (no cell labels).
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, cmap="coolwarm", annot=False, ax=heatmap_ax)
heatmap_ax.set_title("Correlation Heatmap")
plt.show()

# ------------------------------
# 6. Target distribution
# ------------------------------
# Count of rows per Cover_Type value.
cover_type = df["Cover_Type"]
count_ax = sns.countplot(x=cover_type)
count_ax.set_title("Cover Type Distribution")
plt.show()

# ------------------------------
# 7. Train-Test Split (using cleaned + log-transformed data)
# ------------------------------
# Separate the feature columns from the target label.
X = df.drop("Cover_Type", axis=1)
y = df["Cover_Type"]

# Hold out 20% of the rows for testing. Stratifying on y keeps each class's
# share the same in both partitions, so the reported accuracies are not
# distorted by a class being over- or under-represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ------------------------------
# 8. Scaling
# ------------------------------
# Fit the scaler on the training split only and reuse its statistics for the
# test split, so no test-set information enters the preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ------------------------------
# 9. MODELS (FAST VERSIONS)
# ------------------------------

# Each model is fitted on the scaled training split and scored by plain
# accuracy on the held-out test split.

# Logistic Regression
lr = LogisticRegression(max_iter=500, solver='lbfgs')
lr.fit(X_train, y_train)
acc_lr = accuracy_score(y_test, lr.predict(X_test))

# Linear SVM (fastest)
# dual=False solves the primal problem, which scikit-learn recommends when
# n_samples > n_features (presumably the case for this dataset; confirm).
# The primal solver is also deterministic, so results are reproducible
# without a random_state.
svm = LinearSVC(dual=False)
svm.fit(X_train, y_train)
acc_svm = accuracy_score(y_test, svm.predict(X_test))

# MLP Neural Network
# Two small hidden layers and a capped iteration count keep training short;
# random_state fixes the weight initialisation for repeatable runs.
mlp = MLPClassifier(hidden_layer_sizes=(32, 16), max_iter=100, random_state=42)
mlp.fit(X_train, y_train)
acc_mlp = accuracy_score(y_test, mlp.predict(X_test))

# ------------------------------
# 10. Compare Results
# ------------------------------
# Gather the test accuracies under their display names (insertion order is
# the order the bars are drawn in).
scores = {}
scores["Logistic Regression"] = acc_lr
scores["Linear SVM"] = acc_svm
scores["MLP Neural Network"] = acc_mlp

# One bar per model, height = accuracy.
model_names = list(scores)
accuracies = [scores[name] for name in model_names]

comparison_fig, comparison_ax = plt.subplots(figsize=(8, 4))
comparison_ax.bar(model_names, accuracies)
comparison_ax.set_title("Model Accuracy Comparison (AFTER CLEANING & DESKEW)")
comparison_ax.set_ylabel("Accuracy")
plt.show()

print("\n=== FINAL ACCURACIES ===")
print(scores)