In [1]:
# !pip install xgboost catboost

## **NOTE:** install `xgboost` python library into your ipynb notebook before running this. Else, you will face error.
*(You can uncomment the above py code block ONLY if you haven't installed it)*

In [2]:
# ──────── 1) Import Libraries ────────
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# ──────── 2) Load Data ────────
train_df = pd.read_csv("kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv")
test_df = pd.read_csv("kaggle/input/summer-analytics-mid-hackathon/hacktest.csv")

# ──────── 3) Drop Unnecessary Column ────────
train_df.drop(columns=["Unnamed: 0"], inplace=True)
test_df.drop(columns=["Unnamed: 0"], inplace=True)

# ──────── 4) Prepare Data ────────
X = train_df.drop(columns=["ID", "class"])
y = train_df["class"]

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

test_ids = test_df["ID"]
X_test = test_df.drop(columns=["ID"])

# ──────── 4.1) Identify NDVI Columns ────────
ndvi_cols = [col for col in X.columns if '_N' in col]

# ──────── 4.2) Feature Engineering Function ────────
def add_features(df):
    df['ndvi_mean'] = df[ndvi_cols].mean(axis=1)
    df['ndvi_std'] = df[ndvi_cols].std(axis=1)
    df['ndvi_min'] = df[ndvi_cols].min(axis=1)
    df['ndvi_max'] = df[ndvi_cols].max(axis=1)
    df['ndvi_range'] = df['ndvi_max'] - df['ndvi_min']
    df['ndvi_median'] = df[ndvi_cols].median(axis=1)
    df['ndvi_q25'] = df[ndvi_cols].quantile(0.25, axis=1)
    df['ndvi_q75'] = df[ndvi_cols].quantile(0.75, axis=1)
    df['ndvi_iqr'] = df['ndvi_q75'] - df['ndvi_q25']
    df['ndvi_skew'] = df[ndvi_cols].apply(lambda x: skew(x.dropna()), axis=1)
    df['ndvi_kurtosis'] = df[ndvi_cols].apply(lambda x: kurtosis(x.dropna()), axis=1)
    df['ndvi_neg_count'] = (df[ndvi_cols] < 0).sum(axis=1)
    df['ndvi_zero_count'] = (df[ndvi_cols] == 0).sum(axis=1)
    df['ndvi_diff'] = df[ndvi_cols].iloc[:, -1] - df[ndvi_cols].iloc[:, 0]

    df['ndvi_mean_x_range'] = df['ndvi_mean'] * df['ndvi_range']
    df['ndvi_std_div_iqr'] = df['ndvi_std'] / (df['ndvi_iqr'] + 1e-5)

    def compute_slope(row):
        y_vals = row[ndvi_cols].values
        if np.any(np.isnan(y_vals)):
            return 0
        X_vals = np.arange(len(ndvi_cols)).reshape(-1, 1)
        y_vals = y_vals.reshape(-1, 1)
        model = LinearRegression().fit(X_vals, y_vals)
        return model.coef_[0][0]

    df['ndvi_trend'] = df.apply(compute_slope, axis=1)
    return df

# ──────── 4.3) Apply Feature Engineering ────────
X = add_features(X)
X_test = add_features(X_test)

# ──────── 5) Handle Missing Values ────────
X.fillna(X.mean(), inplace=True)
X_test.fillna(X_test.mean(), inplace=True)

# ──────── 6) Train/Test Split ────────
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# ──────── 7) Ensure All Columns Are Numeric ────────
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_val = X_val.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

X_train.fillna(X_train.mean(), inplace=True)
X_val.fillna(X_val.mean(), inplace=True)
X_test.fillna(X_test.mean(), inplace=True)

# ──────── 8) Train XGBoost Model ────────
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=7,
    random_state=42,
    eval_metric='mlogloss'
)
model.fit(X_train.values, y_train)

# ──────── 9) Evaluate ────────
y_val_pred = model.predict(X_val.values)
val_acc = accuracy_score(y_val, y_val_pred)
print(f"📊 Validation Accuracy: {val_acc * 100:.2f}%")

# ──────── 10) Predict on Test Set ────────
y_test_pred = model.predict(X_test.values)
y_test_pred = label_encoder.inverse_transform(y_test_pred)

# ──────── 11) Save Submission ────────
submission = pd.DataFrame({
    "ID": test_ids,
    "class": y_test_pred
})
submission.to_csv("submission.csv", index=False)
print("✅ Submission saved at submission.csv")


📊 Validation Accuracy: 94.56%
✅ Submission saved at submission.csv
