In [3]:
# â”€â”€â”€â”€â”€â”€â”€â”€ 1) Import Libraries â”€â”€â”€â”€â”€â”€â”€â”€
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# â”€â”€â”€â”€â”€â”€â”€â”€ 2) Load Data â”€â”€â”€â”€â”€â”€â”€â”€
train_df = pd.read_csv("kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv")
test_df = pd.read_csv("kaggle/input/summer-analytics-mid-hackathon/hacktest.csv")

# â”€â”€â”€â”€â”€â”€â”€â”€ 3) Drop Unnecessary Column â”€â”€â”€â”€â”€â”€â”€â”€
train_df.drop(columns=["Unnamed: 0"], inplace=True)
test_df.drop(columns=["Unnamed: 0"], inplace=True)

# â”€â”€â”€â”€â”€â”€â”€â”€ 4) Prepare Data â”€â”€â”€â”€â”€â”€â”€â”€
X = train_df.drop(columns=["ID", "class"])
y = train_df["class"]

test_ids = test_df["ID"]
X_test = test_df.drop(columns=["ID"])

# â”€â”€â”€â”€â”€â”€â”€â”€ 4.1) Identify NDVI Columns â”€â”€â”€â”€â”€â”€â”€â”€
ndvi_cols = [col for col in X.columns if '_N' in col]

# â”€â”€â”€â”€â”€â”€â”€â”€ 4.2) Feature Engineering Function â”€â”€â”€â”€â”€â”€â”€â”€
def add_features(df):
    df['ndvi_mean'] = df[ndvi_cols].mean(axis=1)
    df['ndvi_std'] = df[ndvi_cols].std(axis=1)
    df['ndvi_min'] = df[ndvi_cols].min(axis=1)
    df['ndvi_max'] = df[ndvi_cols].max(axis=1)
    df['ndvi_range'] = df['ndvi_max'] - df['ndvi_min']
    df['ndvi_median'] = df[ndvi_cols].median(axis=1)
    df['ndvi_q25'] = df[ndvi_cols].quantile(0.25, axis=1)
    df['ndvi_q75'] = df[ndvi_cols].quantile(0.75, axis=1)
    df['ndvi_iqr'] = df['ndvi_q75'] - df['ndvi_q25']
    df['ndvi_skew'] = df[ndvi_cols].apply(lambda x: skew(x.dropna()), axis=1)
    df['ndvi_kurtosis'] = df[ndvi_cols].apply(lambda x: kurtosis(x.dropna()), axis=1)

    # NDVI trend (slope)
    def compute_slope(row):
        y = row[ndvi_cols].values.reshape(-1, 1)
        if np.any(np.isnan(y)): return 0
        X_vals = np.arange(len(ndvi_cols)).reshape(-1, 1)
        model = LinearRegression().fit(X_vals, y)
        return model.coef_[0][0]

    df['ndvi_trend'] = df.apply(compute_slope, axis=1)
    return df

# â”€â”€â”€â”€â”€â”€â”€â”€ 4.3) Apply Feature Engineering â”€â”€â”€â”€â”€â”€â”€â”€
X = add_features(X)
X_test = add_features(X_test)

# â”€â”€â”€â”€â”€â”€â”€â”€ 5) Handle Missing Values â”€â”€â”€â”€â”€â”€â”€â”€
X.fillna(X.mean(), inplace=True)
X_test.fillna(X_test.mean(), inplace=True)

# â”€â”€â”€â”€â”€â”€â”€â”€ 6) Train/Test Split â”€â”€â”€â”€â”€â”€â”€â”€
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# â”€â”€â”€â”€â”€â”€â”€â”€ 7) Train Model â”€â”€â”€â”€â”€â”€â”€â”€
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# â”€â”€â”€â”€â”€â”€â”€â”€ 8) Evaluate â”€â”€â”€â”€â”€â”€â”€â”€
y_val_pred = model.predict(X_val)
val_acc = accuracy_score(y_val, y_val_pred)
print(f"ðŸ“Š Validation Accuracy: {val_acc * 100:.2f}%")

# â”€â”€â”€â”€â”€â”€â”€â”€ 9) Predict on Test Set â”€â”€â”€â”€â”€â”€â”€â”€
y_test_pred = model.predict(X_test)

# â”€â”€â”€â”€â”€â”€â”€â”€ 10) Save Submission â”€â”€â”€â”€â”€â”€â”€â”€
submission = pd.DataFrame({
    "ID": test_ids,
    "class": y_test_pred
})
submission.to_csv("submission_old.csv", index=False)
print("âœ… Submission saved at submission_old.csv")


ðŸ“Š Validation Accuracy: 92.94%
âœ… Submission saved at submission_old.csv
