In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the read-only input directory,
# then print the paths one per line in walk order.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import warnings
warnings.filterwarnings('ignore')
# Suppress every warning for the rest of the session so the cell output stays clean.
# Only appropriate when the warnings are not relevant to the current run.

/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv
/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv


In [2]:
#Importing Necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from scipy.signal import savgol_filter
import warnings
# Suppress warnings (the first cell installs the same 'ignore' filter).
warnings.filterwarnings("ignore")
# Reaching this line means every import above succeeded.
print("Setup Complete")

Setup Complete


In [3]:
# Read the training split; discard the leftover 'Unnamed: 0' column when it is present.
hack_train_df = pd.read_csv(
    "/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv"
).drop(columns=['Unnamed: 0'], errors='ignore')

# Every column whose name ends in "_N" is collected here and used as the NDVI column list downstream.
ndvi_columns = [name for name in hack_train_df.columns if name.endswith('_N')]
hack_train_df.shape

(8000, 29)

In [4]:
# SMOOTHING FUNCTION: reduces noise and variability in the data so that the
# underlying trends and patterns are easier to see.
def smooth(row, window_length=5, polyorder=2):
    """Smooth one row of values with a Savitzky-Golay filter.

    Parameters
    ----------
    row : pandas.Series
        The values to smooth, in their existing order.
    window_length : int, default 5
        Size of the filter window. It is also the minimum number of
        non-missing values the row must contain before it is filtered.
    polyorder : int, default 2
        Order of the polynomial fitted inside each window.

    Returns
    -------
    numpy.ndarray or pandas.Series
        The filtered values (as returned by ``savgol_filter``) when the row
        has at least ``window_length`` non-missing entries; otherwise the
        input row itself, unchanged.
    """
    # Rows with too few valid points are passed through untouched rather than filtered.
    if row.count() >= window_length:
        return savgol_filter(row, window_length=window_length, polyorder=polyorder)
    return row
def preprocess_train(df):
    """Return a cleaned copy of ``df`` with its NDVI columns repaired and smoothed.

    Works on the columns listed in the notebook-level ``ndvi_columns``:
    values are coerced to numeric (unparseable entries become NaN), gaps are
    linearly interpolated along each row in both directions, anything still
    missing is filled with the column mean, and finally ``smooth`` is applied
    to every row. The input frame is not modified.
    """
    cleaned = df.copy()

    ndvi = cleaned[ndvi_columns].apply(pd.to_numeric, errors='coerce')
    ndvi = ndvi.interpolate(method='linear', axis=1, limit_direction='both')

    # Column means are taken after interpolation, so they include the interpolated values.
    column_means = ndvi.mean()
    ndvi = ndvi.fillna(column_means)

    # 'broadcast' keeps the original index and columns for the smoothed values.
    cleaned[ndvi_columns] = ndvi.apply(smooth, axis=1, result_type='broadcast')
    return cleaned
# Cleaned training frame; hack_train_df itself is left untouched because preprocess_train works on a copy.
hack_train_clean = preprocess_train(hack_train_df)

In [5]:
#  FEATURE ENGINEERING
def extract_features(df, columns=None):
    """Build per-row summary statistics from the NDVI columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the NDVI columns to summarise.
    columns : sequence of str, optional
        Columns to summarise. When omitted, the notebook-level
        ``ndvi_columns`` list is used, which is what the original
        single-argument call did.

    Returns
    -------
    pandas.DataFrame
        One row per input row (same index), with columns in this order:
        mean, std, max, min, range, median, skew, kurt, q75, q25, iqr,
        argmax, first_half_mean, second_half_mean, growth, peak_to_mean,
        norm_amplitude.
    """
    selected = ndvi_columns if columns is None else list(columns)
    ndvi = df[selected]
    eps = 1e-6  # added to the ratio denominators to guard against an exactly-zero divisor

    features = pd.DataFrame(index=ndvi.index)

    # Row-wise distribution statistics
    features["mean"] = ndvi.mean(axis=1)
    features["std"] = ndvi.std(axis=1)
    features["max"] = ndvi.max(axis=1)
    features["min"] = ndvi.min(axis=1)
    features["range"] = features["max"] - features["min"]
    features["median"] = ndvi.median(axis=1)
    features["skew"] = ndvi.skew(axis=1)
    features["kurt"] = ndvi.kurtosis(axis=1)
    features["q75"] = ndvi.quantile(0.75, axis=1)
    features["q25"] = ndvi.quantile(0.25, axis=1)
    features["iqr"] = features["q75"] - features["q25"]
    # Position (0-based, within the selected columns) of each row's maximum
    features["argmax"] = ndvi.to_numpy().argmax(axis=1)

    # Compare the first half of the columns with the second half
    half = ndvi.shape[1] // 2
    features["first_half_mean"] = ndvi.iloc[:, :half].mean(axis=1)
    features["second_half_mean"] = ndvi.iloc[:, half:].mean(axis=1)
    features["growth"] = features["second_half_mean"] - features["first_half_mean"]

    # Additional derived features
    features["peak_to_mean"] = features["max"] / (features["mean"] + eps)
    features["norm_amplitude"] = (features["max"] - features["min"]) / (
        features["max"] + features["min"] + eps
    )

    return features

In [6]:
# Prepare train data: one engineered feature row per sample, and the target read from the 'class' column as strings
X_train = extract_features(hack_train_clean)
y_train = hack_train_clean['class'].astype(str)

# Encode labels as integer codes; the fitted encoder is reused later to decode the test predictions
label_encoder = LabelEncoder()
y_train_encode = label_encoder.fit_transform(y_train)

# Feature expansion and scaling
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif

# Degree-2 polynomial expansion of the engineered features, without the constant bias column
polynomial_feature = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = polynomial_feature.fit_transform(X_train)

# Feature selection
# NOTE(review): k='all' should keep every feature, so this step looks like a pass-through — confirm that is intended
selector = SelectKBest(score_func=f_classif, k='all')
X_train_selected = selector.fit_transform(X_train_poly, y_train_encode)

# Standardise the features; the scaler is fitted here on the training data and only applied (not re-fitted) to the test set
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)

In [7]:
#  MODEL TRAINING
# Logistic regression (multinomial, lbfgs solver) fitted on the scaled training features.
# NOTE(review): the multi_class argument is reported as deprecated in recent scikit-learn releases — confirm against the installed version before upgrading.
LR_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
LR_model.fit(X_train_scaled, y_train_encode)
# Accuracy below is measured on the same rows the model was fitted on, so it is not an estimate of generalisation.
train_predicts = LR_model.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train_encode, train_predicts)

print(f" Training Accuracy: {train_accuracy * 100:.2f}%")
# NOTE(review): this message is printed after model training, not after data loading — the wording may be a leftover.
print(" Data loaded")

 Training Accuracy: 86.30%
 Data loaded


In [8]:
#  LOAD & PREDICT ON TEST DATA
# Read the test split; discard the leftover 'Unnamed: 0' column when it is present.
hack_test_df = pd.read_csv(
    "/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv"
).drop(columns=['Unnamed: 0'], errors='ignore')

In [9]:
# Preprocessing test dataset
def preprocess_test(df):
    """Return a cleaned copy of the test frame.

    The test split goes through exactly the same cleaning steps as the
    training split (numeric coercion, row-wise interpolation, column-mean
    fill, row-wise smoothing), so this delegates to ``preprocess_train``
    instead of repeating its body. Column means used for the fill are
    computed from the frame passed in, as before. The input is not modified.
    """
    return preprocess_train(df)
# Clean the test frame, then push it through the transformers fitted on the training data
# (transform only — nothing is re-fitted on the test set).
hack_test_clean = preprocess_test(hack_test_df)
X_test = extract_features(hack_test_clean)
X_test_poly = polynomial_feature.transform(X_test)
X_test_selected = selector.transform(X_test_poly)
X_test_scaled = scaler.transform(X_test_selected)
# Predict integer class codes, then map them back to the original label strings
test_predicts = LR_model.predict(X_test_scaled)
test_labels = label_encoder.inverse_transform(test_predicts)

In [10]:
#  EVALUATE ON TEST DATA (runs only when the test file carries a 'class' column)
test_has_labels = 'class' in hack_test_df.columns
if test_has_labels:
    y_test = hack_test_df['class'].astype(str)
    y_test_encode = label_encoder.transform(y_test)
    test_accuracy = accuracy_score(y_test_encode, test_predicts)
    print(f" Test Accuracy: {test_accuracy * 100:.2f}%")

    # Preview up to the first ten predictions in the "ID,class" layout of the submission file
    print(" Sample Prediction Output (ID,class):")
    for sample_id, sample_label in zip(hack_test_df['ID'].iloc[:10], test_labels):
        print(f"{sample_id},{sample_label}")
#  EXPORT SUBMISSION
# Two-column frame: the test IDs alongside the decoded predicted label for each row
submission = hack_test_df[['ID']].copy()
submission['class'] = test_labels
submission.to_csv("submission.csv", index=False)
print(submission.head())

   ID   class
0   1  forest
1   2  forest
2   3  forest
3   4  forest
4   5  forest
