# NDVI-Stats: Fast & Interpretable Land Cover Classification


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Import Libraries

In [None]:
# Colab-only step: opens the browser upload dialog and keeps whatever
# files.upload() returns in `uploaded`.
# NOTE(review): presumably hacktrain.csv / hacktest.csv are supplied here,
# since they are read from the working directory later - confirm.
from google.colab import files
uploaded = files.upload()



## 2. Load Dataset

In [None]:
# Read the training and test tables from CSV files in the working directory.
train_df, test_df = pd.read_csv('hacktrain.csv'), pd.read_csv('hacktest.csv')

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Preview the first 10 rows of the training table (shown as the cell output
# when run in a notebook).
train_df.head(10)

## 4. Feature Engineering

In [None]:
# Report the (rows, columns) shape of each split.
for shape_label, split_frame in (("Train shape:", train_df), ("Test shape:", test_df)):
    print(shape_label, split_frame.shape)

## 5. Model Training (Logistic Regression)

In [None]:
# Preview the first 10 rows of the test table (shown as the cell output when
# run in a notebook).
test_df.head(10)

## 6. Results & Evaluation

In [None]:
# Count NaNs per training column and list only the columns that have any.
missing_counts = train_df.isna().sum()
print("Missing values in training data:")
print(missing_counts.loc[missing_counts.gt(0)])

## 7. Conclusion

In [None]:
# List the distinct values of the target column 'class' in the training set.
print("Unique classes in training set:", train_df['class'].unique())

In [None]:
# Bar chart of how many training rows fall into each class; useful for
# spotting class imbalance before modelling.
train_df['class'].value_counts().plot(kind='bar', title="Class Distribution")

In [None]:
# NDVI time-series columns are recognised by the '_N' marker in their name;
# the list keeps the column order of train_df.
ndvi_cols = [name for name in train_df.columns if '_N' in name]
print("Total NDVI time-series columns:", len(ndvi_cols))


In [None]:
# Impute missing NDVI values with per-column medians learned from the
# training set only. The same training medians are applied to the test set so
# that both splits go through an identical transformation and no test-set
# statistics influence preprocessing.
# (The median can be swapped for the mean or an interpolation scheme.)
ndvi_medians = train_df[ndvi_cols].median()
train_df[ndvi_cols] = train_df[ndvi_cols].fillna(ndvi_medians)
test_df[ndvi_cols] = test_df[ndvi_cols].fillna(ndvi_medians)


In [None]:
# Feature functions
def extract_features(df, cols=None):
    """Add per-row NDVI summary statistics to ``df`` and return it.

    The frame is modified in place (new columns are written onto ``df``) and
    the same object is returned so the call can be used in an assignment.

    Args:
        df: DataFrame containing the NDVI time-series columns.
        cols: Ordered NDVI column names to summarise. When omitted, the
            module-level ``ndvi_cols`` list is used.

    Returns:
        ``df`` with these columns added: ``ndvi_mean``, ``ndvi_std``,
        ``ndvi_min``, ``ndvi_max``, ``ndvi_range`` (max - min) and
        ``ndvi_slope`` (last column minus first column).
    """
    cols = list(ndvi_cols if cols is None else cols)
    # Snapshot of the raw series; the derived columns written below are never
    # part of `cols`, so one selection is enough.
    series = df[cols]

    df['ndvi_mean'] = series.mean(axis=1)
    df['ndvi_std'] = series.std(axis=1)
    df['ndvi_min'] = series.min(axis=1)
    df['ndvi_max'] = series.max(axis=1)
    df['ndvi_range'] = df['ndvi_max'] - df['ndvi_min']
    # Net change over the series, computed column-wise instead of with a
    # Python-level lambda per row.
    df['ndvi_slope'] = series.iloc[:, -1] - series.iloc[:, 0]
    return df

# Enrich both splits with the per-row summary statistics.
train_df, test_df = extract_features(train_df), extract_features(test_df)

# Columns the model is trained on.
feature_cols = [
    'ndvi_mean',
    'ndvi_std',
    'ndvi_min',
    'ndvi_max',
    'ndvi_range',
    'ndvi_slope',
]


In [None]:
# Visualise the average NDVI curve of every land-cover class.
import matplotlib.pyplot as plt

# Re-derive the NDVI time-series columns from the current training frame.
ndvi_cols = [name for name in train_df.columns if '_N' in name]

# Rows: NDVI columns (time steps); columns: classes; values: class mean.
ndvi_by_class = train_df.groupby('class')[ndvi_cols].mean().transpose()

# One line per class.
plt.figure(figsize=(14, 8))
for land_type, mean_curve in ndvi_by_class.items():
    plt.plot(mean_curve.index, mean_curve, label=land_type)

plt.title("NDVI Trends Over Time by Land Cover Type")
plt.xlabel("Date (NDVI Columns)")
plt.ylabel("Average NDVI Value")
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the class names as integer labels. The fitted encoder `le` is kept
# so predictions can be mapped back to class names later.
le = LabelEncoder().fit(train_df['class'])
train_df['label'] = le.transform(train_df['class'])


In [None]:
# Sanity check: show every column now present on the training frame,
# including the engineered features and the encoded 'label'.
print(train_df.columns)


In [None]:
from scipy.stats import skew, kurtosis

def advanced_features(df, cols=None):
    """Add distribution-shape, period-average and threshold NDVI features.

    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame containing the NDVI time-series columns.
        cols: Ordered NDVI column names to use. When omitted, the
            module-level ``ndvi_cols`` list is used.

    Returns:
        ``df`` with these columns added: ``ndvi_skew``, ``ndvi_kurtosis``
        (SciPy defaults, NaNs omitted per row), ``ndvi_early_avg``,
        ``ndvi_mid_avg``, ``ndvi_late_avg`` (means over consecutive thirds of
        ``cols``; the last third absorbs any remainder) and
        ``ndvi_above_0.4`` (number of values strictly greater than 0.4).
    """
    cols = list(ndvi_cols if cols is None else cols)
    series = df[cols]
    values = series.to_numpy(dtype=float)

    # One vectorised SciPy call per statistic (axis=1 -> one result per row)
    # instead of a Python-level SciPy call for every row.
    df['ndvi_skew'] = skew(values, axis=1, nan_policy='omit')
    df['ndvi_kurtosis'] = kurtosis(values, axis=1, nan_policy='omit')

    # Split the series into 3 periods (early, mid, late). With fewer than
    # three columns the early/mid slices are empty and their means are NaN.
    thirds = len(cols) // 3
    df['ndvi_early_avg'] = df[cols[:thirds]].mean(axis=1)
    df['ndvi_mid_avg'] = df[cols[thirds:2 * thirds]].mean(axis=1)
    df['ndvi_late_avg'] = df[cols[2 * thirds:]].mean(axis=1)

    # Count of observations above the 0.4 threshold (e.g. healthy vegetation).
    df['ndvi_above_0.4'] = series.gt(0.4).sum(axis=1)

    return df

# Apply on both train and test
train_df = advanced_features(train_df)
test_df = advanced_features(test_df)

# Extend the feature list in place, skipping names that are already present,
# so re-running this cell (common in a notebook) does not duplicate features.
advanced_feature_cols = [
    'ndvi_skew',
    'ndvi_kurtosis',
    'ndvi_early_avg',
    'ndvi_mid_avg',
    'ndvi_late_avg',
    'ndvi_above_0.4',
]
feature_cols += [name for name in advanced_feature_cols if name not in feature_cols]


In [None]:
# Feature matrix (engineered NDVI features) and encoded target.
X, y = train_df[feature_cols], train_df['label']

# The train/validation split and the model fit are done separately.


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Feature matrix and encoded target.
X = train_df[feature_cols]
y = train_df['label']

# Hold out 20% for local validation; stratify so every class keeps its
# proportion in both parts, and fix the seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Pipeline: standardize features, then logistic regression.
# `multi_class` is intentionally not passed: it is deprecated in
# scikit-learn 1.5+, and with the lbfgs solver a multiclass target is already
# fitted as a multinomial (softmax) model by default.
# NOTE(review): assumes the target has more than two classes - confirm.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='lbfgs', max_iter=1000),
)
model.fit(X_train, y_train)

# Validation accuracy
y_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))


In [None]:
# Score the test split with the fitted pipeline.
X_test = test_df[feature_cols]
test_preds = model.predict(X_test)

# Decode the integer predictions back into the original class names.
test_preds_labels = le.inverse_transform(test_preds)

# Submission table: the test IDs alongside their predicted class.
submission = test_df[['ID']].copy()
submission['class'] = test_preds_labels

In [None]:
# Write the submission table to submission.csv in the working directory;
# index=False keeps the DataFrame index out of the file.
submission.to_csv("submission.csv", index=False)

In [None]:
# Colab-only step: hand the generated submission.csv to the browser for
# download.
from google.colab import files
files.download('submission.csv')
