In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv
/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report

In [3]:
# Load data
train_df = pd.read_csv("/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv").drop(columns=["Unnamed: 0"])
test_df = pd.read_csv("/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv").drop(columns=["Unnamed: 0"])

In [4]:
#train_df.head()
#test_df.head()

In [5]:
# Identify NDVI columns
ndvi_columns = [col for col in train_df.columns if "_N" in col]

In [6]:
# Split features and target
X_train_raw = train_df[ndvi_columns]
y_train_raw = train_df["class"]
X_test_raw = test_df[ndvi_columns]
test_ids = test_df["ID"]

In [7]:
# Checking for NULL values 
train_df.isnull().sum()

ID               0
class            0
20150720_N     560
20150602_N    1200
20150517_N     800
20150501_N     960
20150415_N     480
20150330_N    1120
20150314_N     720
20150226_N    1360
20150210_N     640
20150125_N    1040
20150109_N     880
20141117_N    1280
20141101_N     400
20141016_N    1440
20140930_N     800
20140813_N     560
20140626_N    1600
20140610_N     480
20140525_N     720
20140509_N     880
20140423_N    1760
20140407_N     640
20140322_N    1120
20140218_N    1440
20140202_N     560
20140117_N    1200
20140101_N     400
dtype: int64

In [8]:
# Handle missing values using mean imputation
imputer = SimpleImputer(strategy="mean")
X_train_imputed = imputer.fit_transform(X_train_raw)
X_test_imputed = imputer.transform(X_test_raw)

In [9]:
# Scale the NDVI values
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

In [10]:
# Feature Engineering Function
def extract_features(data):
    return pd.DataFrame({
        'mean': data.mean(axis=1),
        'std': data.std(axis=1),
        'min': data.min(axis=1),
        'max': data.max(axis=1),
        'range': data.max(axis=1) - data.min(axis=1),
        'missing_frac': np.isnan(data).mean(axis=1),
        'ndvi_sum': data.sum(axis=1),
        'median': np.median(data, axis=1),
    })

In [11]:
# Extract features
X_train_features = extract_features(X_train_scaled)
X_test_features = extract_features(X_test_scaled)

In [12]:
# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_raw)

In [13]:
from sklearn.model_selection import train_test_split

# Split training data into training and validation sets
X_subtrain, X_val, y_subtrain, y_val = train_test_split(
    X_train_features, y_train, test_size=0.2, random_state=42, stratify=y_train)


In [14]:
# Train the model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)
model.fit(X_subtrain, y_subtrain)

In [15]:
# Evaluate with validation set
y_val_pred = model.predict(X_val)
print("📊 Classification Report on Validation Set:")
print(classification_report(y_val, y_val_pred, target_names=label_encoder.classes_))

📊 Classification Report on Validation Set:
              precision    recall  f1-score   support

        farm       0.42      0.12      0.19       168
      forest       0.87      0.99      0.92      1232
       grass       0.57      0.10      0.17        39
  impervious       0.76      0.78      0.77       134
     orchard       0.00      0.00      0.00         6
       water       0.89      0.38      0.53        21

    accuracy                           0.84      1600
   macro avg       0.58      0.39      0.43      1600
weighted avg       0.80      0.84      0.81      1600



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Cross-validation score
cv_score = cross_val_score(model, X_train_features, y_train, cv=5, scoring='accuracy').mean()
print("✅ Cross-Validation Accuracy:", round(cv_score * 100, 2), "%")

✅ Cross-Validation Accuracy: 83.58 %


In [17]:
# Final model training on full data
model.fit(X_train_features, y_train)

In [18]:
# Predict on test set
y_test_preds = label_encoder.inverse_transform(model.predict(X_test_features))

In [19]:
# Prepare submission
submission_df = pd.DataFrame({
    "ID": test_ids,
    "class": y_test_preds
})

In [20]:
submission_df.to_csv("ndvi_logreg_submission.csv", index=False)
print("✅ Submission file saved as: ndvi_logreg_submission.csv")

✅ Submission file saved as: ndvi_logreg_submission.csv


In [21]:
submission_df.head()

Unnamed: 0,ID,class
0,1,forest
1,2,forest
2,3,forest
3,4,forest
4,5,forest
