In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import warnings
# Suppress every warning for the rest of the session. Note that this is a blanket
# filter: it also hides any deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Read the training CSV from the Kaggle input directory and report its dimensions
train_csv_path = "/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv"
df = pd.read_csv(train_csv_path)
print(f"Training data shape: {df.shape}")
df

In [None]:
# Count the null entries per column before any imputation is applied
missing_before = df.isnull().sum()
print(f"Missing values before cleaning:\n{missing_before}")

In [None]:
# Impute missing numeric values with the per-column median (median instead of mean).
numeric_columns = df.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
    if col != 'ID':  # Don't fill ID column
        # Assign the filled column back to the frame. Calling
        # `df[col].fillna(..., inplace=True)` operates on a temporary Series:
        # pandas 2.x warns about it and under Copy-on-Write it does not update
        # `df` at all, which would silently leave the NaNs in place.
        df[col] = df[col].fillna(df[col].median())

print(f"Missing values after cleaning:\n{df.isnull().sum()}")

In [None]:
# Basic feature engineering: every column except the identifier and the target
# is treated as an input series for the summary features below.
ndvi_columns = [name for name in df.columns if name not in ('ID', 'class')]

# Snapshot of the raw input columns, taken before any derived column is added
ndvi_values = df[ndvi_columns]

# Row-wise summary statistics
df['ndvi_mean'] = ndvi_values.mean(axis=1)
df['ndvi_std'] = ndvi_values.std(axis=1)
df['ndvi_max'] = ndvi_values.max(axis=1)
df['ndvi_min'] = ndvi_values.min(axis=1)
df['ndvi_range'] = df['ndvi_max'] - df['ndvi_min']

# Seasonal features (assuming columns are chronologically ordered):
# the column list is cut into four consecutive windows, the last one
# absorbing any remainder, and each window is averaged per row.
if len(ndvi_columns) >= 4:
    quarter_size = len(ndvi_columns) // 4
    quarter_windows = {
        'ndvi_q1': slice(0, quarter_size),
        'ndvi_q2': slice(quarter_size, 2 * quarter_size),
        'ndvi_q3': slice(2 * quarter_size, 3 * quarter_size),
        'ndvi_q4': slice(3 * quarter_size, None),
    }
    for feature_name, window in quarter_windows.items():
        df[feature_name] = ndvi_values.iloc[:, window].mean(axis=1)

print(f"Data shape after feature engineering: {df.shape}")

In [None]:
# Remove the identifier column so it is not used as a model feature
df = df.drop(columns=['ID'])

In [None]:
# Encode the target labels as integers; the fitted encoder is kept so that
# predictions can be mapped back to the original labels later.
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(df['class'])
df['class'] = encoded_target

print(f"Classes: {label_encoder.classes_}")
class_counts = df['class'].value_counts().sort_index()
print(f"Class distribution:\n{class_counts}")

In [None]:
# Separate the encoded target from the feature matrix
y = df['class']
X = df.drop('class', axis=1)

In [None]:
# Standardise the features: learn the scaling statistics, then apply them
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Train-test split
# Split the *unscaled* features and fit the scaler on the training portion only.
# Splitting the already-scaled matrix would let the hold-out rows contribute to
# the scaling statistics (data leakage) and make the validation score optimistic.
# The stratified split with the same random_state selects the same rows as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Re-fit `scaler` here so that every later `scaler.transform(...)` call uses
# statistics learned from the training rows alone.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [None]:
# Try multiple simple models and pick the one with the best mean 5-fold CV accuracy
models = {
    'Logistic Regression': LogisticRegression(
        # `multi_class` is deliberately not passed: the argument is deprecated
        # since scikit-learn 1.5 and scheduled for removal. With the lbfgs solver
        # a multinomial model is already the default for three or more classes.
        # NOTE(review): assumes the target has >= 3 classes - confirm.
        solver='lbfgs',
        max_iter=1000,  # Increased iterations
        random_state=42,
        C=1.0  # Regularization parameter
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        max_depth=10,  # Prevent overfitting
        min_samples_split=5,
        min_samples_leaf=2
    )
}

best_model = None
best_score = float('-inf')  # any real score, including 0.0, beats this
best_name = ""

print("\nModel Comparison (5-fold CV):")
for name, model in models.items():
    # Cross-validation on the training split only
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    mean_score = cv_scores.mean()
    std_score = cv_scores.std()

    print(f"{name}: {mean_score:.4f} (+/- {std_score*2:.4f})")

    # Strict '>' keeps the first model on ties; a NaN score never wins
    if mean_score > best_score:
        best_score = mean_score
        best_model = model
        best_name = name

# Fail here with a clear message instead of an AttributeError on None later
if best_model is None:
    raise RuntimeError("No model produced a valid cross-validation score")

print(f"\nBest model: {best_name}")

In [None]:
# Fit the selected model on the training split
best_model.fit(X_train, y_train)

# Score it on the held-out split
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nValidation Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1, labelled with the original class names
print("\nClassification Report:")
class_indices = list(range(len(label_encoder.classes_)))
report = classification_report(
    y_test,
    y_pred,
    labels=class_indices,
    target_names=label_encoder.classes_,
)
print(report)

In [None]:
# Read the test CSV from the Kaggle input directory and report its dimensions
test_csv_path = "/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv"
test_data = pd.read_csv(test_csv_path)
print(f"\nTest data shape: {test_data.shape}")

In [None]:
# Keep the IDs for the submission file and remove the column from the
# test frame in one step (pop returns the column and deletes it in place)
test_ids = test_data.pop('ID')

In [None]:
# Build the test feature matrix from the same input columns, in the same order,
# that were used for training. Taking them from the training list (rather than
# from whatever order the test file happens to have) keeps the quarter windows
# and the scaler's expected feature layout consistent with training.
test_ndvi_columns = list(ndvi_columns)
missing_columns = [col for col in test_ndvi_columns if col not in test_data.columns]
if missing_columns:
    raise KeyError(f"Test data is missing columns used in training: {missing_columns}")

# Handle missing values in test data with training-set medians.
# The filled frame is assigned to a new variable: `test_data[col].fillna(...,
# inplace=True)` acts on a temporary Series and is a no-op under pandas
# Copy-on-Write. Working on a new frame also leaves `test_data` untouched, so
# re-running this cell does not fold engineered columns into the statistics.
train_medians = df[test_ndvi_columns].median()
test_features = test_data[test_ndvi_columns].fillna(train_medians)

# Apply the same feature engineering as for the training data
test_features['ndvi_mean'] = test_features[test_ndvi_columns].mean(axis=1)
test_features['ndvi_std'] = test_features[test_ndvi_columns].std(axis=1)
test_features['ndvi_max'] = test_features[test_ndvi_columns].max(axis=1)
test_features['ndvi_min'] = test_features[test_ndvi_columns].min(axis=1)
test_features['ndvi_range'] = test_features['ndvi_max'] - test_features['ndvi_min']

# Seasonal features for test data (same four consecutive windows as training)
if len(test_ndvi_columns) >= 4:
    quarter_size = len(test_ndvi_columns) // 4
    test_features['ndvi_q1'] = test_features[test_ndvi_columns[:quarter_size]].mean(axis=1)
    test_features['ndvi_q2'] = test_features[test_ndvi_columns[quarter_size:2*quarter_size]].mean(axis=1)
    test_features['ndvi_q3'] = test_features[test_ndvi_columns[2*quarter_size:3*quarter_size]].mean(axis=1)
    test_features['ndvi_q4'] = test_features[test_ndvi_columns[3*quarter_size:]].mean(axis=1)

# Put the columns in exactly the order of the training feature matrix
test_features = test_features[X.columns]

# Scale test data using the same scaler
test_data_scaled = scaler.transform(test_features)

# Make predictions
y_test_pred = best_model.predict(test_data_scaled)

# Convert predictions back to original class labels
y_decoded = label_encoder.inverse_transform(y_test_pred)

print(f"\nPrediction distribution:")
unique, counts = np.unique(y_decoded, return_counts=True)
for class_name, count in zip(unique, counts):
    print(f"{class_name}: {count}")

# Create submission file
result = pd.DataFrame({
    'ID': test_ids,
    'class': y_decoded
})

print(f"\nSubmission shape: {result.shape}")
print(result)

# Save submission
result.to_csv("submission.csv", index=False)
print("\nSubmission file saved as 'submission.csv'")