<a href="https://colab.research.google.com/github/deeptanshurai/deeptanshu_iitg_analytics/blob/main/iitg_analytics_hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from scipy.signal import savgol_filter

try:
    # ------------------------------------------------------------------
    # Load the training and test tables.
    # ------------------------------------------------------------------
    url_1 = 'https://raw.githubusercontent.com/deeptanshurai/deeptanshu_iitg_analytics/refs/heads/main/hacktrain.csv'
    url_2 = 'https://raw.githubusercontent.com/deeptanshurai/deeptanshu_iitg_analytics/refs/heads/main/hacktest.csv'
    train_df = pd.read_csv(url_1)
    test_df = pd.read_csv(url_2)

    # Drop the 'Unnamed: 0' column from both tables.
    train_df = train_df.drop(columns='Unnamed: 0')
    test_df = test_df.drop(columns='Unnamed: 0')

    # NDVI columns are those containing '_N'; the text before the first '_'
    # is parsed as a YYYYMMDD date and used to order them chronologically.
    ndvi_cols = [col for col in train_df.columns if '_N' in col]
    sorted_ndvi_cols = sorted(
        ndvi_cols,
        key=lambda x: pd.to_datetime(x.split('_')[0], format='%Y%m%d'),
    )

    # Explicit copies so the in-place column assignments below never write
    # into a view of the originally loaded frames.
    train_df = train_df[['ID', 'class'] + sorted_ndvi_cols].copy()
    test_df = test_df[['ID'] + sorted_ndvi_cols].copy()

    # ------------------------------------------------------------------
    # Feature engineering
    # ------------------------------------------------------------------
    # Savitzky-Golay filter settings (window of 5 samples, polynomial order 2).
    window_length = 5
    polyorder = 2

    # The same two preprocessing steps are applied to BOTH frames. Previously
    # only the training frame was imputed, so any gap in a test series would
    # have flowed through the filter and into the model's input features.
    for frame in (train_df, test_df):
        # 1. Imputation: linear interpolation along each row (time axis),
        #    then back/forward fill for gaps at the edges of the series.
        interpolated = frame[sorted_ndvi_cols].T.interpolate(method='linear').T
        frame[sorted_ndvi_cols] = interpolated.bfill(axis=1).ffill(axis=1)

        # 2. Savitzky-Golay smoothing along each row.
        frame[sorted_ndvi_cols] = savgol_filter(
            frame[sorted_ndvi_cols], window_length, polyorder, axis=1
        )

    def get_advanced_features(df):
        """Build per-row summary features from the NDVI time-series columns.

        Reads the columns listed in ``sorted_ndvi_cols`` (taken from the
        enclosing scope) and returns a new DataFrame, indexed like ``df``,
        with these columns in order:

        * ``mean_ndvi``, ``median_ndvi``, ``std_ndvi``, ``max_ndvi``,
          ``min_ndvi``, ``range_ndvi`` (max minus min) over all NDVI columns;
        * ``winter_mean_ndvi``, ``spring_mean_ndvi``, ``summer_mean_ndvi``,
          ``autumn_mean_ndvi``: row means over the columns whose month
          (characters 4-5 of the date prefix) falls in that season;
        * ``fft_mag_1`` .. ``fft_mag_5``: magnitudes of FFT coefficients
          1 through 5 of each row.

        The FFT features index coefficients up to 5, so at least six NDVI
        columns are required.
        """
        series = df[sorted_ndvi_cols]
        features = pd.DataFrame(index=df.index)

        # Whole-series statistics.
        features['mean_ndvi'] = series.mean(axis=1)
        features['median_ndvi'] = series.median(axis=1)
        features['std_ndvi'] = series.std(axis=1)
        features['max_ndvi'] = series.max(axis=1)
        features['min_ndvi'] = series.min(axis=1)
        features['range_ndvi'] = features['max_ndvi'] - features['min_ndvi']

        # Seasonal means; insertion order fixes the output column order.
        months = [int(col.split('_')[0][4:6]) for col in sorted_ndvi_cols]
        seasons = {
            'winter': (12, 1, 2),
            'spring': (3, 4, 5),
            'summer': (6, 7, 8),
            'autumn': (9, 10, 11),
        }
        for season, season_months in seasons.items():
            season_cols = [
                col for col, m in zip(sorted_ndvi_cols, months)
                if m in season_months
            ]
            features[f'{season}_mean_ndvi'] = df[season_cols].mean(axis=1)

        # Frequency-domain features: skip coefficient 0 and keep 1..5.
        fft_magnitudes = np.abs(np.fft.fft(series, axis=1))
        for i in range(1, 6):
            features[f'fft_mag_{i}'] = fft_magnitudes[:, i]

        return features

    X_train_features = get_advanced_features(train_df)
    X_test_features = get_advanced_features(test_df)

    # Standardise features; the scaler is fitted on the training data only
    # and then reused for the test data.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_features)
    X_test_scaled = scaler.transform(X_test_features)

    # Encode the string class labels as integers for the classifier.
    le = LabelEncoder()
    y_train = le.fit_transform(train_df['class'])

    # ------------------------------------------------------------------
    # Model selection: grid search over the regularisation strength C.
    # ------------------------------------------------------------------
    param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

    # NOTE: the `multi_class` argument is intentionally not passed. It is
    # deprecated in recent scikit-learn releases, and with the lbfgs solver
    # a problem with more than two classes is already fitted as multinomial.
    log_reg = LogisticRegression(solver='lbfgs', max_iter=3000, random_state=42)

    grid_search = GridSearchCV(
        estimator=log_reg,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    grid_search.fit(X_train_scaled, y_train)

    print(f"Best C value found: {grid_search.best_params_['C']}")

    best_log_reg = grid_search.best_estimator_

    # ------------------------------------------------------------------
    # Prediction and submission file
    # ------------------------------------------------------------------
    test_predictions = best_log_reg.predict(X_test_scaled)
    test_predictions_labels = le.inverse_transform(test_predictions)

    submission_df = pd.DataFrame({'ID': test_df['ID'], 'class': test_predictions_labels})
    submission_df.to_csv('download.csv', index=False)

    # The preview message is derived from the number of rows actually shown
    # (it previously claimed 5 rows while printing 40).
    preview_rows = 40
    print("\nAdvanced submission file 'download.csv' created successfully!")
    print(f"\nFirst {preview_rows} rows of the advanced submission file:")
    print(submission_df.head(preview_rows))

except FileNotFoundError as e:
    print(e)
except Exception as e:
    # Any other failure in the cell is reported as a one-line message
    # (no traceback), matching the cell's original best-effort behaviour.
    print(f"An error occurred: {e}")



Best C value found: 1

Advanced submission file 'download.csv' created successfully!

First 5 rows of the advanced submission file:
    ID       class
0    1      forest
1    2      forest
2    3      forest
3    4      forest
4    5      forest
5    6      forest
6    7      forest
7    8      forest
8    9      forest
9   10        farm
10  11      forest
11  12      forest
12  13      forest
13  14      forest
14  15      forest
15  16      forest
16  17      forest
17  18      forest
18  19      forest
19  20      forest
20  21      forest
21  22      forest
22  23      forest
23  24       water
24  25       water
25  26       water
26  27  impervious
27  28      forest
28  29       water
29  30       water
30  31       water
31  32      forest
32  33       water
33  34      forest
34  35  impervious
35  36        farm
36  37       water
37  38       water
38  39       water
39  40  impervious
