In [1]:
def _normalized_difference(a, b):
    """Return the element-wise normalized difference ``(a - b) / (a + b)``."""
    return (a - b) / (a + b)


def engineer_features(df):
    """Append spectral-index columns computed from the ``*_p50`` band columns.

    The frame is modified in place and also returned, so both
    ``df = engineer_features(df)`` and a bare ``engineer_features(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``blue_p50``, ``green_p50``, ``nir_p50``, ``re1_p50``,
        ``re2_p50``, ``red_p50`` and ``swir1_p50``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df`` with these columns added, in this order:
        API, NDVI, NDWI, SAVI, EVI, NDBI, NDPI, Composite_Index, GNDVI,
        RENDVI, PGI, BSCI, RVI.

    Raises
    ------
    KeyError
        If a required band column is missing. All bands are read before any
        column is written, so a failing call leaves ``df`` untouched.

    Notes
    -----
    No zero-denominator handling is done here; pandas division yields
    ``inf``/``NaN`` in that case instead of raising.
    """
    # Read every input band first so a missing column fails before mutation.
    blue = df['blue_p50']
    green = df['green_p50']
    nir = df['nir_p50']
    re1 = df['re1_p50']
    re2 = df['re2_p50']
    red = df['red_p50']
    swir1 = df['swir1_p50']

    df['API'] = _normalized_difference(swir1, red + blue)
    df['NDVI'] = _normalized_difference(nir, red)
    df['NDWI'] = _normalized_difference(green, nir)
    df['SAVI'] = (nir - red) / (nir + red + 0.5) * 1.5
    df['EVI'] = 2.5 * (nir - red) / (nir + 6 * red - 7.5 * blue + 1)
    df['NDBI'] = _normalized_difference(swir1, nir)
    df['NDPI'] = _normalized_difference(swir1, blue)
    df['Composite_Index'] = df['API'] * df['NDVI'] * df['NDWI']

    # New indices
    df['GNDVI'] = _normalized_difference(nir, green)
    df['RENDVI'] = _normalized_difference(re1, re2)
    df['PGI'] = _normalized_difference(green, blue)
    # NOTE(review): BSCI uses exactly the same formula as NDPI, so the two
    # columns are always identical. Kept as-is; confirm the intended formula.
    df['BSCI'] = _normalized_difference(swir1, blue)
    df['RVI'] = nir / red
    return df

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

# Load the training table and append the engineered spectral indices in one step.
data = engineer_features(pd.read_csv('TrainingData.csv'))

def check_correlations_and_select_features(df, target_col='TARGET', correlation_threshold=0.7, top_n_features=20):
    """Plot feature correlations and rank features by association with the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target column plus ``ID``, ``lon`` and ``lat``;
        every other column is treated as a feature.
    target_col : str
        Name of the label column.
    correlation_threshold : float
        Currently unused; kept so existing call sites keep working.
    top_n_features : int
        Number of top-ranked feature names to return.

    Returns
    -------
    list of str
        The ``top_n_features`` feature names with the largest absolute
        Spearman correlation to the label-encoded target, strongest first.

    Side effects: shows a correlation heatmap and prints the 25 strongest
    features.
    """
    # Separate features and target
    features = df.drop(columns=[target_col, 'ID', 'lon', 'lat'])
    target = df[target_col]

    # Encode target variable
    le = LabelEncoder()
    target_encoded = le.fit_transform(target)

    # Calculate correlations
    corr_matrix = features.corr()

    # Plot correlation heatmap
    plt.figure(figsize=(20, 16))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
    plt.title('Feature Correlation Heatmap')
    plt.tight_layout()
    plt.show()

    # Calculate correlation with target (use Spearman for categorical target)
    target_correlations = []
    for col in features.columns:
        correlation, _ = spearmanr(features[col], target_encoded)
        target_correlations.append((col, abs(correlation)))

    # Sort features by correlation with target
    target_correlations.sort(key=lambda x: x[1], reverse=True)

    print("Top correlated features with target:")
    for feature, correlation in target_correlations[:25]:
        print(f"{feature}: {correlation:.4f}")

    # Return the ranked names so the caller's assignment receives a value.
    return [feature for feature, _ in target_correlations[:top_n_features]]



In [None]:
# Requires check_correlations_and_select_features to be defined by an earlier
# cell, and `data` to hold the training frame with engineered columns.
selected_features = check_correlations_and_select_features(data)

In [5]:
from lightgbm import LGBMClassifier
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import joblib

# Columns fed to the model: the raw *_p50 bands, VV/VH, and a subset of the
# engineered indices (NDWI, SAVI, EVI and RVI are left out).
features = [
    'blue_p50', 'green_p50', 'nir_p50', 'nira_p50',
    're1_p50', 're2_p50', 're3_p50', 'red_p50', 'swir1_p50', 'swir2_p50',
    'VV_p50', 'VH_p50',
    'API', 'NDVI', 'NDBI', 'NDPI', 'Composite_Index',
    'GNDVI', 'RENDVI', 'PGI', 'BSCI',
]

X, y = data[features], data['TARGET']

# 10-fold shuffled cross-validation with a fixed seed for repeatable splits.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

model = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.1,
    num_leaves=83,
    max_depth=30,
    min_child_samples=30,
    random_state=42,
)

# cross_val_score fits clones of `model`; the final fit on all rows below is
# what the prediction cell uses.
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
model.fit(X, y)

print("Cross-Validation Accuracy Scores: ", cv_scores)
print("Mean Cross-Validation Accuracy: ", cv_scores.mean())
print("Standard Deviation of Cross-Validation Accuracy: ", cv_scores.std())

[LightGBM] [Info] Number of positive: 1334, number of negative: 1208
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.082548 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5355
[LightGBM] [Info] Number of data points in the train set: 2542, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.524784 -> initscore=0.099216
[LightGBM] [Info] Start training from score 0.099216
[LightGBM] [Info] Number of positive: 1348, number of negative: 1194
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000392 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5355
[LightGBM] [Info] Number of data points in the train set: 2542, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.530291 -> initscore=0.121313
[LightGBM] [Info] Start training from score 0.121313
[LightGBM] [Info] Numb

In [None]:
import pandas as pd

def add_id_column(df, file_name):
    """Write an ``ID`` column of the form ``<prefix>_<index + 1>`` into ``df``.

    ``<prefix>`` is the part of ``file_name`` before its first underscore (the
    whole name when there is none). ``df`` is modified in place and returned.
    """
    prefix = file_name.partition('_')[0]

    def _row_id(position):
        # IDs are 1-based while the frame's index labels are used as-is.
        return f"{prefix}_{position+1}"

    df['ID'] = df.index.map(_row_id)
    return df

# Load each regional test file and stamp it with region-prefixed row IDs.
test1 = add_id_column(pd.read_csv('Kenya_testing.csv'), 'Kenya')
test2 = add_id_column(pd.read_csv('Spain_validation.csv'), 'Spain')
test3 = add_id_column(pd.read_csv('VNM_testing.csv'), 'VNM')

t1, t2, t3 = (pd.DataFrame(frame) for frame in (test1, test2, test3))

sample_submission = pd.read_csv('SampleSubmission.csv')
print(sample_submission.head())

# Stack the three regions, then attach their columns to the submission rows by ID.
test = pd.concat([t1, t2, t3], ignore_index=True, join='outer')
Test = pd.DataFrame(test)

merged_data = engineer_features(
    pd.merge(sample_submission, Test, on='ID', how='left')
)
merged_data


In [None]:
# Show every column present in the merged test frame after feature engineering.
merged_data.columns

In [None]:
# Select exactly the columns the model was fitted on, by name and in training
# order. Dropping a hand-maintained list instead would silently depend on the
# test CSVs having the same column set and order as the training data.
testdata = merged_data[features]

test_pred = model.predict(testdata)

# The left merge keeps sample_submission's row order, so predictions line up
# with its IDs.
submission_pred = pd.DataFrame({'ID': sample_submission['ID'], 'TARGET': test_pred})

submission_pred.to_csv('pred_submission_9_LGMC.csv', index=False)

submission_pred.head()