## Submission 
- each region is trained separately because cropping conditions differ between regions
- only lightweight classifiers were considered
- for each region, the best hyper-parameters found during training are used
- using spectral indices alone worked better than raw bands or combinations of raw bands and indices


In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
import catboost as ctb
import os
import pandas as pd
import numpy as np
from utils_datapreparation import *

## load indices data for each region

In [47]:
# Project root. Defaults to the original development location, but can be
# redirected without editing the notebook by setting GEOITU_CROPMAPPING_ROOT.
path = os.environ.get('GEOITU_CROPMAPPING_ROOT', '/app/stella/dev/GeoITU/GeoITU_CropMapping')

# Input arrays are read from <path>/data; the submission CSV is written to <path>/submission.
data_path = os.path.join(path, 'data')
submission_loc = os.path.join(path, 'submission')

## load train
# load_data is provided by the utils_datapreparation star import. For the
# 'Train' split its result is unpacked into two values, which later cells
# pass to fit() as features and labels.
afgan_indices_train_X, afgan_train_y = load_data('afghan', 'indices', 'Train', data_path)
sudan_indices_train_X, sudan_train_y = load_data('sudan', 'indices', 'Train', data_path)
iran_indices_train_X, iran_train_y = load_data('iran', 'indices', 'Train', data_path)

## load test
# For the 'Test' split the result is kept as a single object (features only;
# presumably no labels are available — confirm against load_data).
afgan_indices_test_X = load_data('afghan', 'indices', 'Test', data_path)
sudan_indices_test_X = load_data('sudan', 'indices', 'Test', data_path)
iran_indices_test_X = load_data('iran', 'indices', 'Test', data_path)

In [48]:
## load test ids
# One .npy file of sample identifiers per region, read in the order
# afghan -> sudan -> iran.
# NOTE(review): allow_pickle=True will unpickle arbitrary objects, so these
# files must come from a trusted source.
_test_id_files = ('afghan_Test_ids.npy', 'sudan_Test_ids.npy', 'iran_Test_ids.npy')
afgan_ID_test_X, sudan_ID_test_X, iran_ID_test_X = [
    np.load(os.path.join(data_path, id_file), allow_pickle=True)
    for id_file in _test_id_files
]

## initialize models using best hyper-parameters

- model training and cross-validation guided the selection of the best hyper-parameters
- models are refitted on all training samples before predicting the test set
- several models were tried using raw bands only, indices only, and raw bands + indices
- the random forest classifier was marginally better overall and performed well with indices only (note that the Afghanistan model below uses CatBoost instead)

In [61]:
# Fixed seed so that Restart & Run All reproduces the same random forests
# (and therefore the same submission file).
SEED = 42

# Per-region random-forest hyper-parameters (best values found during training).
iran_params = {'n_estimators': 352, 'max_depth': 8, 'min_samples_split': 19, 'bootstrap': False, 'n_jobs': -1, 'random_state': SEED}
sudan_params = {'n_estimators': 167, 'max_depth': 14, 'min_samples_split': 31, 'bootstrap': True, 'n_jobs': -1, 'random_state': SEED}


def _flatten_samples(X):
    """Collapse every axis after the first so each sample becomes one feature row."""
    return X.reshape(X.shape[0], -1)


# Fit one classifier per region on all of that region's training samples.
# NOTE(review): CatBoost is left on its default seed so the Afghanistan model
# is unchanged — confirm that default is fixed for the installed version.
clf_afghan = ctb.CatBoostClassifier(iterations=1700).fit(_flatten_samples(afgan_indices_train_X), afgan_train_y, verbose=False)
clf_iran = RandomForestClassifier(**iran_params).fit(_flatten_samples(iran_indices_train_X), iran_train_y)
clf_sudan = RandomForestClassifier(**sudan_params).fit(_flatten_samples(sudan_indices_train_X), sudan_train_y)

# Predict each region's test set with its own model.
# np.ravel keeps the CatBoost output 1-D in case it comes back as an (n, 1)
# column, so it can be concatenated with the 1-D scikit-learn predictions.
pred_afghan = np.ravel(clf_afghan.predict(_flatten_samples(afgan_indices_test_X)))
pred_iran = clf_iran.predict(_flatten_samples(iran_indices_test_X))
pred_sudan = clf_sudan.predict(_flatten_samples(sudan_indices_test_X))

# # create submission file
# Ids and predictions are stacked in the same region order (afghan, iran, sudan)
# so that row i of one lines up with row i of the other.
stacked_ids = np.concatenate([afgan_ID_test_X, iran_ID_test_X, sudan_ID_test_X])
stacked_pred = np.concatenate([pred_afghan, pred_iran, pred_sudan])

# zip() would silently drop trailing rows on a length mismatch; fail loudly instead.
if len(stacked_ids) != len(stacked_pred):
    raise ValueError(
        f'ID/prediction count mismatch: {len(stacked_ids)} ids vs {len(stacked_pred)} predictions'
    )

df = pd.DataFrame(list(zip(stacked_ids, stacked_pred)), columns=['ID', 'Target'])

# Make sure the output folder exists before writing the CSV.
os.makedirs(submission_loc, exist_ok=True)
df.to_csv(os.path.join(submission_loc, 'indicesafghan_cat_bands_04102023_test2.csv'), index=False)