In [1]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder,StandardScaler
import joblib

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Load the training table; the training cell further down reads its feature
# columns and the label column from this frame.
data = pd.read_csv('TrainingData.csv')  


# --- Disabled experiment (kept for reference, nothing below is executed) ---
# Restricts the rows to three latitude/longitude bounding boxes and
# concatenates the three subsets into `filtered_data`.
# spain_coords = {'lat_min': 36.0, 'lat_max': 43.0, 'lon_min': -9.0, 'lon_max': 3.0}
# kenya_coords = {'lat_min': -4.5, 'lat_max': 5.0, 'lon_min': 34.0, 'lon_max': 42.0}
# vnm_coords = {'lat_min': 8.0, 'lat_max': 24.0, 'lon_min': 102.0, 'lon_max': 110.0}


# def filter_data(data, coords):
#     return data[(data['lat'] >= coords['lat_min']) & (data['lat'] <= coords['lat_max']) & 
#                 (data['lon'] >= coords['lon_min']) & (data['lon'] <= coords['lon_max'])]


# data_spain = filter_data(data, spain_coords)
# data_kenya = filter_data(data, kenya_coords)
# data_vnm = filter_data(data, vnm_coords)


# filtered_data = pd.concat([data_spain, data_kenya, data_vnm])


In [3]:
def engineer_features(df):
    """Add spectral-index feature columns to ``df`` in place and return it.

    Every index is computed from the ``*_p50`` band columns already present in
    ``df`` (blue, green, red, nir, re1, re2, swir1). The frame is modified in
    place and the same object is returned, so the function works both as a
    bare call and on the right-hand side of an assignment.

    Columns added, in this order: API, NDVI, NDWI, SAVI, EVI, NDBI, NDPI,
    Composite_Index, GNDVI, RENDVI, PGI, BSCI, RVI.

    A zero denominator would otherwise leave +/-inf in a column (for example
    RVI when ``red_p50`` is 0). Such values are replaced with NaN so that an
    undefined index is reported as missing rather than as an infinite feature.

    Args:
        df: pandas DataFrame holding the band columns listed above.

    Returns:
        The same DataFrame object, with the index columns added or overwritten.

    Raises:
        KeyError: if one of the required band columns is absent.
    """
    blue, green, red = df['blue_p50'], df['green_p50'], df['red_p50']
    nir, swir1 = df['nir_p50'], df['swir1_p50']
    re1, re2 = df['re1_p50'], df['re2_p50']

    def _finite(values):
        # pandas yields +/-inf for x/0; treat an undefined index as missing.
        return values.replace([float('inf'), float('-inf')], float('nan'))

    def _norm_diff(a, b):
        # Normalised difference (a - b) / (a + b).
        return _finite((a - b) / (a + b))

    df['API'] = _norm_diff(swir1, red + blue)
    df['NDVI'] = _norm_diff(nir, red)
    df['NDWI'] = _norm_diff(green, nir)
    df['SAVI'] = _finite((nir - red) / (nir + red + 0.5) * 1.5)
    df['EVI'] = _finite(2.5 * (nir - red) / (nir + 6 * red - 7.5 * blue + 1))
    df['NDBI'] = _norm_diff(swir1, nir)
    df['NDPI'] = _norm_diff(swir1, blue)
    # Built from the already-cleaned columns, so NaN propagates and no inf remains.
    df['Composite_Index'] = df['API'] * df['NDVI'] * df['NDWI']

    # New indices
    df['GNDVI'] = _norm_diff(nir, green)
    df['RENDVI'] = _norm_diff(re1, re2)
    df['PGI'] = _norm_diff(green, blue)
    # Same formula as NDPI, so the two columns are always identical; the name
    # is kept because the feature lists reference both.
    df['BSCI'] = _norm_diff(swir1, blue)
    df['RVI'] = _finite(nir / red)
    return df

In [4]:

# Building blocks shared by the feature sets below.
coordinate_columns = ['lon', 'lat']
optical_bands = [
    'blue_p50', 'green_p50', 'nir_p50', 'nira_p50', 're1_p50',
    're2_p50', 're3_p50', 'red_p50', 'swir1_p50', 'swir2_p50',
]
radar_bands = ['VV_p50', 'VH_p50']
derived_indices = [
    'API', 'NDVI', 'NDBI', 'NDPI', 'Composite_Index',
    'GNDVI', 'RENDVI', 'PGI', 'BSCI',
]

# Raw columns only: coordinates + optical bands + VV/VH.
features = coordinate_columns + optical_bands + radar_bands

# Raw columns plus the derived spectral indices.
features_current_best = features + derived_indices

# Same as features_current_best but without the VV/VH columns.
features_new = coordinate_columns + optical_bands + derived_indices

# Name of the label column.
target = 'TARGET'

# Save the model
# joblib.dump(model, 'plastic_cover_classifier.pkl')
# joblib.dump(label_encoder, 'label_encoder.pkl')

# y_pred = model.predict(X_test)  
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


In [16]:
# Add the spectral-index columns. Assigning the result keeps this cell correct
# whether engineer_features mutates `data` or returns a new frame.
data = engineer_features(data)
se = StandardScaler()  # unused while the scaling line below stays disabled
X = data[features]
# X = se.fit_transform(X)
y = data[target]

# model = RandomForestClassifier(n_estimators=500, random_state=42,max_depth=40)
# Best parameters RandomSearch:  {'reg_lambda': 0.1, 'reg_alpha': 0.1, 'num_leaves': 80, 'n_estimators': 250, 'min_child_samples': 35, 'max_depth': 20, 'learning_rate': 0.3}
# NOTE(review): the search log below reports subsample=1.0 while the model uses
# subsample=0.1. LightGBM only subsamples rows when subsample_freq > 0 (its
# default is 0), so the value has no effect as configured -- confirm which
# setting was intended before enabling bagging.
model = LGBMClassifier(
    random_state=42,
    n_estimators=300,
    num_leaves=90,
    min_child_samples=45,
    max_depth=40,
    colsample_bytree=0.8,
    learning_rate=0.05,
    reg_alpha=0.1,
    subsample=0.1,
)
#  END colsample_bytree=0.8, learning_rate=0.05, max_depth=40, min_child_samples=45, n_estimators=300, num_leaves=90, reg_alpha=0.1, reg_lambda=0, subsample=1.0; total time= 6.9min

# Seeded, shuffled 10-fold splitter. It must be defined here: cross_val_score
# below references `kf`, which otherwise only exists as leftover kernel state.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

# cross_val_score fits clones of `model` and leaves `model` itself unfitted,
# so fit it on the full training set before it is used for prediction.
model.fit(X, y)

[LightGBM] [Info] Number of positive: 1334, number of negative: 1208
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3570
[LightGBM] [Info] Number of data points in the train set: 2542, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.524784 -> initscore=0.099216
[LightGBM] [Info] Start training from score 0.099216
[LightGBM] [Info] Number of positive: 1348, number of negative: 1194
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002992 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3570
[LightGBM] [Info] Number of data points in the train set: 2542, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.530291 -> initscore=0.121313
[LightGBM] [Info] Start training from score 0.121313
[LightGBM] [Info] Numb

In [18]:
# Report the per-fold accuracies together with their mean and spread.
cv_summary = (
    ("Cross-Validation Accuracy Scores: ", cv_scores),
    ("Mean Cross-Validation Accuracy: ", cv_scores.mean()),
    ("Standard Deviation of Cross-Validation Accuracy: ", cv_scores.std()),
)
for label, value in cv_summary:
    print(label, value)

Cross-Validation Accuracy Scores:  [0.97526502 0.97173145 0.96819788 0.96113074 0.97526502 0.9822695
 0.9822695  0.96808511 0.96808511 0.96099291]
Mean Cross-Validation Accuracy:  0.9713292233666643
Standard Deviation of Cross-Validation Accuracy:  0.007156986394114929


In [None]:
import pandas as pd

def add_id_column(df, file_name):
    """Return a copy of ``df`` with an ``ID`` column of the form ``<prefix>_<n>``.

    ``<prefix>`` is the part of ``file_name`` before its first underscore (the
    whole string when it contains none) and ``<n>`` is the row's index label
    plus one. The input frame is left untouched, so calling the function on the
    same frame again gives the same result.

    Args:
        df: pandas DataFrame whose index labels support ``+ 1`` (for example
            the default integer index produced by ``pd.read_csv``).
        file_name: file name or bare prefix, e.g. ``'Kenya_testing.csv'`` or
            ``'Kenya'``.

    Returns:
        A new DataFrame with the ``ID`` column added (or overwritten).
    """
    file_prefix = file_name.split('_')[0]
    ids = [f"{file_prefix}_{label + 1}" for label in df.index]
    # assign() works on a copy, so the caller's frame is not mutated.
    return df.assign(ID=ids)

# Held-out files keyed by the prefix used to build each row's ID
# ("<prefix>_<row number>"), which is then matched against the submission IDs.
test_files = {
    'Kenya': 'Kenya_testing.csv',
    'Spain': 'Spain_validation.csv',
    'VNM': 'VNM_testing.csv',
}
t1, t2, t3 = (
    add_id_column(pd.read_csv(path), prefix)
    for prefix, path in test_files.items()
)

sample_submission = pd.read_csv('SampleSubmission.csv')
print(sample_submission.head())

# Stack the three files; columns missing from a file are filled with NaN.
test = pd.concat([t1, t2, t3], ignore_index=True)

# Left merge keeps one row per submission ID, in submission order.
# validate='many_to_one' raises if a test ID is duplicated, which would
# otherwise silently add rows and misalign the predictions.
merged_data = pd.merge(sample_submission, test, on='ID', how='left', validate='many_to_one')

merged_data = engineer_features(merged_data)
merged_data.head()


In [None]:
# Inspect which columns `merged_data` currently holds.
merged_data.columns

In [None]:
# Predict with exactly the columns the training matrix `X` was built from.
# A hard-coded list such as `features_new` can drift from the training cell
# and make predict() fail on a feature-count mismatch.
feature_columns = list(X.columns)
testdata = merged_data[feature_columns]

test_pred = model.predict(testdata)

# One prediction per submission row is required for the IDs to line up.
assert len(test_pred) == len(sample_submission), (
    f"expected {len(sample_submission)} predictions, got {len(test_pred)}"
)
submission_pred = pd.DataFrame({'ID': sample_submission['ID'], 'TARGET': test_pred})

submission_pred.to_csv('pred_submission_101_LGMC.csv', index=False)
submission_pred.head()