In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import joblib


# Load the training table; the path is relative to the notebook's working directory.
# Later statements read the 'lat'/'lon' columns, the columns listed in `features`,
# and the 'TARGET' column from this frame.
data = pd.read_csv('TrainingData.csv')  

# Rectangular latitude/longitude bounds used to select rows for Spain, Kenya, and Vietnam
spain_coords = dict(lat_min=36.0, lat_max=43.0, lon_min=-9.0, lon_max=3.0)
kenya_coords = dict(lat_min=-4.5, lat_max=5.0, lon_min=34.0, lon_max=42.0)
vnm_coords = dict(lat_min=8.0, lat_max=24.0, lon_min=102.0, lon_max=110.0)

def filter_data(data, coords):
    """Return the rows of ``data`` that lie inside a lat/lon bounding box.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with numeric ``'lat'`` and ``'lon'`` columns.
    coords : mapping
        Box limits under the keys ``'lat_min'``, ``'lat_max'``,
        ``'lon_min'`` and ``'lon_max'``. Both ends of each range are inclusive.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels and columns.
        Rows whose ``lat`` or ``lon`` is NaN never match.
    """
    # Series.between is inclusive on both ends by default, i.e. (>= min) & (<= max).
    lat_in_box = data['lat'].between(coords['lat_min'], coords['lat_max'])
    lon_in_box = data['lon'].between(coords['lon_min'], coords['lon_max'])
    return data[lat_in_box & lon_in_box]

# Select each country's rows with its bounding box (Spain, Kenya, Vietnam)
data_spain, data_kenya, data_vnm = (
    filter_data(data, box) for box in (spain_coords, kenya_coords, vnm_coords)
)

# Stack the three subsets into one frame; the original row labels are kept
filtered_data = pd.concat([data_spain, data_kenya, data_vnm])

# Model inputs (coordinates plus the *_p50 columns) and the label column
features = [
    'lon', 'lat',
    'blue_p50', 'green_p50', 'nir_p50', 'nira_p50',
    're1_p50', 're2_p50', 're3_p50', 'red_p50',
    'swir1_p50', 'swir2_p50',
    'VV_p50', 'VH_p50',
]
target = 'TARGET'

X = filtered_data[features]
y = filtered_data[target]

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [3]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
# Reuse the feature list that was used to build X / X_train instead of keeping a
# second hand-written copy, so the preprocessing columns cannot drift from the
# columns the model is actually trained on.
numerical_features = list(features)

# Numeric preprocessing: standardize each column, then add all degree-2 terms
# (squares and pairwise products, no constant column). With 14 input columns
# this yields 14 + 105 = 119 columns.
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    # ('pca', PCA(n_components=10)),  # dimensionality reduction, currently disabled
])

# Apply the numeric pipeline to the selected columns. ColumnTransformer's
# default remainder drops any column that is not listed.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
])

# Preprocessing and classifier are fitted together, so the scaler statistics
# come from the training split only.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(n_estimators=300, num_leaves=83, random_state=42)),
])

model_pipeline.fit(X_train, y_train)

# Pipeline.score delegates to the classifier's score, i.e. mean accuracy,
# evaluated here on the held-out validation split.
val_accuracy = model_pipeline.score(X_val, y_val)
print(f"Validation accuracy: {val_accuracy}")


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Number of positive: 1183, number of negative: 1077
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.086459 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 30345
[LightGBM] [Info] Number of data points in the train set: 2260, number of used features: 119
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.523451 -> initscore=0.093874
[LightGBM] [Info] Start training from score 0.093874
Validation accuracy: 0.9575221238938053
