In [1]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import joblib


data = pd.read_csv('TrainingData.csv')  

# Define geographic coordinates for Spain, Kenya, and Vietnam
spain_coords = {'lat_min': 36.0, 'lat_max': 43.0, 'lon_min': -9.0, 'lon_max': 3.0}
kenya_coords = {'lat_min': -4.5, 'lat_max': 5.0, 'lon_min': 34.0, 'lon_max': 42.0}
vnm_coords = {'lat_min': 8.0, 'lat_max': 24.0, 'lon_min': 102.0, 'lon_max': 110.0}

# Function to filter data for a specific country
def filter_data(data, coords):
    return data[(data['lat'] >= coords['lat_min']) & (data['lat'] <= coords['lat_max']) & 
                (data['lon'] >= coords['lon_min']) & (data['lon'] <= coords['lon_max'])]

# Filter data for each country
data_spain = filter_data(data, spain_coords)
data_kenya = filter_data(data, kenya_coords)
data_vnm = filter_data(data, vnm_coords)

# Combine the filtered data
filtered_data = pd.concat([data_spain, data_kenya, data_vnm])


[LightGBM] [Info] Number of positive: 1324, number of negative: 1218
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000282 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3570
[LightGBM] [Info] Number of data points in the train set: 2542, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.520850 -> initscore=0.083447
[LightGBM] [Info] Start training from score 0.083447
[LightGBM] [Info] Number of positive: 1340, number of negative: 1202
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000316 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3570
[LightGBM] [Info] Number of data points in the train set: 2542, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.527144 -> initscore=0.108683
[LightGBM] [Info] Start training from score 0.108683
[LightGBM] [Info] Numb

In [17]:
# Define features and target
features = ['lon', 'lat', 'blue_p50', 'green_p50', 'nir_p50', 'nira_p50', 're1_p50', 're2_p50', 're3_p50', 'red_p50', 'swir1_p50', 'swir2_p50', 'VV_p50', 'VH_p50']
features_new = ['lon', 'lat', 'blue_p50', 'green_p50', 'nir_p50', 'nira_p50',
       're1_p50', 're2_p50', 're3_p50', 'red_p50', 'swir1_p50', 'swir2_p50',
       'VV_p50', 'VH_p50','API','NDVI',
       'NDBI', 'NDPI', 'Composite_Index', 'GNDVI', 'RENDVI', 'PGI', 'BSCI']
target = 'TARGET'

def engineer_features(df):
    # Normalize the reflectance bands
    reflectance_bands = ['blue_p50', 'green_p50', 'nir_p50', 'nira_p50', 're1_p50', 're2_p50', 're3_p50', 'red_p50', 'swir1_p50', 'swir2_p50']

    df['API'] = (df['swir1_p50'] - (df['red_p50'] + df['blue_p50'])) / (df['swir1_p50'] + (df['red_p50'] + df['blue_p50']))
    df['NDVI'] = (df['nir_p50'] - df['red_p50']) / (df['nir_p50'] + df['red_p50'])
    df['NDWI'] = (df['green_p50'] - df['nir_p50']) / (df['green_p50'] + df['nir_p50'])
    df['SAVI'] = (df['nir_p50'] - df['red_p50']) / (df['nir_p50'] + df['red_p50'] + 0.5) * 1.5
    df['EVI'] = 2.5 * (df['nir_p50'] - df['red_p50']) / (df['nir_p50'] + 6 * df['red_p50'] - 7.5 * df['blue_p50'] + 1)
    df['NDBI'] = (df['swir1_p50'] - df['nir_p50']) / (df['swir1_p50'] + df['nir_p50'])
    df['NDPI'] = (df['swir1_p50'] - df['blue_p50']) / (df['swir1_p50'] + df['blue_p50'])
    df['Composite_Index'] = df['API'] * df['NDVI'] * df['NDWI']

    # New indices
    df['GNDVI'] = (df['nir_p50'] - df['green_p50']) / (df['nir_p50'] + df['green_p50'])
    df['RENDVI'] = (df['re1_p50'] - df['re2_p50']) / (df['re1_p50'] + df['re2_p50'])
    df['PGI'] = (df['green_p50'] - df['blue_p50']) / (df['green_p50'] + df['blue_p50'])
    df['BSCI'] = (df['swir1_p50'] - df['blue_p50']) / (df['swir1_p50'] + df['blue_p50'])
    df['RVI'] = df['nir_p50'] / df['red_p50']
    return df

engineer_features(filtered_data)

X = filtered_data[features_new]
y = filtered_data[target]

# model = RandomForestClassifier(n_estimators=500, random_state=42,max_depth=40)
model = LGBMClassifier(random_state=42,n_estimators=300,num_leaves=83)

kf = KFold(n_splits=10, shuffle=True, random_state=42)


cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

# Print cross-validation results
print("Cross-Validation Accuracy Scores: ", cv_scores)
print("Mean Cross-Validation Accuracy: ", cv_scores.mean())
print("Standard Deviation of Cross-Validation Accuracy: ", cv_scores.std())

# Train the model on the entire dataset
model.fit(X, y)

# Save the model
# joblib.dump(model, 'plastic_cover_classifier.pkl')
# joblib.dump(label_encoder, 'label_encoder.pkl')

# Predict on a hold-out test set (if available) or new data
# y_pred = model.predict(X_test)  # Uncomment and replace X_test with actual test data if available
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


[LightGBM] [Info] Number of positive: 1324, number of negative: 1218
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000398 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5865
[LightGBM] [Info] Number of data points in the train set: 2542, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.520850 -> initscore=0.083447
[LightGBM] [Info] Start training from score 0.083447
[LightGBM] [Info] Number of positive: 1340, number of negative: 1202
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000371 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5865
[LightGBM] [Info] Number of data points in the train set: 2542, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.527144 -> initscore=0.108683
[LightGBM] 

In [3]:
import pandas as pd

def add_id_column(df, file_name):
  """Adds an ID column based on the file name and index."""
  file_prefix = file_name.split('_')[0]
  df['ID'] = df.index.map(lambda x: f"{file_prefix}_{x+1}")
  return df

t1 = pd.read_csv('Kenya_testing.csv')
t2 = pd.read_csv('Spain_validation.csv')
t3 = pd.read_csv('VNM_testing.csv')


test1 = add_id_column(t1, 'Kenya')  
test2 = add_id_column(t2, 'Spain')
test3 = add_id_column(t3, 'VNM')

t1 = pd.DataFrame(test1)
t2 = pd.DataFrame(test2)
t3 = pd.DataFrame(test3)



sample_submission = pd.read_csv('SampleSubmission.csv')
print(sample_submission.head())
test = pd.concat([t1, t2, t3], ignore_index=True, join='outer')
Test = pd.DataFrame(test)

merged_data = pd.merge(sample_submission, Test, on='ID', how='left')

merged_data=engineer_features(merged_data)
merged_data


           ID  TARGET
0     Kenya_1     NaN
1    Kenya_10     NaN
2   Kenya_100     NaN
3  Kenya_1000     NaN
4  Kenya_1001     NaN


Unnamed: 0,ID,TARGET
0,Kenya_1,1
1,Kenya_10,1
2,Kenya_100,1
3,Kenya_1000,2
4,Kenya_1001,2


In [7]:
merged_data.columns

Index(['ID', 'TARGET', 'lon', 'lat', 'blue_p50', 'green_p50', 'nir_p50',
       'nira_p50', 're1_p50', 're2_p50', 're3_p50', 'red_p50', 'swir1_p50',
       'swir2_p50', 'VV_p50', 'VH_p50', 'API', 'NDVI', 'NDWI', 'SAVI', 'EVI',
       'NDBI', 'NDPI', 'Composite_Index', 'GNDVI', 'RENDVI', 'PGI', 'BSCI',
       'RVI'],
      dtype='object')

In [11]:
testdata = merged_data.drop(columns=['ID','TARGET'])

test_pred = model.predict(testdata)

# Create submission DataFrame
submission_pred = pd.DataFrame({'ID': sample_submission['ID'], 'TARGET': test_pred})

# Save to CSV
submission_pred.to_csv('pred_submission_4_LGMC.csv', index=False)
submission_pred.head()

Unnamed: 0,ID,TARGET
0,Kenya_1,1
1,Kenya_10,1
2,Kenya_100,1
3,Kenya_1000,2
4,Kenya_1001,2


In [None]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures



# Select important features
important_features = data[['NDPI', 'API', 'Composite_Index', 'blue_p50', 'NDBI', 'green_p50', 're3_p50', 're2_p50', 'nira_p50', 'nir_p50']]

# Create polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly_features = poly.fit_transform(important_features)

# Create a new DataFrame with polynomial features
poly_df = pd.DataFrame(poly_features, columns=poly.get_feature_names(important_features.columns))

# Calculate correlation with target
poly_df['TARGET'] = data['TARGET']
correlation_matrix = poly_df.corr()
target_correlation = correlation_matrix['TARGET'].sort_values(ascending=False)

print(target_correlation)


AttributeError: 'PolynomialFeatures' object has no attribute 'get_feature_names'