In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


if __name__ == "__main__":
    # Read the per-country training tables (Kenya, Spain, VNM, in that order).
    # Replace these paths with your own data-loading code if needed.
    df_Kenya, df_Spain, df_VNM = map(
        pd.read_csv,
        ('Kenya_training.csv', 'Spain_training.csv', 'VNM_training.csv'),
    )

    # The label column is assumed to be called 'TARGET'.

In [None]:
    # Separate predictors from the label for every country. 'ID' is dropped
    # together with 'TARGET' so that it is not fed to the model as a feature.
    X_Kenya = df_Kenya.drop(columns=['TARGET', 'ID'])
    y_Kenya = df_Kenya['TARGET']

    X_Spain = df_Spain.drop(columns=['TARGET', 'ID'])
    y_Spain = df_Spain['TARGET']

    X_VNM = df_VNM.drop(columns=['TARGET', 'ID'])
    y_VNM = df_VNM['TARGET']

    # Fit ONE encoder on the labels of all three countries and only then
    # transform each country. Re-fitting per country (fit_transform three
    # times) yields the same codes only when every country contains the exact
    # same label set, and leaves the encoder holding just the last mapping.
    label_encoder = LabelEncoder()
    label_encoder.fit(pd.concat([y_Kenya, y_Spain, y_VNM], ignore_index=True))
    y_Kenya = label_encoder.transform(y_Kenya)
    y_Spain = label_encoder.transform(y_Spain)
    y_VNM = label_encoder.transform(y_VNM)
    

In [None]:
class VegetationIndexTransformer(BaseEstimator, TransformerMixin):
    """Append NDVI, EVI and SAVI columns computed from NIR, red and blue bands.

    The three band columns are multiplied by ``scale_factor`` and are returned
    in that rescaled form; every other input column passes through unchanged.

    Parameters
    ----------
    nir_col, red_col, blue_col : str
        Names of the near-infrared, red and blue band columns in the input
        DataFrame.
    scale_factor : float, default=0.0001
        Multiplier applied to the three band columns before the indices are
        computed.
    soil_factor : float, default=0.5
        The ``L`` term of the SAVI formula.
    """

    def __init__(self, nir_col='nir_p50', red_col='red_p50', blue_col='blue_p50',
                 scale_factor=0.0001, soil_factor=0.5):
        # Parameters are stored unmodified under their own names so that
        # BaseEstimator.get_params / set_params keep working.
        self.nir_col = nir_col
        self.red_col = red_col
        self.blue_col = blue_col
        self.scale_factor = scale_factor
        self.soil_factor = soil_factor

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer learns no state."""
        return self

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Divide element-wise, yielding 0.0 wherever the denominator is exactly 0.

        A plain division would produce inf/NaN for those rows. Missing values
        already present in the inputs still propagate as NaN.
        """
        ratio = numerator / denominator.where(denominator != 0)
        return ratio.mask(denominator == 0, 0.0)

    def transform(self, X):
        """Return a copy of ``X`` with rescaled bands and three index columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns named by ``nir_col``, ``red_col`` and
            ``blue_col``; a missing column raises ``KeyError``.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with the band columns rescaled and ``'NDVI'``,
            ``'EVI'`` and ``'SAVI'`` appended. The caller's frame is not
            modified.
        """
        X = X.copy()
        for col in [self.nir_col, self.red_col, self.blue_col]:
            X[col] = X[col] * self.scale_factor

        nir = X[self.nir_col]
        red = X[self.red_col]
        blue = X[self.blue_col]
        nir_minus_red = nir - red
        L = self.soil_factor

        X['NDVI'] = self._safe_ratio(nir_minus_red, nir + red)
        X['EVI'] = 2.5 * self._safe_ratio(nir_minus_red, nir + 6 * red - 7.5 * blue + 1)
        X['SAVI'] = self._safe_ratio(nir_minus_red, nir + red + L) * (1 + L)

        return X

def create_vegetation_index_pipeline():
    """Build the feature-engineering pipeline used ahead of the classifier.

    The pipeline has two steps:

    1. ``'veg_indices'`` - a ``VegetationIndexTransformer`` with its default
       column names, which appends the NDVI, EVI and SAVI columns.
    2. ``'scaler'`` - a ``StandardScaler`` applied to every resulting column.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A new, unfitted pipeline.
    """
    return Pipeline([
        ('veg_indices', VegetationIndexTransformer()),
        ('scaler', StandardScaler()),
    ])
    

In [11]:
# Hold out 20% of each country's samples for evaluation; the fixed seed keeps
# the three splits reproducible.
X_train_Kenya, X_test_Kenya, y_train_Kenya, y_test_Kenya = train_test_split(
    X_Kenya, y_Kenya, random_state=42, test_size=0.2
)
X_train_Spain, X_test_Spain, y_train_Spain, y_test_Spain = train_test_split(
    X_Spain, y_Spain, random_state=42, test_size=0.2
)
X_train_VNM, X_test_VNM, y_train_VNM, y_test_VNM = train_test_split(
    X_VNM, y_VNM, random_state=42, test_size=0.2
)

# Feature engineering (vegetation indices + scaling) feeding a random forest.
feature_pipeline = create_vegetation_index_pipeline()
full_pipeline = Pipeline(steps=[
    ('features', feature_pipeline),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100)),
])

# Train on the Spain split and predict its held-out part.
# To run another country instead, swap the Spain variables for the matching
# *_Kenya or *_VNM ones (and rename y_pred_Spain accordingly).
y_pred_Spain = full_pipeline.fit(X_train_Spain, y_train_Spain).predict(X_test_Spain)

In [12]:
# Overall accuracy of the Spain model on its held-out split.
# (For Kenya or VNM, substitute the matching y_test_* / y_pred_* variables.)
accuracy_Spain = accuracy_score(y_test_Spain, y_pred_Spain)
print(f"Spain Accuracy: {accuracy_Spain:.4f}")

# Per-class precision / recall / F1.
# NOTE(review): target_names are applied in encoded-label order (0, then 1);
# confirm that encoded class 0 really is 'Agricultural Plastic'.
class_report_Spain = classification_report(
    y_test_Spain,
    y_pred_Spain,
    target_names=['Agricultural Plastic', 'Non-Agricultural Plastic'],
)
print("\nClassification Report:")
print(class_report_Spain)

Spain Accuracy: 0.9360

Classification Report:
                          precision    recall  f1-score   support

    Agricultural Plastic       0.93      0.92      0.93        75
Non-Agricultural Plastic       0.94      0.95      0.94        97

                accuracy                           0.94       172
               macro avg       0.94      0.93      0.93       172
            weighted avg       0.94      0.94      0.94       172



In [None]:

def add_id_column(df, file_name):
    """Write an 'ID' column of the form ``<prefix>_<index + 1>`` into ``df``.

    ``<prefix>`` is the part of ``file_name`` before its first underscore (the
    whole name when there is none). The column is written into ``df`` itself,
    and that same object is returned.
    """
    prefix, _, _ = file_name.partition('_')

    def _make_id(index_value):
        # IDs are one-based relative to the frame's index values.
        return f"{prefix}_{index_value+1}"

    df['ID'] = df.index.map(_make_id)
    return df

# Raw hold-out tables, one per country.
t1 = pd.read_csv('Kenya_testing.csv')
t2 = pd.read_csv('Spain_validation.csv')
t3 = pd.read_csv('VNM_testing.csv')

# Give every row a "<Country>_<index + 1>" identifier so it can be matched
# against the IDs listed in the sample submission.
test1 = add_id_column(t1, 'Kenya')
test2 = add_id_column(t2, 'Spain')
test3 = add_id_column(t3, 'VNM')

t1, t2, t3 = pd.DataFrame(test1), pd.DataFrame(test2), pd.DataFrame(test3)

sample_submission = pd.read_csv('SampleSubmission.csv')
print(sample_submission.head())

# Stack the three countries into one frame, then order the rows like the
# sample submission by left-joining on 'ID'.
test = pd.concat([t1, t2, t3], join='outer', ignore_index=True)
Test = pd.DataFrame(test)
merged_data = pd.merge(sample_submission, Test, how='left', on='ID')

# Keep only the model inputs: the identifier and the placeholder target
# coming from the sample submission are removed before predicting.
# NOTE(review): the remaining columns are assumed to match the training
# features in name and order - confirm against the training frame.
testdata = merged_data.drop(columns=['ID', 'TARGET'])
test_pred = full_pipeline.predict(testdata)

# Assemble the submission and write it to disk.
submission_pred = pd.DataFrame({'ID': sample_submission['ID'], 'TARGET': test_pred})
submission_pred.to_csv('pred_submission.csv', index=False)

submission_pred.head()


In [None]:
# Feature names seen by the classifier: the columns of the frame the pipeline
# was fitted on (X_train_Spain), followed by the three indices appended by
# VegetationIndexTransformer. The previous reference to `X` pointed at a
# variable that is never defined in this notebook.
feature_names = X_train_Spain.columns.tolist() + ['NDVI', 'EVI', 'SAVI']

# Rank the random forest's feature importances from most to least important.
importances = full_pipeline.named_steps['classifier'].feature_importances_
feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})
feature_importance = feature_importance.sort_values('importance', ascending=False).reset_index(drop=True)
print("\nTop 10 Feature Importances:")
print(feature_importance.head(10))

# Verify the number of features: both counts should be equal.
print(f"\nNumber of features: {len(feature_names)}")
print(f"Number of importance values: {len(importances)}")