# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# class VegetationIndexTransformer(BaseEstimator, TransformerMixin):
#     def __init__(self, nir_col='nir_p50', red_col='red_p50', blue_col='blue_p50'):
#         self.nir_col = nir_col
#         self.red_col = red_col
#         self.blue_col = blue_col
    
#     def fit(self, X, y=None):
#         return self
    
#     def transform(self, X):
#         X = X.copy()
#         for col in [self.nir_col, self.red_col, self.blue_col]:
#             X[col] = X[col] * 0.0001
        
#         X['NDVI'] = (X[self.nir_col] - X[self.red_col]) / (X[self.nir_col] + X[self.red_col])
#         X['EVI'] = 2.5 * ((X[self.nir_col] - X[self.red_col]) / 
#                      (X[self.nir_col] + 6 * X[self.red_col] - 7.5 * X[self.blue_col] + 1))
#         L = 0.5
#         X['SAVI'] = ((X[self.nir_col] - X[self.red_col]) / 
#                 (X[self.nir_col] + X[self.red_col] + L)) * (1 + L)
        
#         return X

# def create_vegetation_index_pipeline():
#     vegetation_pipeline = Pipeline([
#         ('veg_indices', VegetationIndexTransformer()),
#         ('scaler', StandardScaler())
#     ])
    
#     return vegetation_pipeline

class HierarchicalModel:
    """Two-stage classifier that routes each sample to a per-location model.

    Stage one is a random forest that predicts the location of a sample from
    its transformed features. Stage two is one random forest per location,
    trained only on that location's samples, that predicts the target.

    NOTE(review): ``create_vegetation_index_pipeline`` is only visible as
    commented-out code in this file; it has to be defined elsewhere (for
    example in an earlier notebook cell) before this class is instantiated.
    """

    def __init__(self):
        self.location_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        # Maps an original (un-encoded) location value to its fitted model.
        self.location_specific_models = {}
        self.feature_pipeline = create_vegetation_index_pipeline()
        self.location_encoder = LabelEncoder()
        self.target_encoder = LabelEncoder()

    def fit(self, X, y, locations):
        """Fit the feature pipeline, the location classifier and one model per location.

        Args:
            X: Feature table accepted by the feature pipeline.
            y: Target labels, one per row of ``X``.
            locations: Location label of every row of ``X``.

        Returns:
            The fitted instance, so calls can be chained.
        """
        # Work on a plain array so the per-location masks below are always
        # element-wise and positional, whatever container the caller passed
        # (a list compared with ``==`` would yield a single bool).
        locations = np.asarray(locations)

        # Encode locations and target
        encoded_locations = self.location_encoder.fit_transform(locations)
        encoded_target = self.target_encoder.fit_transform(y)

        # Prepare features
        X_transformed = np.asarray(self.feature_pipeline.fit_transform(X))

        # Train location classifier
        self.location_classifier.fit(X_transformed, encoded_locations)

        # Start from an empty mapping so a re-fit never keeps models for
        # locations that are absent from the new training data.
        self.location_specific_models = {}

        # Train location-specific models on the rows whose true location matches.
        for location in np.unique(locations):
            mask = locations == location
            model = RandomForestClassifier(n_estimators=100, random_state=42)
            model.fit(X_transformed[mask], encoded_target[mask])
            self.location_specific_models[location] = model

        return self

    def predict(self, X):
        """Predict the target for every row of ``X``.

        Each row is first assigned a location by the location classifier and
        is then scored by the model fitted for that predicted location.

        Args:
            X: Feature table accepted by the feature pipeline.

        Returns:
            Predicted target labels in their original (decoded) form, in the
            same order as the rows of ``X``.
        """
        X_transformed = np.asarray(self.feature_pipeline.transform(X))
        predicted_locations = self.location_classifier.predict(X_transformed)
        decoded_locations = np.asarray(
            self.location_encoder.inverse_transform(predicted_locations)
        )

        # Score all rows routed to the same location in a single call instead
        # of invoking a forest once per row; results are written back by mask
        # so the original row order is preserved.
        encoded_predictions = np.empty(len(decoded_locations), dtype=int)
        for location in np.unique(decoded_locations):
            rows = decoded_locations == location
            model = self.location_specific_models[location]
            encoded_predictions[rows] = model.predict(X_transformed[rows])

        return self.target_encoder.inverse_transform(encoded_predictions)

# Main execution: train the hierarchical model on a CSV file and report
# overall and per-location accuracy on a held-out split.
if __name__ == "__main__":
    # Load your data (replace this with your actual data loading code)
    data = pd.read_csv('your_data.csv')

    # Assuming 'TARGET' is the name of your target column and 'LOCATION' is the location column
    X = data.drop(['TARGET', 'LOCATION'], axis=1)
    y = data['TARGET']
    locations = data['LOCATION']

    # Split the data into training and testing sets, keeping the location
    # proportions the same in both splits.
    X_train, X_test, y_train, y_test, loc_train, loc_test = train_test_split(
        X, y, locations, test_size=0.2, random_state=42, stratify=locations
    )

    # Create and train the hierarchical model
    hier_model = HierarchicalModel()
    hier_model.fit(X_train, y_train, loc_train)

    # Make predictions
    y_pred = np.asarray(hier_model.predict(X_test))

    # Calculate overall accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {accuracy:.4f}")

    # Pass the full label set explicitly: without `labels`, a subset that
    # lacks some classes makes classification_report reject `target_names`
    # because the counts no longer match.
    class_labels = hier_model.target_encoder.classes_
    class_names = [str(label) for label in class_labels]

    # Generate and print overall classification report
    class_report = classification_report(
        y_test, y_pred, labels=class_labels, target_names=class_names
    )
    print("\nOverall Classification Report:")
    print(class_report)

    # Plain arrays give positional boolean masks that line up with y_pred,
    # independent of the shuffled pandas index of the test split.
    loc_test_values = np.asarray(loc_test)
    y_test_values = np.asarray(y_test)

    # Calculate and print accuracy for each location present in the test split
    for location in np.unique(loc_test_values):
        mask = loc_test_values == location
        loc_accuracy = accuracy_score(y_test_values[mask], y_pred[mask])
        print(f"\nAccuracy for {location}: {loc_accuracy:.4f}")

        loc_report = classification_report(
            y_test_values[mask], y_pred[mask],
            labels=class_labels, target_names=class_names,
        )
        print(f"Classification Report for {location}:")
        print(loc_report)