In [1]:
import numpy as np
import os
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn imports
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Imbalanced learn imports for handling class imbalance
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

In [2]:


class AdvancedSatelliteClassifier:
    """
    Train and compare several SMOTE-balanced classifiers on per-file feature matrices.

    Each sample is expected on disk as ``features_<idx>.npy`` next to a
    ``features_<idx>_metadata.json`` file whose ``Satellite_Label`` entry is the
    class label of that sample.
    """

    def __init__(self, data_path='/home/carlos/Documents/fingerprint/features/cell_39'):
        """
        Advanced Satellite Classifier with improved techniques

        Args:
            data_path (str): Path to directory containing .npy files. The default
                is a machine-specific absolute path kept for backward
                compatibility; pass an explicit directory on any other machine.
        """
        self.data_path = data_path
        self.label_encoder = LabelEncoder()

    def load_data(self, start_idx=1, end_idx=4900):
        """
        Load and preprocess data from .npy files

        Files are looked up for every index in ``[start_idx, end_idx]``. An index
        whose .npy or metadata file is missing is skipped silently; an index that
        fails to load (unreadable file, missing ``Satellite_Label`` key, feature
        size different from the first accepted sample) is reported and skipped.

        Args:
            start_idx (int): First file index to try (inclusive).
            end_idx (int): Last file index to try (inclusive).

        Returns:
            tuple: X (2D array, one flattened feature matrix per row),
                y (encoded labels, same length as X)

        Raises:
            ValueError: If no sample at all could be loaded.
        """
        X, y = [], []
        expected_len = None  # flattened size of the first accepted sample

        for idx in range(start_idx, end_idx + 1):
            try:
                npy_file = os.path.join(self.data_path, f'features_{idx}.npy')
                metadata_file = os.path.join(self.data_path, f'features_{idx}_metadata.json')

                # Skip if files don't exist
                if not (os.path.exists(npy_file) and os.path.exists(metadata_file)):
                    continue

                # Read the label BEFORE touching X: if the key is missing the
                # sample must be dropped as a whole, otherwise X and y drift apart.
                with open(metadata_file, 'r') as f:
                    metadata = json.load(f)
                label = metadata['Satellite_Label']

                # Flatten the 2D array to 1D for classification
                flat = np.load(npy_file).flatten()

                # All rows must have the same width to form a 2D design matrix.
                if expected_len is None:
                    expected_len = flat.size
                elif flat.size != expected_len:
                    raise ValueError(
                        f"feature size {flat.size} does not match expected {expected_len}"
                    )

                # Nothing below can fail, so X and y always stay aligned.
                X.append(flat)
                y.append(label)
            except Exception as e:
                # Best effort: report the bad sample and keep loading the rest.
                print(f"Error processing file {idx}: {e}")

        if not X:
            raise ValueError(
                f"No feature files could be loaded from {self.data_path!r} "
                f"for indices {start_idx}..{end_idx}"
            )

        # Convert to numpy arrays
        X = np.array(X)

        # Encode labels
        y_encoded = self.label_encoder.fit_transform(y)

        print(f"Loaded {len(X)} feature matrices")
        print("Original Labels:", np.unique(y))
        print("Encoded Labels:", np.unique(y_encoded))

        return X, y_encoded

    @staticmethod
    def _balanced_pipeline(classifier):
        """Wrap ``classifier`` in a scale -> SMOTE -> classify imblearn pipeline."""
        return ImbPipeline([
            ('scaler', StandardScaler()),
            ('smote', SMOTE(random_state=42)),
            ('classifier', classifier)
        ])

    def create_classification_pipelines(self):
        """
        Create multiple classification pipelines with different strategies

        Every pipeline standardises the features, oversamples with SMOTE and then
        fits one classifier. Fresh scaler/SMOTE instances are created per pipeline
        so the pipelines never share fitted state.

        Note: SMOTE is used with its default neighbour count, so very small
        classes may make fitting fail; such failures are reported per pipeline by
        ``train_and_evaluate``.

        Returns:
            dict: Classification pipelines keyed by display name
        """
        classifiers = {
            'Random Forest with SMOTE': RandomForestClassifier(
                n_estimators=200,
                max_depth=10,
                min_samples_split=5,
                random_state=42
            ),
            'Gradient Boosting with SMOTE': GradientBoostingClassifier(
                n_estimators=200,
                learning_rate=0.1,
                max_depth=5,
                random_state=42
            ),
            'SVM with SMOTE': SVC(
                kernel='rbf',
                C=10,
                gamma='scale',
                random_state=42
            )
        }

        return {
            name: self._balanced_pipeline(classifier)
            for name, classifier in classifiers.items()
        }

    def train_and_evaluate(self, X, y, test_size=0.2):
        """
        Train and evaluate multiple classification pipelines

        The data is split once (stratified, fixed seed). Each pipeline is
        cross-validated on the training part, refitted on the whole training part
        and scored on the held-out test part. A pipeline that raises is reported
        and left out of the returned dict.

        Args:
            X (np.array): Features
            y (np.array): Encoded labels (as produced by ``load_data``)
            test_size (float): Proportion of test set

        Returns:
            dict: Performance metrics for each pipeline, with keys 'accuracy',
                'cross_val_scores', 'mean_cv_score', 'classification_report'
                and 'confusion_matrix'
        """
        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        # Create pipelines
        pipelines = self.create_classification_pipelines()

        # Store results
        results = {}

        # Evaluate each pipeline
        for name, pipeline in pipelines.items():
            try:
                # Perform cross-validation
                cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)

                # Fit on training data
                pipeline.fit(X_train, y_train)

                # Predict on test data
                y_pred = pipeline.predict(X_test)

                # Decode predictions back to original labels
                original_y_test = self.label_encoder.inverse_transform(y_test)
                original_y_pred = self.label_encoder.inverse_transform(y_pred)

                # Use the full encoded label set so every confusion matrix has
                # one row/column per known class, whatever the split contains.
                all_labels = np.arange(len(self.label_encoder.classes_))

                # Store results
                results[name] = {
                    'accuracy': accuracy_score(y_test, y_pred),
                    'cross_val_scores': cv_scores,
                    'mean_cv_score': cv_scores.mean(),
                    # zero_division=0 yields the same numbers as the default but
                    # without a warning for every class that is never predicted.
                    'classification_report': classification_report(
                        original_y_test, original_y_pred, zero_division=0
                    ),
                    'confusion_matrix': confusion_matrix(
                        y_test, y_pred, labels=all_labels
                    )
                }

                # Print results
                print(f"\n--- {name} Results ---")
                print(f"Accuracy: {results[name]['accuracy']:.4f}")
                print(f"Cross-validation Scores: {cv_scores}")
                print(f"Mean CV Score: {cv_scores.mean():.4f}")

            except Exception as e:
                # Keep going so one failing model does not hide the others.
                print(f"Error with {name}: {e}")

        return results

    def plot_results(self, results):
        """
        Visualize results from different pipelines

        Draws one box plot comparing the cross-validation scores of all
        pipelines, then one confusion-matrix heatmap per pipeline. Does nothing
        (apart from a message) when ``results`` is empty.

        Args:
            results (dict): Performance results from train_and_evaluate
        """
        if not results:
            # Every pipeline failed; there is nothing meaningful to draw.
            print("No results to plot")
            return

        names = list(results.keys())

        # Plot cross-validation scores
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.set_title("Cross-Validation Scores Comparison")
        ax.boxplot([results[name]['cross_val_scores'] for name in names])
        # Label the boxes through the tick API instead of boxplot's `labels`
        # keyword, which newer Matplotlib releases deprecate.
        ax.set_xticks(list(range(1, len(names) + 1)))
        ax.set_xticklabels(names, rotation=45)
        ax.set_ylabel("Accuracy")
        fig.tight_layout()
        plt.show()
        plt.close(fig)

        # Original class names, if the encoder has been fitted.
        class_names = getattr(self.label_encoder, 'classes_', None)

        # Plot confusion matrices
        for name, result in results.items():
            matrix = result['confusion_matrix']

            # Only label the axes when the names line up with the matrix.
            tick_kwargs = {}
            if class_names is not None and len(class_names) == matrix.shape[0]:
                tick_kwargs = {'xticklabels': class_names, 'yticklabels': class_names}

            fig, ax = plt.subplots(figsize=(10, 8))
            sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax, **tick_kwargs)
            ax.set_title(f"Confusion Matrix - {name}")
            ax.set_xlabel("Predicted Label")
            ax.set_ylabel("True Label")
            fig.tight_layout()
            plt.show()
            # Release the figure so many pipelines do not pile up in memory.
            plt.close(fig)

# Main execution
def main():
    """
    Run the full workflow: load the data, train/evaluate every pipeline, plot.

    Any exception is caught so the notebook cell finishes cleanly, but the
    complete traceback is printed in addition to the short message so the
    failing step can actually be located.
    """
    # Local import keeps this function self-contained; only needed on failure paths.
    import traceback

    try:
        # Initialize classifier
        classifier = AdvancedSatelliteClassifier()

        # Load data
        X, y = classifier.load_data()

        # Train and evaluate
        results = classifier.train_and_evaluate(X, y)

        # Plot results
        classifier.plot_results(results)

    except Exception as e:
        print(f"An error occurred: {e}")
        # str(e) alone often omits the exception type and never says where it
        # was raised; the traceback provides both.
        traceback.print_exc()

# Run the script. __name__ is "__main__" both when this code is executed as a
# script and when the cell is run in a notebook kernel, so executing this cell
# triggers the whole load / train / plot workflow.
if __name__ == "__main__":
    main()

Loaded 4900 feature matrices
Original Labels: [  2   3   4   5   6   7   8   9  13  16  17  18  22  23  24  25  26  28
  29  30  33  36  38  39  40  42  43  44  46  48  49  50  51  57  65  67
  68  69  71  72  73  74  77  78  79  81  82  85  87  88  89  90  92  93
  94  96  99 103 104 107 109 110 111 112 114 115]
Encoded Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



--- Random Forest with SMOTE Results ---
Accuracy: 0.0378
Cross-validation Scores: [0.03316327 0.03954082 0.0255102  0.04209184 0.0497449 ]
Mean CV Score: 0.0380
