# In [None]:
# Refactored Code with Improved Structure and Cleaned-Up Components

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, f1_score, matthews_corrcoef, average_precision_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from joblib import Parallel, delayed
from imblearn.over_sampling import SMOTE, SMOTEN
from sklearn.preprocessing import LabelEncoder
import os

# Module-level LabelEncoder instance, created once when the module is loaded.
le = LabelEncoder()

# Placeholder locations -- replace with real (ideally configuration-based)
# paths before running.
# DATA_PATH: directory that holds the input CSV files.
# OUTPUT_PATH: directory into which the result CSV files are written.
DATA_PATH = "path/to/data_directory"
OUTPUT_PATH = "path/to/output_directory"

# Function for processing city-year data
def process_city_year(city, start_year, end_year):
    """
    Builds the resampled training set and the held-out test set for one city and year pair.

    Two CSV files are read from DATA_PATH, named
    "<city lower-cased>_cd_<year>_tech.csv" for start_year and end_year.
    Every cell is binarised (1 if the value is > 0, else 0) and the frames are
    transposed, so each original column becomes one sample:

    - features: data rows at positions 1-49 (0-based) of the start-year file;
    - label: data row 0 of the end-year file.

    The samples are split 67/33 (random_state=7), the training part is
    oversampled with SMOTEN, and the labels are label-encoded.

    Parameters:
    - city (str): The city name (lower-cased when building the file names).
    - start_year (int): Year of the file that supplies the features.
    - end_year (int): Year of the file that supplies the labels.

    Returns:
    - tuple: (city, start_year, end_year, X_train_smote, X_test, y_train_smote, y_test),
      or None if a file is missing or any other error occurs (a message is printed).
    """
    try:
        # Construct file paths
        city_key = city.lower()
        file_path_start = os.path.join(DATA_PATH, f"{city_key}_cd_{start_year}_tech.csv")
        file_path_end = os.path.join(DATA_PATH, f"{city_key}_cd_{end_year}_tech.csv")

        # Read data files
        df_start = pd.read_csv(file_path_start)
        df_end = pd.read_csv(file_path_end)

        # Give both frames the same IPC1..IPCn column names. Assigning the
        # start-year names to the end-year frame also fails loudly (ValueError,
        # handled below) when the two files do not have the same column count.
        new_column_names = [f'IPC{i}' for i in range(1, len(df_start.columns) + 1)]
        df_start.columns = new_column_names
        df_end.columns = new_column_names

        # Binary encoding: 1 where the value is positive, 0 otherwise. A
        # vectorised comparison replaces the element-wise DataFrame.applymap,
        # which is deprecated in recent pandas releases.
        df_train = (df_start > 0).astype(int)
        df_test = (df_end > 0).astype(int)

        # Split data: after transposing, rows are the IPC columns (samples).
        X = df_train.T.values[:, 1:50]
        Y = df_test.T.values[:, 0]
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=7)

        # Oversample the training split only, using SMOTEN
        smote = SMOTEN(random_state=7)
        X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

        # Label encoding with a per-call encoder, so that calls do not share
        # (and overwrite) fitted state through a module-level object.
        encoder = LabelEncoder()
        y_train_smote = encoder.fit_transform(y_train_smote)
        y_test = encoder.transform(y_test)

        return city, start_year, end_year, X_train_smote, X_test, y_train_smote, y_test

    except FileNotFoundError:
        print(f"File not found for {city} in years {start_year} or {end_year}. Skipping...")
        return None
    except Exception as e:
        # Deliberate best-effort: one bad city/year pair must not abort the batch.
        print(f"An error occurred for {city} in years {start_year} and {end_year}: {e}")
        return None

# Function for hyperparameter tuning
def tune_hyperparameters(data, param_grid):
    """
    Tunes a RandomForestClassifier with GridSearchCV and scores it on the test split.

    The grid search uses 5-fold cross-validation with 'f1_weighted' scoring on
    the training data; the best estimator found is then evaluated on the test data.

    Parameters:
    - data (tuple): (city, start_year, end_year, X_train_smote, X_test,
      y_train_smote, y_test) as returned by process_city_year.
    - param_grid (dict): Hyperparameter grid for RandomForestClassifier.

    Returns:
    - dict: city, start_year, end_year, accuracy, precision, f1_score,
      matthews_corrcoef, classification_error, pr_auc and the best parameters;
      or None if any error occurs (a message is printed).
    """
    # Fallback labels so the error message below can always be formatted, even
    # when `data` is malformed and the unpacking itself is what fails.
    city = start_year = end_year = "unknown"
    try:
        city, start_year, end_year, X_train_smote, X_test, y_train_smote, y_test = data

        grid_search = GridSearchCV(
            RandomForestClassifier(random_state=7),
            param_grid, cv=5, scoring='f1_weighted', n_jobs=4,
        )
        grid_search.fit(X_train_smote, y_train_smote)

        best_param = grid_search.best_params_

        # GridSearchCV refits the best configuration on the full training set
        # by default (refit=True), with the same random_state, so the fitted
        # best_estimator_ is reused instead of training an identical model again.
        model = grid_search.best_estimator_
        y_pred = model.predict(X_test)

        # Metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        # zero_division=0 matches precision_score above.
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        matthews = matthews_corrcoef(y_test, y_pred)
        # Uses the predicted probability of the second class (column 1).
        pr_auc = average_precision_score(y_test, model.predict_proba(X_test)[:, 1])

        # Result dictionary
        return {
            'city': city, 'start_year': start_year, 'end_year': end_year,
            'accuracy': accuracy, 'precision': precision, 'f1_score': f1,
            'matthews_corrcoef': matthews, 'classification_error': 1 - accuracy,
            'pr_auc': pr_auc, **best_param
        }

    except Exception as e:
        # Deliberate best-effort: one failing city/year pair must not abort the batch.
        print(f"An error occurred during hyperparameter tuning for {city} in years {start_year} and {end_year}: {e}")
        return None

# Main processing routine
def main(cities, years, param_grid):
    """
    Main routine for processing city-year data, hyperparameter tuning, and saving results.

    For every start year in `years` and every city, the pair
    (start_year, start_year + 5) is preprocessed and tuned in parallel.
    Pairs that fail in either stage are dropped. Two files are written to
    OUTPUT_PATH (created if it does not exist):

    - results.csv: one row of metrics and best parameters per city/year pair;
    - avg_results.csv: per-city mean of the numeric hyperparameters and the
      most frequent 'bootstrap' value.

    If no pair produces a result, nothing is written.

    Parameters:
    - cities (list): List of cities.
    - years (range): Range of start years to consider.
    - param_grid (dict): Hyperparameter grid for RandomForestClassifier.
    """
    # Process data in parallel
    preprocessed_data = Parallel(n_jobs=8, verbose=10)(
        delayed(process_city_year)(city, start_year, start_year + 5)
        for start_year in years
        for city in cities
    )
    preprocessed_data = [data for data in preprocessed_data if data is not None]

    # Hyperparameter tuning in parallel
    results = Parallel(n_jobs=8, verbose=10)(
        delayed(tune_hyperparameters)(data, param_grid) for data in preprocessed_data
    )
    results = [result for result in results if result is not None]

    # Without any result the frame would have no 'city' column and the
    # aggregation below would fail, so stop here with a clear message.
    if not results:
        print("No results were produced; nothing to save.")
        return

    # Make sure the output directory exists before writing to it.
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    # Save results
    results_df = pd.DataFrame(results)
    results_df.to_csv(os.path.join(OUTPUT_PATH, 'results.csv'), index=False)

    # Aggregate and save average results. Only hyperparameters that actually
    # appear in the results are summarised, so a grid that omits one of them
    # does not break the aggregation.
    mean_columns = ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf']
    aggregations = {column: 'mean' for column in mean_columns}
    aggregations['bootstrap'] = lambda x: x.mode()[0]  # Mode as the most frequent value
    aggregations = {
        column: how for column, how in aggregations.items() if column in results_df.columns
    }
    if not aggregations:
        print("None of the summarised hyperparameters are present in the results; skipping avg_results.csv.")
        return

    # A best value may be None (e.g. max_depth); coerce to numeric so that
    # 'mean' always operates on a numeric column and skips missing entries.
    summary_input = results_df.copy()
    for column in mean_columns:
        if column in summary_input.columns:
            summary_input[column] = pd.to_numeric(summary_input[column], errors='coerce')

    avg_results_df = summary_input.groupby('city').agg(aggregations).reset_index()
    avg_results_df.to_csv(os.path.join(OUTPUT_PATH, 'avg_results.csv'), index=False)

# Hyperparameter grid for tuning. Each key is a RandomForestClassifier
# constructor argument and each list holds the candidate values for it;
# the grid spans 3 * 4 * 3 * 3 * 2 * 1 = 216 combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    'class_weight': ['balanced']
}

# Uncomment to run with actual data:
# main(cities=['City1', 'City2'], years=range(2000, 2009), param_grid=param_grid)
