In [None]:
%%time
import os
import subprocess
import pandas as pd
import shutil
from shutil import copyfile
from iGAM import run_iGAM
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from interpret.glassbox import ExplainableBoostingRegressor
from interpret import show
from interpret.perf import RegressionPerf
from tqdm import tqdm
from shutil import copyfile
import numpy as np
import matplotlib.pyplot as plt
import pickle
import psutil
import ast
import time
from interpret import set_visualize_provider
from interpret.provider import InlineProvider
from scipy.stats import pearsonr
from minepy import MINE
from matplotlib.lines import Line2D
import matplotlib
import matplotlib.patches
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import logging
# Set up logging: configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)
# Register interpret's InlineProvider as the visualization provider -- presumably
# so that interpret's `show(...)` output renders inside the notebook cell
# (NOTE(review): confirm against the interpret documentation).
set_visualize_provider(InlineProvider())
# Programmatic form of the `%config IPCompleter.greedy=True` line magic.
# `get_ipython` is only defined when running under IPython/Jupyter, so this
# line raises NameError in a plain Python interpreter.
get_ipython().run_line_magic('config', 'IPCompleter.greedy=True')
#from IPython.display import display, SVG

def create_and_copy_data(folder_name, data, split=None, source_path=None):
    """Create ``./<folder_name>`` and populate it with the dataset files.

    The dataset file is always copied into the folder as ``Total_set.csv``.
    When ``split`` is given, its two parts are additionally written as
    ``stratified_train_sets.csv`` and ``stratified_test_sets.csv`` (without
    the index column).

    Args:
        folder_name: Directory to create, joined onto ``"."``. An existing
            directory is reused.
        data: The full dataset. Not used by this function; kept so existing
            callers keep working.
        split: Optional ``(train, test)`` pair of objects exposing
            ``to_csv``. A falsy value (the default ``None``) means only the
            full dataset file is copied.
        source_path: Path of the dataset file to copy. Defaults to the
            module-level ``data_path``, which is what this function always
            used before the parameter existed.
    """
    if source_path is None:
        # Resolved at call time, so `data_path` only needs to exist by the
        # time the function is first called, not when it is defined.
        source_path = data_path

    folder_path = os.path.join(".", folder_name)
    os.makedirs(folder_path, exist_ok=True)

    if split:
        train, test = split
        train.to_csv(os.path.join(folder_path, "stratified_train_sets.csv"), index=False)
        test.to_csv(os.path.join(folder_path, "stratified_test_sets.csv"), index=False)

    # Every folder receives the full dataset, with or without a split.
    copyfile(source_path, os.path.join(folder_path, "Total_set.csv"))

# Read the full dataset from the current working directory.
data_path = os.path.join(os.getcwd(), "Total_set.csv")
data = pd.read_csv(data_path)

# Five shuffled folds with a fixed seed, stratified on the 'Bulk' column.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
stratified_splits = list(skf.split(data, data['Bulk']))

# Map each output folder to its (train, test) frames. "Random_CV" carries no
# pre-computed split, so it only receives the full dataset file.
folder_splits = {"Random_CV": None}
for i, (train_rows, test_rows) in enumerate(stratified_splits):
    fold_frames = (data.iloc[train_rows], data.iloc[test_rows])
    folder_splits[f"Stratified_NCV_{i+1}"] = fold_frames

# Materialise every folder on disk.
for folder_name, split in folder_splits.items():
    create_and_copy_data(folder_name, data, split)

# Run the imported iGAM pipeline once inside each prepared subdirectory.
original_dir = os.getcwd()

for subdir in folder_splits:
    print(f"Processing directory: {subdir}")
    # run_iGAM() takes no arguments, so it presumably reads its inputs from
    # the current working directory (NOTE(review): confirm in the iGAM module).
    os.chdir(subdir)
    try:
        run_iGAM()
    except subprocess.CalledProcessError as e:
        print(f"Error running notebook in {subdir}: {e}")
        # `output` is None when the failed command's output was not captured,
        # and already a str when it was captured in text mode.
        output = e.output
        if isinstance(output, bytes):
            output = output.decode("utf-8", errors="replace")
        if output is not None:
            print(output)  # Print the error output
    finally:
        # Always return to the starting directory, even when run_iGAM raises
        # something other than CalledProcessError; otherwise the next
        # relative `os.chdir` would start from the wrong place.
        os.chdir(original_dir)

    
# Post-processing of the results
def extract_mae_values(filepath):
    """Read the train and test MAE from a model performance file.

    The values are taken from fixed positions: the fifth line holds the train
    MAE and the sixth line the test MAE, each as the text between the first
    and second ``:`` on that line.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A ``(train_mae, test_mae)`` tuple of floats.
    """
    with open(filepath, "r") as handle:
        rows = handle.readlines()

    def _number_after_colon(row):
        # "label: value" -> value as float
        return float(row.split(":")[1].strip())

    return _number_after_colon(rows[4]), _number_after_colon(rows[5])

def extract_hyperparameters(filepath):
    """Safely extract hyperparameters from the model performance file.

    Looks for the first line containing ``Best hyperparameters:`` and parses
    the text after that marker as a Python literal.

    Args:
        filepath: Path of the model performance text file.

    Returns:
        The parsed literal (a dict in the expected file format). An empty
        dict is returned, after printing a message, when the marker line is
        missing or its content cannot be parsed.
    """
    marker = "Best hyperparameters:"
    with open(filepath, "r") as file:
        # Stop reading at the first matching line; None when there is none.
        best_hyper_line = next((line for line in file if marker in line), None)

    if best_hyper_line is None:
        print(f"Error parsing hyperparameters from {filepath}: no '{marker}' line found")
        return {}

    # Extracting only the dictionary part
    dict_content = best_hyper_line.split(marker)[1].strip()
    try:
        # literal_eval only accepts Python literals, so nothing is executed.
        return ast.literal_eval(dict_content)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        print(f"Error parsing hyperparameters from {filepath}: {e}")
        return {}

# List all subdirectories in the current working directory. Sorted because
# os.listdir returns entries in arbitrary order, and the order decides which
# fold wins when two folds share the lowest test MAE.
subdirectories = sorted(d for d in os.listdir('.') if os.path.isdir(d))

# Collection folder for the per-fold best models.
bagging_models_dir = os.path.join(original_dir, "bagging_models")
os.makedirs(bagging_models_dir, exist_ok=True)

# Iterating through the subdirectories and extracting MAE values
NCV_train_maes = []
NCV_test_maes = []
performance_data = {}
best_hyper = {}

for subdir in subdirectories:
    results_dir = os.path.join(subdir, "results", "best_model_performance")
    performance_file = os.path.join(results_dir, "model_performance.txt")
    model_file = os.path.join(results_dir, "best_model.pkl")
    is_fold_with_model = "Stratified" in subdir and os.path.isfile(model_file)

    if not os.path.exists(performance_file):
        # Without a performance file there are no MAE values for this folder.
        # Skip it rather than reuse the values left over from the previous
        # iteration (or hit a NameError on the first one).
        if is_fold_with_model:
            print(f"Skipping {subdir}: {performance_file} is missing")
        continue

    train_mae, test_mae = extract_mae_values(performance_file)
    performance_data[subdir] = {"train_mae": train_mae, "test_mae": test_mae}

    if is_fold_with_model:
        print(f"{model_file} exists")
        NCV_train_maes.append(train_mae)
        NCV_test_maes.append(test_mae)
        target_path = os.path.join(bagging_models_dir, os.path.basename(subdir))
        os.makedirs(target_path, exist_ok=True)
        shutil.copy(model_file, target_path)
        # Keep the hyperparameters of the fold with the lowest test MAE so
        # far; on a tie the later fold wins.
        if test_mae == min(NCV_test_maes):
            best_hyper = extract_hyperparameters(performance_file)

print(f"The best hyperparameters 5-fold Nested CVs is {best_hyper}")

# Directory for the final model based on the whole dataset. Only the directory
# is created here; nothing is trained or saved in this section.
final_model_dir = f"{original_dir}/final_model_dir"
os.makedirs(final_model_dir, exist_ok=True)

# Calculating average values for train_maes and test_maes. When no fold
# results were collected the averages are NaN instead of raising
# ZeroDivisionError.
if not NCV_train_maes or not NCV_test_maes:
    print("No Stratified NCV results were collected; average MAEs are NaN")
average_NCV_train_mae = (
    sum(NCV_train_maes) / len(NCV_train_maes) if NCV_train_maes else float("nan")
)
average_NCV_test_mae = (
    sum(NCV_test_maes) / len(NCV_test_maes) if NCV_test_maes else float("nan")
)
print(f"average_NCV_train_mae is {average_NCV_train_mae}")
print(f"average_NCV_test_mae is {average_NCV_test_mae}")

# Left-to-right order of the folders on the x-axis.
desired_order = [
    "Random_CV",
    "Stratified_NCV_1",
    "Stratified_NCV_2",
    "Stratified_NCV_3",
    "Stratified_NCV_4",
    "Stratified_NCV_5",
]

# Pull the MAE values out of performance_data in that order; folders without
# an entry are represented by NaN.
_no_result = {"train_mae": np.nan, "test_mae": np.nan}
_records = [performance_data.get(name, _no_result) for name in desired_order]
ordered_train_maes = [record["train_mae"] for record in _records]
ordered_test_maes = [record["test_mae"] for record in _records]

# Tick labels, bar width and one bar-group position per folder.
x = desired_order
width = 0.4
ind = np.arange(len(x))

# Side-by-side bars: train to the left of each tick, test to the right.
for offset, values, bar_label in (
    (-width/2, ordered_train_maes, 'Train MAE'),
    (width/2, ordered_test_maes, 'Test MAE'),
):
    plt.bar(ind + offset, values, width=width, label=bar_label, align='center')

# Dashed horizontal lines at the average nested-CV MAE values.
train_avg_label = f'Average NCV Train MAE: {average_NCV_train_mae:.2f}'
test_avg_label = f'Average NCV Test MAE: {average_NCV_test_mae:.2f}'
plt.axhline(average_NCV_train_mae, color='blue', linestyle='--', label=train_avg_label)
plt.axhline(average_NCV_test_mae, color='orange', linestyle='--', label=test_avg_label)

# Axis labels, title, legend and rotated tick labels.
plt.xlabel('Samplings and CVs')
plt.ylabel('MAE (eV)')
plt.title('MAE performance across different samplings and CVs')
plt.legend()
plt.xticks(ind, x, rotation=45)
plt.tight_layout()

# Write the figure as PNG, SVG and PDF, then display it.
for filename, options in (
    ('ordered_performance_graph_600dpi.png', {'dpi': 600}),
    ('ordered_performance_graph.svg', {'format': 'svg'}),
    ('ordered_performance_graph.pdf', {'format': 'pdf'}),
):
    plt.savefig(filename, **options)
plt.show()
