# In [None]:

import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets, decomposition, manifold, metrics, model_selection, neighbors, linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import umap
from google.colab import drive

# Mount Google Drive and make sure the output folder exists; the figures
# produced by this script are written under drive_folder.
drive.mount('/content/drive')
drive_folder = '/content/drive/MyDrive/colab_results'
os.makedirs(drive_folder, exist_ok=True)

# Load MNIST from OpenML and divide the pixel values by 255
# (presumably mapping 0-255 intensities into [0, 1] -- confirm if it matters).
mnist = datasets.fetch_openml('mnist_784', version=1)
X = mnist.data / 255.0
y = mnist.target.astype(int)

# Keep a stratified 10,000-sample subset so the experiment runs quickly.
X_small, _, y_small, _ = model_selection.train_test_split(
    X, y, train_size=10000, stratify=y, random_state=42
)

# Feature representations to compare, keyed by display name.
# NOTE: every embedding is fitted once on all of X_small, i.e. before any
# train/test or cross-validation split is made on the result.
# PCA is seeded like t-SNE and UMAP so that reruns give the same projection
# even when scikit-learn picks a randomized SVD solver.
dimensionality_methods = {
    'raw': X_small,
    'PCA': decomposition.PCA(n_components=50, random_state=42).fit_transform(X_small),
    't-SNE': manifold.TSNE(n_components=2, random_state=42).fit_transform(X_small),
    'UMAP': umap.UMAP(n_components=2, random_state=42).fit_transform(X_small),
}

# Models
def get_models():
    """Build the classifiers to evaluate, keyed by display name.

    Each call constructs new, unfitted estimator objects.

    Returns:
        dict: 'kNN' maps to a KNeighborsClassifier (3 neighbours, Euclidean
        metric) and 'Linear' maps to a LogisticRegression (max_iter=1000).
    """
    knn = neighbors.KNeighborsClassifier(n_neighbors=3, metric='euclidean')
    logreg = linear_model.LogisticRegression(max_iter=1000)
    return {'kNN': knn, 'Linear': logreg}

# Collected metrics: one dict per (representation, model) pair.
results = []

# Evaluate every (representation, model) combination.
for dim_name, X_reduced in dimensionality_methods.items():
    for model_name, model in get_models().items():
        # perf_counter() is a monotonic clock, so the measured duration is
        # not affected by system clock adjustments the way time.time() is.
        start_time = time.perf_counter()
        scores = model_selection.cross_val_score(
            model, X_reduced, y_small, cv=5, scoring='accuracy'
        )
        elapsed = time.perf_counter() - start_time

        results.append({
            'Dimensionality': dim_name,
            'Model': model_name,
            'Accuracy Mean': np.mean(scores),
            'Accuracy Std': np.std(scores),
            'Time (s)': elapsed,
        })

        # Fit once on a hold-out split to draw the confusion matrix.
        # The split is stratified, like the subsampling that produced
        # X_small, so each class keeps the same share in train and test.
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            X_reduced, y_small, test_size=0.2, stratify=y_small, random_state=42
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
        disp.plot(cmap='Blues', xticks_rotation=45)
        plt.title(f"Confusion Matrix: {model_name} + {dim_name}")
        plt.tight_layout()
        plt.savefig(os.path.join(drive_folder, f'confusion_{model_name}_{dim_name}.png'), dpi=300)
        # Close the figure so figures do not pile up across iterations.
        plt.close()

# Build the results table and add accuracy expressed as a percentage.
results_df = pd.DataFrame(results)
results_df['Accuracy %'] = results_df['Accuracy Mean'] * 100

# Lower y-limit shared by both accuracy plots. It stays at 80 when every
# result (mean minus one standard deviation) is above that; otherwise it is
# lowered to a multiple of 5 below the weakest result, so that no bar or
# error bar is cut off by the axis.
lowest_accuracy = (results_df['Accuracy %'] - results_df['Accuracy Std'] * 100).min()
accuracy_floor = min(80.0, max(0.0, 5 * (lowest_accuracy // 5) - 5))

# Plot 1: accuracy
plt.figure(figsize=(10, 6))
sns.barplot(data=results_df, x='Dimensionality', y='Accuracy %', hue='Model')
plt.title('Accuracy by Dimensionality Reduction and Model')
plt.ylim(accuracy_floor, 100)
plt.tight_layout()
plt.savefig(os.path.join(drive_folder, 'accuracy_comparison.png'), dpi=300)
plt.close()

# Plot 2: time
plt.figure(figsize=(10, 6))
sns.barplot(data=results_df, x='Dimensionality', y='Time (s)', hue='Model')
plt.title('Execution Time by Dimensionality Reduction and Model')
plt.tight_layout()
plt.savefig(os.path.join(drive_folder, 'time_comparison.png'), dpi=300)
plt.close()

# Plot 3: accuracy with standard deviation
plt.figure(figsize=(10, 6))
# The loop variable is named model_label so it does not rebind the `model`
# estimator variable used by the evaluation loop.
for model_label in results_df['Model'].unique():
    subset = results_df[results_df['Model'] == model_label]
    plt.errorbar(
        subset['Dimensionality'],
        subset['Accuracy %'],
        yerr=subset['Accuracy Std'] * 100,
        label=model_label,
        capsize=5,
        marker='o',
    )
plt.title('Accuracy with Standard Deviation')
plt.ylabel('Accuracy (%)')
plt.ylim(accuracy_floor, 100)
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(drive_folder, 'accuracy_errorbar.png'), dpi=300)
plt.close()

print(f'✅ Todos os gráficos e matrizes de confusão foram salvos em {drive_folder}')
