<a href="https://colab.research.google.com/github/fabio-baum/ia_para_engenheiros2/blob/main/Aula_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the dataset
# On Google Colab, prompt for an upload so the CSV lands in the working
# directory. The google.colab package only exists inside Colab, so anywhere
# else the import fails; in that case skip the prompt and leave `uploaded`
# empty (the CSV must then already be present next to the notebook).
try:
    from google.colab import files
except ImportError:
    uploaded = {}
else:
    uploaded = files.upload()

In [None]:
# Parse the quantum-dot CSV from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='Quantum_dot_data.csv')

In [None]:
# Quick look at the data: (rows, columns) first, then the leading five rows
print("Dataset Shape:", (len(df.index), len(df.columns)))
print("\nFirst 5 rows:", df.head(n=5), sep="\n")

In [None]:
# List every column header as a plain Python list
print("\nColumn names:", df.columns.to_list(), sep="\n")

In [None]:
# Number of rows per distinct value of the 'Compound' column
print("\nCompound distribution:", df.loc[:, 'Compound'].value_counts(), sep="\n")

In [None]:
# Data preprocessing
# Count the missing (NaN) entries in each column
print("\nMissing values:", df.isna().sum(), sep="\n")

In [None]:
# Replace NaNs in every numeric column with that column's mean.
# NOTE(review): the means are taken over all rows of df, so rows that later
# land in the test split also contribute to the imputed values — consider
# imputing after the train/test split using training-set means only.
numeric_columns = df.select_dtypes(include='number').columns
df[numeric_columns] = df[numeric_columns].fillna(value=df[numeric_columns].mean())

In [None]:
# Separate the label column from the rest: 'Compound' is the target,
# every remaining column is used as a feature
y = df['Compound']
X = df.drop(columns=['Compound'])

In [None]:
# Encode the target as 0/1: rows labelled 'CdSe' become 1 and every other
# label becomes 0 (intended as a CdSe vs ZnSe binary problem)
y_binary = y.eq('CdSe').astype(int)

In [None]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both partitions, and the fixed seed makes the
# split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_binary,
    test_size=0.2,
    stratify=y_binary,
    random_state=42,
)

In [None]:
# Standardize the features: learn the scaling statistics from the training
# rows only, then apply that same transform to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Logistic Regression
# Fit a logistic-regression classifier on the standardized training features
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Score the held-out rows: column 1 of predict_proba is the probability of
# class 1, and predict gives the hard 0/1 labels
y_pred_proba_lr = lr_model.predict_proba(X_test_scaled)[:, 1]
y_pred_lr = lr_model.predict(X_test_scaled)

In [None]:
# Test-set metrics for Logistic Regression: the confusion matrix and accuracy
# use the hard labels, the AUC uses the class-1 probabilities
lr_cm = confusion_matrix(y_true=y_test, y_pred=y_pred_lr)
lr_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
lr_auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba_lr)

In [None]:
# Unpack the 2x2 confusion matrix row by row: first row is actual class 0
# (tn, fp), second row is actual class 1 (fn, tp)
(lr_tn, lr_fp), (lr_fn, lr_tp) = lr_cm
lr_specificity = lr_tn / (lr_fp + lr_tn)  # share of actual negatives predicted negative
lr_sensitivity = lr_tp / (lr_fn + lr_tp)  # share of actual positives predicted positive

In [None]:
# Report the Logistic Regression metrics, one per line, to four decimals
print(
    f"Accuracy: {lr_accuracy:.4f}",
    f"Sensitivity (Recall): {lr_sensitivity:.4f}",
    f"Specificity: {lr_specificity:.4f}",
    f"AUC: {lr_auc:.4f}",
    sep="\n",
)

In [None]:
# Raw Logistic Regression confusion matrix (rows: actual, columns: predicted)
print("\nConfusion Matrix:", lr_cm, sep="\n")

In [None]:
# Linear Discriminant Analysis
# Fit an LDA classifier (default settings) on the standardized training features
lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Score the held-out rows with LDA: class-1 probabilities first, then the
# hard 0/1 labels
y_pred_proba_lda = lda_model.predict_proba(X_test_scaled)[:, 1]
y_pred_lda = lda_model.predict(X_test_scaled)

In [None]:
# Test-set metrics for LDA: the confusion matrix and accuracy use the hard
# labels, the AUC uses the class-1 probabilities
lda_cm = confusion_matrix(y_true=y_test, y_pred=y_pred_lda)
lda_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_lda)
lda_auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba_lda)

In [None]:
# Unpack the 2x2 confusion matrix row by row: first row is actual class 0
# (tn, fp), second row is actual class 1 (fn, tp)
(lda_tn, lda_fp), (lda_fn, lda_tp) = lda_cm
lda_specificity = lda_tn / (lda_fp + lda_tn)  # share of actual negatives predicted negative
lda_sensitivity = lda_tp / (lda_fn + lda_tp)  # share of actual positives predicted positive

In [None]:
# Report the LDA metrics, one per line, to four decimals
print(
    f"Accuracy: {lda_accuracy:.4f}",
    f"Sensitivity (Recall): {lda_sensitivity:.4f}",
    f"Specificity: {lda_specificity:.4f}",
    f"AUC: {lda_auc:.4f}",
    sep="\n",
)

In [None]:
# Raw LDA confusion matrix (rows: actual, columns: predicted)
print("\nConfusion Matrix:", lda_cm, sep="\n")

In [None]:
# Collect both models' test metrics into one table, one row per model
comparison_df = pd.DataFrame(
    [
        ('Logistic Regression', lr_accuracy, lr_sensitivity, lr_specificity, lr_auc),
        ('Linear Discriminant Analysis', lda_accuracy, lda_sensitivity, lda_specificity, lda_auc),
    ],
    columns=['Model', 'Accuracy', 'Sensitivity', 'Specificity', 'AUC'],
)

In [None]:
# Show the side-by-side metric table for both classifiers
print(comparison_df)

In [None]:
# Plot ROC curves
# Both classifiers are drawn on one set of axes, with the diagonal marking a
# random classifier as the reference line
fig, ax = plt.subplots(figsize=(10, 8))

# ROC points (false-positive rate, true-positive rate) from the class-1
# probabilities; the thresholds returned third are not needed here
fpr_lr, tpr_lr, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_lr)
fpr_lda, tpr_lda, _ = roc_curve(y_true=y_test, y_score=y_pred_proba_lda)

ax.plot(fpr_lr, tpr_lr, linewidth=2, label=f'Logistic Regression (AUC = {lr_auc:.3f})')
ax.plot(fpr_lda, tpr_lda, linewidth=2, label=f'LDA (AUC = {lda_auc:.3f})')
ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves for Compound Classification')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Plot confusion matrices side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Tick labels in matrix order: index 0 is the negative class, index 1 is CdSe
class_labels = ['ZnSe', 'CdSe']

# One (axes, matrix, title) entry per panel; left is Logistic Regression,
# right is LDA
panels = [
    (ax1, lr_cm, 'Logistic Regression\nConfusion Matrix'),
    (ax2, lda_cm, 'Linear Discriminant Analysis\nConfusion Matrix'),
]

for panel_ax, matrix, panel_title in panels:
    # Annotated integer-count heatmap, drawn and labelled identically per panel
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Predicted')
    panel_ax.set_ylabel('Actual')
    panel_ax.set_xticklabels(class_labels)
    panel_ax.set_yticklabels(class_labels)

plt.tight_layout()
plt.show()