<a href="https://colab.research.google.com/github/fabio-baum/ia_para_engenheiros2/blob/main/Aula_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install and import necessary libraries
!pip install pandas numpy matplotlib seaborn scikit-learn openpyxl

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
import io

print("Libraries imported successfully!")

In [None]:
# Step 2: Upload and load the Excel file
from google.colab import files
uploaded = files.upload()

# files.upload() returns a dict keyed by uploaded file name. If it comes back
# empty there is nothing to read, so stop with a clear message rather than
# failing with an IndexError on the lookup below.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and select the Excel workbook.")

# Load the Excel file (the first uploaded file is used)
file_name = next(iter(uploaded))
df = pd.read_excel(file_name, sheet_name='Database-CdSe-V')

print("File loaded successfully!")
print(f"Dataset shape: {df.shape}")

In [None]:
# Step 3: Explore the dataset
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

print("\nFirst 5 rows:")
print(df.head())

print("\nDataset columns:")
print(df.columns.tolist())

In [None]:
# Step 4: Check for missing values and data types
# Count the missing entries per column (isna is the method isnull aliases).
missing_per_column = df.isna().sum()
print("Missing values in each column:")
print(missing_per_column)

# Show the dtype pandas inferred for every column.
column_dtypes = df.dtypes
print("\nData types:")
print(column_dtypes)

In [None]:
# Step 5: Data preprocessing - Handle missing values and select numerical columns
# Work on a copy so the raw frame stays untouched and this cell can be re-run safely.
df_clean = df.copy()

# The target must survive column pruning; fail early with a clear message if it is absent.
target_col = 'Diameter_nm'
if target_col not in df_clean.columns:
    raise KeyError(f"Target column '{target_col}' not found in the loaded sheet")

# A column is dropped when more than this fraction of its rows is missing.
max_missing_fraction = 0.5

# Collect columns that are mostly missing or hold text that cannot be parsed as numbers.
columns_to_drop = []
for col in df_clean.columns:
    if col == target_col:
        # Never drop the target here; rows with a missing target are removed below.
        continue
    if df_clean[col].isnull().sum() > len(df_clean) * max_missing_fraction:
        columns_to_drop.append(col)
    elif col != 'Citation' and not pd.api.types.is_numeric_dtype(df_clean[col]):
        # Non-numeric dtype (object or pandas string): keep the column only if
        # every value converts to a number.
        try:
            pd.to_numeric(df_clean[col], errors='raise')
        except (ValueError, TypeError):
            # Only conversion failures mean "text column"; any other error should surface.
            columns_to_drop.append(col)

df_clean = df_clean.drop(columns=columns_to_drop)
print(f"Dropped columns: {columns_to_drop}")

# Convert remaining columns to numeric, coercing errors to NaN
for col in df_clean.columns:
    if col != 'Citation':
        df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

# Drop rows where target variable (Diameter_nm) is missing
df_clean = df_clean.dropna(subset=[target_col])

print(f"Cleaned dataset shape: {df_clean.shape}")

In [None]:
# Step 6: Separate features and target variable
# Target variable: Diameter_nm
target = 'Diameter_nm'

# Remove non-feature columns
non_feature_cols = ['Citation']  # Add other non-feature columns if needed
excluded_cols = set([target] + non_feature_cols)
candidate_features = [col for col in df_clean.columns if col not in excluded_cols]

# A column with no observed value has a NaN mean, so the mean imputation below
# cannot fill it and NaNs would reach the regression. Drop such columns up front.
empty_features = [col for col in candidate_features if df_clean[col].isna().all()]
if empty_features:
    print(f"Dropping features with no usable values: {empty_features}")
features = [col for col in candidate_features if col not in empty_features]

if not features:
    raise ValueError("No usable feature columns remain after cleaning")

print("Features to use:", features)
print(f"Number of features: {len(features)}")

X = df_clean[features]
y = df_clean[target]

# Handle missing values in features by imputing with mean
# NOTE: the means are computed over all rows, including those later held out for testing.
X = X.fillna(X.mean())

print(f"Final feature matrix shape: {X.shape}")
print(f"Target variable shape: {y.shape}")

In [None]:
# Step 7: Split the data into training and testing sets
# 80/20 hold-out split with a fixed seed so the partition is reproducible.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train_rows, n_test_rows = len(X_train), len(X_test)
print(f"Training set size: {n_train_rows}")
print(f"Testing set size: {n_test_rows}")

In [None]:
# Step 8: SIMPLE LINEAR REGRESSION - Using single most correlated feature
# Find the feature most correlated with diameter.
# Only the training split is used, so the held-out test rows do not influence
# which feature is selected.
correlations = (
    X_train.corrwith(y_train)
    .abs()
    .dropna()  # constant columns have an undefined correlation
    .sort_values(ascending=False)
)
if correlations.empty:
    raise ValueError("No feature has a defined correlation with the target")
best_feature = correlations.index[0]

print("Top 5 features correlated with Diameter_nm:")
print(correlations.head())

print(f"\nSelected feature for simple linear regression: {best_feature}")

In [None]:
# Step 9: Simple Linear Regression Model
# Keep the inputs as one-column DataFrames (not Series) for the estimator.
single_feature = [best_feature]
X_simple_train = X_train[single_feature]
X_simple_test = X_test[single_feature]

# Fit on the training rows, then predict the held-out rows.
simple_model = LinearRegression().fit(X_simple_train, y_train)
y_pred_simple = simple_model.predict(X_simple_test)

slope = simple_model.coef_[0]
intercept = simple_model.intercept_
print("Simple Linear Regression Results:")
print(f"Coefficient: {slope:.4f}")
print(f"Intercept: {intercept:.4f}")

In [None]:
# Step 10: Evaluate Simple Linear Regression
def calculate_metrics(y_true, y_pred, model_name):
    """Compute, print and return regression metrics for one model.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.
    model_name : str
        Label used in the printed header.

    Returns
    -------
    dict
        Keys ``'R2'``, ``'MAE'``, ``'RMSE'`` and ``'MAPE'`` (MAPE in percent).
        MAPE is averaged over the samples whose true value is non-zero and is
        NaN when there are none.
    """
    # Work on plain float arrays so the arithmetic is positional and does not
    # depend on any pandas index carried by the inputs.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    r2 = r2_score(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))

    # Percentage error is undefined where the true value is zero; skip those
    # samples instead of producing inf/NaN for the whole metric.
    nonzero = actual != 0
    if nonzero.any():
        relative_errors = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
        mape = np.mean(np.abs(relative_errors)) * 100
    else:
        mape = np.nan

    metrics = {
        'R2': r2,
        'MAE': mae,
        'RMSE': rmse,
        'MAPE': mape
    }

    print(f"\n{model_name} Metrics:")
    print(f"R² Score: {r2:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAPE: {mape:.2f}%")

    return metrics

# Score the single-feature model on the held-out test set; the returned dict is
# kept for the model comparison cells further down.
simple_metrics = calculate_metrics(y_test, y_pred_simple, "Simple Linear Regression")

In [None]:
# Step 11: Visualize Simple Linear Regression Results
# One row of three panels: fit quality, residuals, and the raw feature/target relation.
fig_simple, (ax_fit, ax_resid, ax_feature) = plt.subplots(1, 3, figsize=(15, 5))

# Plot 1: Actual vs Predicted (dashed red line marks a perfect prediction)
target_range = [y.min(), y.max()]
ax_fit.scatter(y_test, y_pred_simple, alpha=0.6)
ax_fit.plot(target_range, target_range, 'r--', lw=2)
ax_fit.set_xlabel('Actual Diameter (nm)')
ax_fit.set_ylabel('Predicted Diameter (nm)')
ax_fit.set_title('Simple Linear Regression: Actual vs Predicted')

# Plot 2: Residuals
residuals = y_test - y_pred_simple
ax_resid.scatter(y_pred_simple, residuals, alpha=0.6)
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set_xlabel('Predicted Values')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title('Residual Plot')

# Plot 3: Feature vs Target
feature_values = X_simple_test[best_feature]
ax_feature.scatter(feature_values, y_test, alpha=0.6, label='Actual')
ax_feature.scatter(feature_values, y_pred_simple, alpha=0.6, label='Predicted')
ax_feature.set_xlabel(best_feature)
ax_feature.set_ylabel('Diameter (nm)')
ax_feature.legend()
ax_feature.set_title(f'{best_feature} vs Diameter')

fig_simple.tight_layout()
plt.show()

In [None]:
# Step 12: MULTIPLE LINEAR REGRESSION
# Learn the scaling parameters from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit on every (scaled) feature and predict the held-out rows.
multi_model = LinearRegression().fit(X_train_scaled, y_train)
y_pred_multi = multi_model.predict(X_test_scaled)

n_features_used = len(features)
print("Multiple Linear Regression Model Trained!")
print(f"Number of features used: {n_features_used}")

In [None]:
# Step 13: Evaluate Multiple Linear Regression
# Score the all-features model on the same held-out test set as the simple
# model so the two metric dicts are directly comparable.
multi_metrics = calculate_metrics(y_test, y_pred_multi, "Multiple Linear Regression")

In [None]:
# Step 14: Compare Model Performance
# One column per model, one row per metric.
metrics_by_model = {
    'Simple Linear Regression': simple_metrics,
    'Multiple Linear Regression': multi_metrics,
}
metrics_comparison = pd.DataFrame(metrics_by_model)

print("Model Performance Comparison:")
print(metrics_comparison)

In [None]:
# Step 15: Visualize Multiple Linear Regression Results
fig = plt.figure(figsize=(15, 5))

# Plot 1: Actual vs Predicted (dashed red line marks a perfect prediction)
plt.subplot(1, 3, 1)
plt.scatter(y_test, y_pred_multi, alpha=0.6)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--', lw=2)
plt.xlabel('Actual Diameter (nm)')
plt.ylabel('Predicted Diameter (nm)')
plt.title('Multiple Linear Regression: Actual vs Predicted')

# Plot 2: Residuals
plt.subplot(1, 3, 2)
residuals_multi = y_test - y_pred_multi
plt.scatter(y_pred_multi, residuals_multi, alpha=0.6)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')

# Plot 3: Comparison of both models
# Draw on the third panel of this figure. Calling plt.subplots() here would
# open a second figure and leave this panel empty.
ax = plt.subplot(1, 3, 3)
models = ['Simple LR', 'Multiple LR']
r2_scores = [simple_metrics['R2'], multi_metrics['R2']]
mae_scores = [simple_metrics['MAE'], multi_metrics['MAE']]

# Grouped bars: R² to the left and MAE to the right of each model's tick.
x = np.arange(len(models))
width = 0.35

rects1 = ax.bar(x - width/2, r2_scores, width, label='R²', alpha=0.7)
rects2 = ax.bar(x + width/2, mae_scores, width, label='MAE', alpha=0.7)

ax.set_ylabel('Scores')
ax.set_title('Model Comparison: R² and MAE')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Step 16: Feature Importance Analysis
# Rank features by the magnitude of their fitted coefficient while keeping the
# signed value alongside it.
coefficients = multi_model.coef_
feature_importance = (
    pd.DataFrame({'Feature': features, 'Coefficient': coefficients})
    .assign(Absolute_Coefficient=np.abs(coefficients))
    .sort_values('Absolute_Coefficient', ascending=False)
)

print("Top 10 Most Important Features:")
print(feature_importance.head(10))

In [None]:
# Step 17: Visualize Feature Importance
top_features = feature_importance.head(10)

# Two stacked panels: signed coefficients on top, magnitudes below.
fig_importance, panel_axes = plt.subplots(2, 1, figsize=(10, 8))
panel_specs = [
    ('Coefficient', 'Coefficient Value', 'Top 10 Feature Coefficients'),
    ('Absolute_Coefficient', 'Absolute Coefficient Value',
     'Top 10 Feature Importance (Absolute Coefficients)'),
]
for panel_ax, (column, x_label, panel_title) in zip(panel_axes, panel_specs):
    panel_ax.barh(top_features['Feature'], top_features[column])
    panel_ax.set_xlabel(x_label)
    panel_ax.set_title(panel_title)
    # Put the highest-ranked feature at the top of the chart.
    panel_ax.invert_yaxis()

fig_importance.tight_layout()
plt.show()

In [None]:
# Step 18: Detailed Error Analysis
print("Detailed Error Analysis:")

# Side-by-side table of the true values and both models' predictions.
results_df = pd.DataFrame({
    'Actual': y_test.values,
    'Predicted_Simple': y_pred_simple,
    'Predicted_Multiple': y_pred_multi
})

# Signed errors first, then their magnitudes, so the column order is
# Error_* followed by Absolute_Error_*.
model_suffixes = ('Simple', 'Multiple')
for suffix in model_suffixes:
    results_df[f'Error_{suffix}'] = results_df['Actual'] - results_df[f'Predicted_{suffix}']
for suffix in model_suffixes:
    results_df[f'Absolute_Error_{suffix}'] = results_df[f'Error_{suffix}'].abs()

print("\nSample of Predictions and Errors:")
print(results_df.head(10))

print("\nSummary Statistics for Errors:")
section_headers = {
    'Simple': "Simple Linear Regression:",
    'Multiple': "\nMultiple Linear Regression:",
}
for suffix in model_suffixes:
    mean_abs_error = results_df[f'Absolute_Error_{suffix}'].mean()
    error_std = results_df[f'Error_{suffix}'].std()
    print(section_headers[suffix])
    print(f"Mean Absolute Error: {mean_abs_error:.4f}")
    print(f"Std of Errors: {error_std:.4f}")

In [None]:
# Step 19: Final Summary
print("="*60)
print("FINAL REGRESSION ANALYSIS SUMMARY")
print("="*60)

print("\nDataset Overview:")
print(f"Total samples: {len(df)}")
# Rows with a missing target were removed during cleaning, so also report how
# many rows the models actually saw.
print(f"Samples used for modelling: {len(X)}")
print(f"Features used: {len(features)}")
print("Target variable: Diameter_nm")

print(f"\nBest Single Feature: {best_feature}")
print(f"Correlation with target: {correlations[best_feature]:.4f}")

print("\nPerformance Summary:")
print(f"{'Metric':<15} {'Simple LR':<15} {'Multiple LR':<15} {'Improvement':<15}")
print("-" * 60)
for metric in ['R2', 'MAE', 'RMSE', 'MAPE']:
    simple_val = simple_metrics[metric]
    multi_val = multi_metrics[metric]

    # R² improves when it goes up; the error metrics improve when they go down.
    gain = (multi_val - simple_val) if metric == 'R2' else (simple_val - multi_val)
    # Normalise by the magnitude of the baseline so a negative baseline R² does
    # not flip the sign, and avoid dividing by a zero baseline.
    improvement = gain / abs(simple_val) * 100 if simple_val != 0 else float('nan')

    print(f"{metric:<15} {simple_val:<15.4f} {multi_val:<15.4f} {improvement:>13.1f}%")

print("\nConclusion:")
if multi_metrics['R2'] > simple_metrics['R2']:
    print("✓ Multiple Linear Regression performs better than Simple Linear Regression")
else:
    print("○ Simple Linear Regression performs similarly or better than Multiple Linear Regression")

print(f"\nThe best model explains {max(simple_metrics['R2'], multi_metrics['R2'])*100:.1f}% of the variance in nanoparticle diameter.")