# Iris Dataset Exploration

This notebook provides an in-depth exploration of the Iris dataset, including:
- Data loading and basic statistics
- Feature distributions and visualizations
- Correlation analysis
- Principal component analysis (PCA)
- Model comparison
- Feature importance analysis
- 2D decision boundary visualization

In [None]:
# Import necessary libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Silence every warning category for the rest of the session so that
# library notices do not clutter the cell outputs
warnings.simplefilter('ignore')

# Plot appearance: matplotlib style sheet first, then the seaborn colour palette
plt.style.use('seaborn-v0_8')
sns.set_palette(palette="husl")

print("Libraries imported successfully!")

## 1. Data Loading and Basic Information

In [None]:
# Fetch the bundled Iris data and unpack the pieces reused throughout the notebook
iris = load_iris()
X, y = iris.data, iris.target
feature_names, target_names = iris.feature_names, iris.target_names

# Tabular view: one column per measurement, followed by the class as a
# readable label ('species') and as its integer code ('target')
df = pd.DataFrame(X, columns=feature_names).assign(
    species=list(target_names[y]),
    target=y,
)

# Quick overview of what was loaded
for label, value in (
    ("Dataset Shape:", df.shape),
    ("\nFeature Names:", feature_names),
    ("\nTarget Classes:", target_names),
):
    print(label, value)

print("\nFirst 5 rows:")
df.head(5)

In [None]:
# Headline counts, printed as one block of text
overview_lines = [
    "Dataset Info:",
    f"Total samples: {len(df)}",
    f"Features: {len(feature_names)}",
    f"Classes: {len(target_names)}",
]
print("\n".join(overview_lines))

# Per-class sample counts and per-column null counts, each under its own heading
for heading, table in (
    ("\nClass distribution:", df['species'].value_counts()),
    ("\nMissing values:", df.isnull().sum()),
):
    print(heading)
    print(table)

# The describe() frame is the cell's last expression so it renders as a rich table
print("\nBasic statistics:")
df.describe()

## 2. Data Visualization

In [None]:
# Feature distributions: overlaid per-species histograms, one subplot per feature
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Feature Distributions by Species', fontsize=16, fontweight='bold')

for ax, feature in zip(axes.flat, feature_names):
    # Compute the bin edges once from the whole column so that all species
    # share identical bins. With a bare `bins=15` each species would get its
    # own bin widths and the overlaid bar heights would not be comparable.
    bin_edges = np.histogram_bin_edges(df[feature], bins=15)

    # One semi-transparent histogram per species (no KDE curve is drawn)
    for species in target_names:
        values = df.loc[df['species'] == species, feature]
        ax.hist(values, bins=bin_edges, alpha=0.6, label=species)

    ax.set_title(f'{feature} Distribution')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Box plots: spread of each feature within each species, on a 2x2 grid
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
fig.suptitle('Feature Box Plots by Species', fontsize=16, fontweight='bold')

# axes.ravel() walks the grid row by row, matching the order of feature_names
for ax, feature in zip(axes.ravel(), feature_names):
    sns.boxplot(x='species', y=feature, data=df, ax=ax)
    ax.set_title(f'{feature} Distribution')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the four measurements (label columns excluded)
correlation_matrix = df[feature_names].corr()

# Annotated heatmap; the diverging colour map is centred on zero correlation
heatmap_options = dict(
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    center=0,
    square=True,
    cbar_kws={'shrink': 0.8},
)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Same numbers as plain text for reference
print("Correlation Matrix:", correlation_matrix, sep="\n")

In [None]:
# Pairplot for feature relationships.
# sns.pairplot creates its own figure, so no plt.figure() call is made here
# (one would only leave an empty figure in the output). `vars` restricts
# the grid to the measurement columns; otherwise the numeric 'target' column
# would be plotted as a fifth variable.
pair_grid = sns.pairplot(df, vars=feature_names, hue='species', diag_kind='hist', height=2.5)
pair_grid.figure.suptitle('Feature Pair Relationships', y=1.02, fontsize=16, fontweight='bold')
plt.show()

## 3. Principal Component Analysis (PCA)

In [None]:
# Fit both decompositions up front: a 2-component projection for the scatter
# plot and a full decomposition for the cumulative explained-variance curve
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
pca_full = PCA().fit(X)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)

# Projected coordinates together with the species label of each sample
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2']).assign(
    species=list(target_names[y])
)

plt.figure(figsize=(12, 5))

# Left panel: samples in the PC1/PC2 plane, one scatter series per species
plt.subplot(1, 2, 1)
for species in target_names:
    in_class = pca_df['species'] == species
    plt.scatter(pca_df.loc[in_class, 'PC1'], pca_df.loc[in_class, 'PC2'],
                label=species, alpha=0.7, s=50)

pc1_share, pc2_share = pca.explained_variance_ratio_
plt.xlabel(f'PC1 ({pc1_share:.1%} variance)')
plt.ylabel(f'PC2 ({pc2_share:.1%} variance)')
plt.title('PCA: First Two Principal Components')
plt.legend()
plt.grid(True, alpha=0.3)

# Right panel: cumulative share of variance against number of components kept
plt.subplot(1, 2, 2)
component_counts = range(1, len(cumulative_variance) + 1)
plt.plot(component_counts, cumulative_variance, 'bo-')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA: Explained Variance Ratio')
plt.grid(True, alpha=0.3)
plt.xticks(component_counts)

plt.tight_layout()
plt.show()

print(f"Explained variance by component: {pca.explained_variance_ratio_}")
print(f"Total explained variance (first 2 components): {sum(pca.explained_variance_ratio_):.3f}")

## 4. Model Comparison

In [None]:
# Split the data; stratify=y keeps the class proportions the same in both splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features for SVM and Logistic Regression.
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', random_state=42, probability=True),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
}

# Models that are trained on standardized features (Random Forest uses the raw ones)
models_needing_scaling = {'SVM', 'Logistic Regression'}

# Train and evaluate models
results = {}      # model name -> {'Test Accuracy', 'CV Mean', 'CV Std'}
predictions = {}  # model name -> predicted labels for the test split

for name, model in models.items():
    if name in models_needing_scaling:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        # Cross-validate scaler + model as one pipeline on the *raw* features so
        # the scaler is re-fitted inside every fold. Cross-validating on
        # X_train_scaled would leak each validation fold's statistics into the
        # preprocessing. cross_val_score clones the estimator it is given, so
        # the fitted `model` above is left untouched.
        cv_estimator = make_pipeline(StandardScaler(), model)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        cv_estimator = model

    cv_scores = cross_val_score(cv_estimator, X_train, y_train, cv=5)

    # Fraction of test samples predicted correctly
    test_accuracy = (y_pred == y_test).mean()

    results[name] = {
        'Test Accuracy': test_accuracy,
        'CV Mean': cv_scores.mean(),
        'CV Std': cv_scores.std()
    }

    predictions[name] = y_pred

# Display results (one row per model)
results_df = pd.DataFrame(results).T
print("Model Comparison Results:")
print(results_df.round(4))

In [None]:
# Visualize model performance: held-out test accuracy (left) and
# 5-fold cross-validation accuracy with +/- one standard deviation (right)


def _accuracy_axis_limits(lowest_value, default_floor=0.9, pad=0.05, ceiling=1.03):
    """Return (bottom, top) y-limits for an accuracy bar chart.

    The bottom stays at `default_floor` while every value sits comfortably
    above it, and otherwise drops to `lowest_value - pad` (never below 0) so
    that no bar is hidden. The top is slightly above 1.0 to leave room for the
    value labels drawn over the bars.
    """
    return max(0.0, min(default_floor, lowest_value - pad)), ceiling


model_names = list(models.keys())
test_accuracies = [results[name]['Test Accuracy'] for name in model_names]
cv_means = [results[name]['CV Mean'] for name in model_names]
cv_stds = [results[name]['CV Std'] for name in model_names]
bar_colors = ['skyblue', 'lightcoral', 'lightgreen']

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Test accuracy comparison
bars = axes[0].bar(model_names, test_accuracies, color=bar_colors)
axes[0].set_ylabel('Test Accuracy')
axes[0].set_title('Model Test Accuracy Comparison')
# Limits follow the data: a fixed (0.9, 1.0) window would hide any bar <= 0.9
axes[0].set_ylim(*_accuracy_axis_limits(min(test_accuracies)))
axes[0].grid(True, alpha=0.3)

# Add value labels on bars
for bar, acc in zip(bars, test_accuracies):
    axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.001,
                 f'{acc:.3f}', ha='center', va='bottom')

# Cross-validation scores with error bars
bars2 = axes[1].bar(model_names, cv_means, yerr=cv_stds,
                    color=bar_colors, capsize=5, alpha=0.8)
axes[1].set_ylabel('Cross-Validation Accuracy')
axes[1].set_title('Cross-Validation Performance (5-fold)')
# Use the bottom of the lowest error bar so that the whiskers stay visible too
lowest_cv_whisker = min(mean - std for mean, std in zip(cv_means, cv_stds))
axes[1].set_ylim(*_accuracy_axis_limits(lowest_cv_whisker))
axes[1].grid(True, alpha=0.3)

# Add value labels just above the top of each error bar
for bar, mean, std in zip(bars2, cv_means, cv_stds):
    axes[1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + std + 0.002,
                 f'{mean:.3f}±{std:.3f}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

In [None]:
# One confusion-matrix heatmap per model, side by side
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

for position, (name, y_pred) in enumerate(predictions.items()):
    panel = axes[position]
    accuracy = results[name]["Test Accuracy"]

    # Integer-annotated counts with species names on both axes
    sns.heatmap(
        confusion_matrix(y_test, y_pred),
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=target_names,
        yticklabels=target_names,
        ax=panel,
    )
    panel.set_title(f'{name}\nAccuracy: {accuracy:.3f}')
    panel.set_xlabel('Predicted')

    # Only the left-most panel carries the y-axis label
    if position == 0:
        panel.set_ylabel('Actual')

plt.tight_layout()
plt.show()

## 5. Feature Importance Analysis

In [None]:
# Fit a Random Forest on the training split and read off its feature importances
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
feature_importance = rf_model.feature_importances_

# Tick labels: drop the unit suffix and put each word on its own line
feature_names_short = ['\n'.join(name.replace(' (cm)', '').split(' ')) for name in feature_names]

# Bar chart of the importances
plt.figure(figsize=(10, 6))
bars = plt.bar(feature_names_short, feature_importance, color='skyblue', alpha=0.8)
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance (Random Forest)', fontweight='bold')
plt.grid(True, alpha=0.3)

# Write each importance value just above its bar
for rect, score in zip(bars, feature_importance):
    label_x = rect.get_x() + rect.get_width() / 2
    label_y = rect.get_height() + 0.005
    plt.text(label_x, label_y, f'{score:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Ranking table, most important feature first
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values('Importance', ascending=False)

print("Feature Importance Ranking:")
ranked = zip(importance_df['Feature'], importance_df['Importance'])
for rank, (feature_label, score) in enumerate(ranked, start=1):
    print(f"{rank}. {feature_label}: {score:.4f}")

## 6. Decision Boundary Visualization (2D)

In [None]:
# Use the two most important features for 2D visualization.
# np.argsort sorts ascending, so the last two indices are the top two features
# (second-most important first, most important last).
top_features_idx = np.argsort(feature_importance)[-2:]
X_2d = X[:, top_features_idx]
feature_names_2d = [feature_names[i] for i in top_features_idx]

X_train_2d, X_test_2d, y_train_2d, y_test_2d = train_test_split(
    X_2d, y, test_size=0.2, random_state=42, stratify=y
)

# Models to train on the 2D data
models_2d = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
}

# The scaler and the evaluation mesh do not depend on the model, so they are
# built once here instead of once per loop iteration.
scaler_2d = StandardScaler()
X_train_2d_scaled = scaler_2d.fit_transform(X_train_2d)

# Mesh covering the data range plus a margin of 1 on every side, step h
h = 0.02
x_min, x_max = X_2d[:, 0].min() - 1, X_2d[:, 0].max() + 1
y_min, y_max = X_2d[:, 1].min() - 1, X_2d[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
mesh_points = np.c_[xx.ravel(), yy.ravel()]
mesh_points_scaled = scaler_2d.transform(mesh_points)

fig, axes = plt.subplots(1, len(models_2d), figsize=(18, 5))

for i, (name, model) in enumerate(models_2d.items()):
    # Random Forest is trained on raw features; the other models on standardized
    # ones, so the mesh must be passed through the same scaler before predicting.
    if name == 'Random Forest':
        train_features, grid_features = X_train_2d, mesh_points
    else:
        train_features, grid_features = X_train_2d_scaled, mesh_points_scaled

    model.fit(train_features, y_train_2d)
    Z = model.predict(grid_features).reshape(xx.shape)

    # Predicted class regions, drawn in the original (unscaled) feature coordinates
    axes[i].contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.RdYlBu)

    # All samples (train and test), coloured by true class
    axes[i].scatter(X_2d[:, 0], X_2d[:, 1], c=y,
                    cmap=plt.cm.RdYlBu, edgecolors='black', alpha=0.8)

    axes[i].set_xlabel(feature_names_2d[0])
    axes[i].set_ylabel(feature_names_2d[1])
    axes[i].set_title(f'{name} Decision Boundary')
    axes[i].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Summary and Insights

In [None]:
# Final text summary. Statements about the data and the models are derived from
# the objects computed earlier in the notebook (df, results, importance_df, pca,
# correlation_matrix) rather than hard-coded, so they cannot contradict the
# numbers shown above.
print("="*60)
print("IRIS DATASET EXPLORATION SUMMARY")
print("="*60)

# Balanced means every species has the same number of samples
class_counts = df['species'].value_counts()
balance_label = "balanced" if class_counts.nunique() == 1 else "imbalanced"
missing_total = int(df.isnull().sum().sum())

print("\n📊 DATASET CHARACTERISTICS:")
print(f"• Total samples: {len(df)}")
print(f"• Features: {len(feature_names)}")
print(f"• Classes: {len(target_names)} ({balance_label} dataset)")
print("• No missing values" if missing_total == 0 else f"• Missing values: {missing_total}")

print("\n🔍 KEY INSIGHTS:")
print(f"• Most important features: {', '.join(importance_df['Feature'].head(2).tolist())}")
print("• Petal measurements are more discriminative than sepal measurements")
print(f"• First 2 PCA components explain {sum(pca.explained_variance_ratio_):.1%} of variance")
print(f"• Strong correlation between petal length and width ({correlation_matrix.loc['petal length (cm)', 'petal width (cm)']:.3f})")

print("\n🎯 MODEL PERFORMANCE:")
# Ties on test accuracy resolve to the model that was evaluated first
best_model = max(results.keys(), key=lambda x: results[x]['Test Accuracy'])
best_accuracy = results[best_model]['Test Accuracy']
lowest_accuracy = min(scores['Test Accuracy'] for scores in results.values())
print(f"• Best performing model: {best_model} ({best_accuracy:.3f} accuracy)")
print(f"• All models achieve at least {lowest_accuracy:.1%} test accuracy")
print("• Setosa is easily separable from other species")
print("• Versicolor and Virginica have some overlap")

print("\n💡 RECOMMENDATIONS:")
print("• Random Forest is recommended for this dataset (no scaling required, robust)")
print("• Petal measurements alone could achieve good classification")
print("• Consider ensemble methods for production use")
print("• Feature engineering not necessary - original features are sufficient")

print("\n" + "="*60)