# Exploratory Data Analysis - Yeast Dataset

This notebook performs comprehensive exploratory data analysis on the Yeast dataset to understand the data structure, distributions, and relationships between features.

In [13]:
# Import necessary libraries
import sys
import os
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Import project modules
from src.data_loader import load_yeast_data
from src.config import *

# Global plot appearance: matplotlib's default style sheet, seaborn's "husl"
# palette, and a 12x8 default figure size.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Notebook banner followed by a 50-character rule
banner_title = "🔍 Exploratory Data Analysis - Yeast Dataset"
print(banner_title)
print("=" * 50)

🔍 Exploratory Data Analysis - Yeast Dataset


In [14]:
# Load the raw data and print a first overview of it
print("1. Loading Yeast Dataset...")
print("=" * 30)

# load_yeast_data is imported from src.data_loader in the first cell.
# NOTE(review): it is used here as returning a pandas DataFrame (.shape,
# .columns, .head(), .info(), .describe() are called on the result).
data = load_yeast_data()
print(f"Dataset shape: {data.shape}")
print(f"Columns: {list(data.columns)}")
print("\nFirst 5 rows:")
print(data.head())

print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()

print("\nBasic Statistics:")
print(data.describe())

1. Loading Yeast Dataset...
Dataset shape: (1484, 9)
Columns: ['mcg', 'gvh', 'alm', 'mit', 'erl', 'pox', 'vac', 'nuc', 'class']

First 5 rows:
             mcg   gvh   alm   mit  erl  pox   vac   nuc class
ADT1_YEAST  0.58  0.61  0.47  0.13  0.5  0.0  0.48  0.22   MIT
ADT2_YEAST  0.43  0.67  0.48  0.27  0.5  0.0  0.53  0.22   MIT
ADT3_YEAST  0.64  0.62  0.49  0.15  0.5  0.0  0.53  0.22   MIT
AAR2_YEAST  0.58  0.44  0.57  0.13  0.5  0.0  0.54  0.22   NUC
AATM_YEAST  0.42  0.44  0.48  0.54  0.5  0.0  0.48  0.22   MIT

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 1484 entries, ADT1_YEAST to G6PD_YEAST
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mcg     1484 non-null   float64
 1   gvh     1484 non-null   float64
 2   alm     1484 non-null   float64
 3   mit     1484 non-null   float64
 4   erl     1484 non-null   float64
 5   pox     1484 non-null   float64
 6   vac     1484 non-null   float64
 7   nuc     14

In [None]:
# Data Structure Analysis
print("2. Data Structure Analysis")
print("=" * 30)

# Check for missing values (count of nulls per column)
print("Missing values:")
missing_values = data.isnull().sum()
print(missing_values)

# Check data types
print("\nData types:")
print(data.dtypes)

# Check for duplicates (rows identical to an earlier row)
print(f"\nDuplicate rows: {data.duplicated().sum()}")

# Feature names and descriptions.
# feature_names lists the eight predictor columns; later cells use it to
# select the feature matrix from `data`.
feature_names = ['mcg', 'gvh', 'alm', 'mit', 'erl', 'pox', 'vac', 'nuc']

# Strings that contain an apostrophe are double-quoted so that no backslash
# escape is needed inside the literal; the 'erl' text contains double quotes
# and therefore stays single-quoted. The string values are unchanged.
feature_descriptions = {
    'mcg': "McGeoch's method for signal sequence recognition",
    'gvh': "von Heijne's method for signal sequence recognition",
    'alm': 'Score of the ALOM membrane spanning region prediction program',
    'mit': 'Score of discriminant analysis of the amino acid content of the N-terminal region (20 residues long) of mitochondrial and non-mitochondrial proteins',
    'erl': 'Presence of "HDEL" substring (thought to act as a signal for retention in the endoplasmic reticulum lumen)',
    'pox': 'Peroxisomal targeting signal in the C-terminus',
    'vac': 'Score of discriminant analysis of the amino acid content of vacuolar and extracellular proteins',
    'nuc': 'Score of discriminant analysis of nuclear localization signals of nuclear and non-nuclear proteins'
}

print("\nFeature Descriptions:")
for feature, desc in feature_descriptions.items():
    print(f"{feature}: {desc}")

SyntaxError: unterminated string literal (detected at line 20) (4084700134.py, line 20)

In [None]:
# Class Distribution Analysis
print("3. Class Distribution Analysis")
print("=" * 35)

# Absolute frequency of every label in the 'class' column
class_counts = data['class'].value_counts()
print("Class distribution:")
print(class_counts)

# Relative frequency, expressed in percent
print("\nClass percentages:")
class_percentages = data['class'].value_counts(normalize=True) * 100
print(class_percentages.round(2))

# One figure with two side-by-side panels: counts as bars (left) and
# shares as a pie chart (right)
fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: bar plot of the counts
class_counts.plot(kind='bar', color='skyblue', edgecolor='black', ax=bar_ax)
bar_ax.set_title('Class Distribution', fontsize=14)
bar_ax.set_xlabel('Class', fontsize=12)
bar_ax.set_ylabel('Count', fontsize=12)
bar_ax.tick_params(axis='x', rotation=45)
bar_ax.grid(True, alpha=0.3)

# Right panel: pie chart of the same counts, labelled with percentages
pie_ax.pie(class_counts.values, labels=class_counts.index, autopct='%1.1f%%', startangle=90)
pie_ax.set_title('Class Distribution (Percentage)', fontsize=14)

fig.tight_layout()
plt.show()

# Class imbalance: ratio of the largest class count to the smallest one
print("\nClass Imbalance Analysis:")
max_count, min_count = class_counts.max(), class_counts.min()
imbalance_ratio = max_count / min_count
print(f"Most frequent class: {class_counts.idxmax()} ({max_count} samples)")
print(f"Least frequent class: {class_counts.idxmin()} ({min_count} samples)")
print(f"Imbalance ratio: {imbalance_ratio:.2f}")

In [None]:
# Feature Analysis
print("4. Feature Analysis")
print("=" * 25)

# Split the frame into the feature matrix (columns in feature_names) and the
# target column 'class'
X = data[feature_names]
y = data['class']

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

# Summary statistics of the feature columns
print("\nFeature Statistics:")
feature_stats = X.describe()
print(feature_stats)

# IQR rule: a value is flagged when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile of its column
print("\nOutlier Analysis (IQR method):")
n_rows = len(X)
for feature in feature_names:
    column = X[feature]
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    is_outlier = (column < q1 - whisker) | (column > q3 + whisker)
    n_outliers = is_outlier.sum()
    print(f"{feature}: {n_outliers} outliers ({n_outliers/n_rows*100:.1f}%)")

In [None]:
# Feature Distributions
print("5. Feature Distributions")
print("=" * 30)

# Histogram grid: 2 rows x 4 columns, one panel per entry of feature_names
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()

for position, feature in enumerate(feature_names):
    panel = axes[position]

    # 30-bin histogram of the feature's values
    panel.hist(X[feature], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
    panel.set_title(f'{feature} Distribution', fontsize=12)
    panel.set_xlabel(feature, fontsize=10)
    panel.set_ylabel('Frequency', fontsize=10)
    panel.grid(True, alpha=0.3)

fig.tight_layout()
# y slightly above 1 places the figure title above the top row of panels
fig.suptitle('Feature Distributions', fontsize=16, y=1.02)
plt.show()

# Box plots of all features on one shared axis, for outlier inspection
box_fig, box_ax = plt.subplots(figsize=(15, 8))
X.boxplot(ax=box_ax)
box_ax.set_title('Feature Box Plots - Outlier Detection', fontsize=16)
box_ax.set_xlabel('Features', fontsize=12)
box_ax.set_ylabel('Values', fontsize=12)
box_ax.tick_params(axis='x', rotation=45)
box_ax.grid(True, alpha=0.3)
box_fig.tight_layout()
plt.show()

In [None]:
# Correlation Analysis
print("6. Correlation Analysis")
print("=" * 25)

# Pairwise correlation between the feature columns (DataFrame.corr default)
correlation_matrix = X.corr()

print("Feature Correlation Matrix:")
print(correlation_matrix.round(3))

# Heatmap of the matrix; the upper triangle, diagonal included, is masked so
# each pair of features is shown only once
plt.figure(figsize=(10, 8))
upper_triangle_mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(
    correlation_matrix,
    mask=upper_triangle_mask,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
plt.title('Feature Correlation Matrix', fontsize=16)
plt.tight_layout()
plt.show()

# Collect every unordered feature pair (i < j) whose |r| exceeds 0.7
print("\nHighly Correlated Features (|r| > 0.7):")
columns = correlation_matrix.columns
n_columns = len(columns)
candidate_pairs = (
    (columns[i], columns[j], correlation_matrix.iloc[i, j])
    for i in range(n_columns)
    for j in range(i + 1, n_columns)
)
high_corr_pairs = [pair for pair in candidate_pairs if abs(pair[2]) > 0.7]

if not high_corr_pairs:
    print("No highly correlated features found.")
else:
    for feat1, feat2, corr in high_corr_pairs:
        print(f"{feat1} - {feat2}: {corr:.3f}")

In [None]:
# Class-Feature Relationships
print("7. Class-Feature Relationships")
print("=" * 35)

# Box plot grid: 2 rows x 4 columns, one panel per entry of feature_names,
# each panel grouping the feature's values by the 'class' column
fig, axes = plt.subplots(2, 4, figsize=(20, 12))
axes = axes.flatten()

for position, feature in enumerate(feature_names):
    panel = axes[position]

    # Draw the grouped box plot into this panel, then apply our own labels
    data.boxplot(column=feature, by='class', ax=panel)
    panel.set_title(f'{feature} by Class', fontsize=12)
    panel.set_xlabel('Class', fontsize=10)
    panel.set_ylabel(feature, fontsize=10)
    panel.tick_params(axis='x', rotation=45)

fig.tight_layout()
fig.suptitle('Feature Distributions by Class', fontsize=16, y=1.02)
plt.show()

# Per-class mean of every feature column
print("\nMean Feature Values by Class:")
class_means = data.groupby('class')[feature_names].mean()
print(class_means.round(3))

In [None]:
# Data Preprocessing Preview
print("8. Data Preprocessing Preview")
print("=" * 35)

# Map the class labels in y to integer codes
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(f"Original classes: {le.classes_}")
print(f"Encoded classes: {np.unique(y_encoded)}")

# Hold out 20% of the rows as a test set, stratified on the encoded label,
# with a fixed seed so the split is repeatable
split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"\nTrain set: {n_train} samples")
print(f"Test set: {n_test} samples")
print(f"Train set class distribution: {np.bincount(y_train)}")
print(f"Test set class distribution: {np.bincount(y_test)}")

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Per-column mean and standard deviation of the scaled training matrix
scaled_train_mean = X_train_scaled.mean(axis=0).round(3)
scaled_train_std = X_train_scaled.std(axis=0).round(3)

print(f"\nScaled train set shape: {X_train_scaled.shape}")
print(f"Scaled test set shape: {X_test_scaled.shape}")
print(f"Scaled train set mean: {scaled_train_mean}")
print(f"Scaled train set std: {scaled_train_std}")

In [None]:
# Summary and Insights
print("9. Summary and Insights")
print("=" * 25)

# Headline figures, recomputed from `data` and feature_names
print("📊 Dataset Summary:")
dataset_summary = [
    f"- Total samples: {len(data)}",
    f"- Features: {len(feature_names)}",
    f"- Classes: {len(data['class'].unique())}",
    f"- Missing values: {data.isnull().sum().sum()}",
    f"- Duplicate rows: {data.duplicated().sum()}",
]
for line in dataset_summary:
    print(line)

# Off-diagonal correlations: the strict upper triangle (k=1) holds each
# feature pair exactly once, so it is extracted once and reused for min/max
corr_values = correlation_matrix.values
pairwise_corr = corr_values[np.triu_indices_from(corr_values, k=1)]

# Reuses class_counts, imbalance_ratio and correlation_matrix computed in
# earlier cells
print("\n📈 Key Insights:")
print(f"- Most frequent class: {class_counts.idxmax()} ({class_counts.max()} samples)")
print(f"- Least frequent class: {class_counts.idxmin()} ({class_counts.min()} samples)")
print(f"- Class imbalance ratio: {imbalance_ratio:.2f}")
print(f"- Feature correlation range: {pairwise_corr.min():.3f} to {pairwise_corr.max():.3f}")

print("\n🎯 Recommendations for ML:")
recommendations = [
    "- Handle class imbalance (consider SMOTE or class weights)",
    "- Feature scaling is necessary (already applied)",
    "- Consider feature selection based on correlation analysis",
    "- Use stratified sampling for train/test split",
    "- Consider ensemble methods for better performance",
]
for line in recommendations:
    print(line)

print("\n✅ EDA Complete! Ready for machine learning analysis.")