# Data Exploration with Python

## Objectives

- Load and inspect datasets
- Perform basic exploratory data analysis
- Visualize data distributions
- Identify data quality issues
- Calculate summary statistics

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults for the rest of the notebook: seaborn's 'whitegrid' theme and
# a 12x6 inch default figure size (cells may still pass their own figsize).
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Confirmation message; the leading character is a literal U+2713 check mark.
print('✓ Libraries imported successfully!')

## 1. Loading Data

Let's start by loading the Iris sample dataset that ships with scikit-learn.

In [None]:
# Load the Iris dataset bundled with scikit-learn
from sklearn.datasets import load_iris

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Keep the integer class code and add a human-readable label beside it.
# The code -> name mapping is built from the dataset's own target_names
# rather than typed out by hand, so it cannot drift from the data.
df['species'] = iris.target
species_labels = {code: str(name) for code, name in enumerate(iris.target_names)}
df['species_name'] = df['species'].map(species_labels)

print(f'Dataset shape: {df.shape}')
print(f'Columns: {df.columns.tolist()}')
# Last expression of the cell: the notebook renders the first five rows
df.head()

## 2. Basic Information

In [None]:
# Structural overview printed by pandas itself
print('Dataset Info:')
df.info()

# Number of null entries in each column
missing_per_column = df.isna().sum()
print('\nMissing Values:')
print(missing_per_column)

# Storage dtype of each column
column_types = df.dtypes
print('\nData Types:')
print(column_types)

## 3. Summary Statistics

In [None]:
# Descriptive statistics of the numeric columns; the bare name on the last
# line makes the notebook render the resulting table.
summary = df.describe()
summary

## 4. Data Visualization

### Distribution Plots

In [None]:
# One histogram per feature on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# axes.flat walks the grid row by row, i.e. the same order as
# axes[i // 2, i % 2] for i = 0..3.
for ax, col in zip(axes.flat, iris.feature_names):
    df[col].hist(bins=20, ax=ax, edgecolor='black', alpha=0.7)
    ax.set(title=col, xlabel='Value', ylabel='Frequency')

plt.tight_layout()
plt.show()

### Box Plots

In [None]:
# One box plot per feature, split by species, on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# axes.flat yields the subplots row by row, one per feature
for ax, col in zip(axes.flat, iris.feature_names):
    df.boxplot(column=col, by='species_name', ax=ax)
    # Set our own per-axes title and x label after pandas has drawn the plot
    ax.set(title=col, xlabel='Species')

# Blank out the figure-level title so only the per-axes titles remain
plt.suptitle('')
plt.tight_layout()
plt.show()

### Scatter Plots

In [None]:
# Pairwise scatter plots of the measurement columns, coloured by species.
# `vars` restricts the grid to the feature columns; without it the integer
# `species` code column is numeric too and would be drawn as an extra
# row/column of the grid as if it were a measurement.
sns.pairplot(df, vars=iris.feature_names, hue='species_name', height=3)
# y slightly above 1 places the title above the top row of subplots
plt.suptitle('Iris Dataset Pairplot', y=1.02)
plt.show()

### Correlation Matrix

In [None]:
# Pairwise correlations between the feature columns only
corr_matrix = df[iris.feature_names].corr()

# Annotated heatmap; the diverging colormap is centred on zero correlation
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Same numbers as plain text, below the figure
print('\nCorrelation Matrix:', corr_matrix, sep='\n')

## 5. Group Analysis

In [None]:
# Per-species mean, standard deviation, minimum and maximum of every feature
summary_funcs = ['mean', 'std', 'min', 'max']
by_species = df.groupby('species_name')
species_stats = by_species[iris.feature_names].agg(summary_funcs)

print('Statistics by Species:')
print(species_stats)

## 6. Data Quality Checks

In [None]:
# Flag rows that repeat an earlier row, then count the flags
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f'Number of duplicate rows: {duplicates}')

# Check for outliers using IQR method
def detect_outliers(df: pd.DataFrame, column: str, *, multiplier: float = 1.5) -> int:
    """Count the values of ``df[column]`` that lie outside the IQR fences.

    A value is counted when it is strictly below ``Q1 - multiplier * IQR``
    or strictly above ``Q3 + multiplier * IQR``, where Q1 and Q3 are the
    25th and 75th percentiles of the column and ``IQR = Q3 - Q1``.

    Args:
        df: Frame holding the data.
        column: Name of the numeric column to inspect.
        multiplier: Width of the fences in units of IQR. Defaults to 1.5,
            the value that was previously hard-coded.

    Returns:
        The number of rows whose value falls outside the fences.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    fence_width = multiplier * (q3 - q1)

    # Count directly on the boolean mask instead of building a filtered frame
    outside = (values < q1 - fence_width) | (values > q3 + fence_width)
    return int(outside.sum())

# Report the IQR outlier count for each feature column
print('\nOutlier Detection (IQR method):')
for col in iris.feature_names:
    print(f'{col}: {detect_outliers(df, col)} outliers')

## Key Findings

1. Dataset has 150 samples across 3 species
2. No missing values
3. Petal length and petal width are strongly correlated with each other
4. Species are separable based on features
5. Few outliers detected by the IQR method

## Next Steps

- Feature engineering
- Machine learning modeling
- Further statistical analysis