# Data Visualization with Matplotlib and Seaborn

This notebook demonstrates comprehensive data visualization techniques using Matplotlib and Seaborn.

## Topics Covered:
- Line plots and time series
- Bar charts (single and grouped)
- Scatter plots
- Histograms and distributions
- Box plots and violin plots
- Heatmaps (correlation matrices)
- Pie charts
- Seaborn advanced plots (pairplot, jointplot)
- Subplots and figure composition
- Customization: titles, labels, legends, colors, styles

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Enable inline plotting
%matplotlib inline

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Set random seed
np.random.seed(42)

# Figure size default
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 10

## 1. Generate Sample Data

We'll create multiple datasets for demonstrating different visualization techniques.

In [None]:
# ---------------------------------------------------------------------------
# Time series: one synthetic sales figure per calendar day of 2023
# ---------------------------------------------------------------------------
dates = pd.date_range('2023-01-01', '2023-12-31', freq='D')
n_days = dates.size

# Daily sales = linear growth + one sine cycle per 365 days + Gaussian noise
trend = np.linspace(100, 150, n_days)
seasonality = 20 * np.sin(2 * np.pi * np.arange(n_days) / 365)
noise = np.random.normal(0, 5, n_days)
sales = trend + seasonality + noise

ts_df = pd.DataFrame(dict(date=dates, sales=sales))

# ---------------------------------------------------------------------------
# Categorical data: fixed totals per product category
# ---------------------------------------------------------------------------
categories = ['Electronics', 'Clothing', 'Food', 'Books', 'Home & Garden']
category_sales = [45000, 32000, 28000, 18000, 25000]
category_counts = [450, 820, 950, 380, 520]

category_df = pd.DataFrame(
    list(zip(categories, category_sales, category_counts)),
    columns=['category', 'sales', 'transactions'],
)

# ---------------------------------------------------------------------------
# Scatter data: revenue is a noisy linear function of advertising spend
# ---------------------------------------------------------------------------
n_samples = 200
advertising_spend = np.random.uniform(1000, 10000, n_samples)
sales_revenue = 2.5 * advertising_spend + np.random.normal(0, 3000, n_samples)

scatter_df = pd.DataFrame(dict(advertising=advertising_spend, revenue=sales_revenue))

# ---------------------------------------------------------------------------
# Multi-variable data: three uniform numeric drivers, a random season label,
# and a sales column built as a noisy linear combination of the drivers
# ---------------------------------------------------------------------------
n_obs = 300
corr_df = pd.DataFrame({
    'price': np.random.uniform(10, 100, n_obs),
    'advertising': np.random.uniform(500, 5000, n_obs),
    'competitor_price': np.random.uniform(15, 95, n_obs),
    'season': np.random.choice(['Winter', 'Spring', 'Summer', 'Fall'], n_obs),
}).assign(
    # The noise term is drawn last, after all the columns above
    sales=lambda frame: (50 - 0.3 * frame['price'] + 0.002 * frame['advertising'] +
                         0.2 * frame['competitor_price'] + np.random.normal(0, 5, n_obs))
)

# Report the size of every generated dataset
print("Sample data generated successfully!")
print(f"Time series data: {len(ts_df)} rows")
print(f"Category data: {len(category_df)} rows")
print(f"Scatter data: {len(scatter_df)} rows")
print(f"Correlation data: {len(corr_df)} rows")

## 2. Line Plots

Line plots are ideal for visualizing trends over time or continuous data.

In [None]:
# Basic line plot: one series of daily sales against the calendar date,
# drawn through the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(ts_df['date'], ts_df['sales'], color='steelblue', linewidth=2, alpha=0.8)

ax.set_title('Daily Sales Over Time', fontsize=16, fontweight='bold')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Sales ($)', fontsize=12)
ax.grid(True, alpha=0.3)

# Rotate the x tick labels by 45 degrees
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Several lines on one axes, told apart by a legend:
# raw daily sales plus 7-day and 30-day rolling means.
# Each entry: (window in days, column to store, legend label, line colour)
moving_averages = [
    (7, 'sales_7d_ma', '7-Day Moving Average', 'orange'),
    (30, 'sales_30d_ma', '30-Day Moving Average', 'red'),
]

# Store each rolling mean as a new column on ts_df
for window, column, _label, _color in moving_averages:
    ts_df[column] = ts_df['sales'].rolling(window=window).mean()

fig, ax = plt.subplots(figsize=(12, 6))
# Raw series is drawn thin and translucent so the smoothed lines stand out
ax.plot(ts_df['date'], ts_df['sales'], label='Daily Sales', alpha=0.5, linewidth=1)
for _window, column, label, color in moving_averages:
    ax.plot(ts_df['date'], ts_df[column], label=label, linewidth=2, color=color)

ax.set_title('Sales with Moving Averages', fontsize=16, fontweight='bold')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Sales ($)', fontsize=12)
ax.legend(loc='upper left', fontsize=10)
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

## 3. Bar Charts

Bar charts are excellent for comparing categorical data.

In [None]:
# Vertical bar chart of total sales per category, each bar annotated with
# its dollar value
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(
    category_df['category'],
    category_df['sales'],
    color='skyblue',
    edgecolor='navy',
    linewidth=1.5,
)

# Centre a formatted dollar label on each bar, sitting on its top edge
label_style = dict(ha='center', va='bottom', fontsize=10, fontweight='bold')
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height, f'${height:,.0f}', **label_style)

ax.set_title('Sales by Category', fontsize=16, fontweight='bold')
ax.set_xlabel('Category', fontsize=12)
ax.set_ylabel('Total Sales ($)', fontsize=12)
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart: transaction count per category, with the categories
# laid out along the y-axis
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(
    category_df['category'],
    category_df['transactions'],
    color='coral',
    edgecolor='darkred',
    linewidth=1.5,
)

ax.set_title('Number of Transactions by Category', fontsize=16, fontweight='bold')
ax.set_xlabel('Number of Transactions', fontsize=12)
ax.set_ylabel('Category', fontsize=12)
# Grid lines only along the value (x) axis
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Grouped bar chart: sales and transaction counts side by side per category.
# The two measures have different units and very different magnitudes
# (sales in $K: 18-45, transactions: 380-950), so each gets its own y-axis.
# On a single shared axis the sales bars would be dwarfed.
x = np.arange(len(categories))
width = 0.35

fig, ax = plt.subplots(figsize=(12, 6))
ax_tx = ax.twinx()  # secondary y-axis that shares the same x positions

# Left axis: sales in thousands of dollars; right axis: transaction counts.
# Bars are offset by half a width either side of each category position.
bars1 = ax.bar(x - width/2, category_df['sales']/1000, width, label='Sales ($K)', color='steelblue')
bars2 = ax_tx.bar(x + width/2, category_df['transactions'], width, label='Transactions', color='orange')

ax.set_title('Sales and Transactions by Category', fontsize=16, fontweight='bold')
ax.set_xlabel('Category', fontsize=12)
# Colour each y-label like its bars so the reader can match axis to series
ax.set_ylabel('Sales ($K)', fontsize=12, color='steelblue')
ax_tx.set_ylabel('Transactions', fontsize=12, color='orange')
ax.set_xticks(x)
ax.set_xticklabels(categories, rotation=45, ha='right')

# One legend covering both series; it is attached to the twin axes, which is
# drawn last, so no bar is painted over it
ax_tx.legend(handles=[bars1, bars2], fontsize=10, loc='upper right')

# Keep a single set of horizontal grid lines (from the left axis) so the two
# scales do not produce misaligned grids
ax.grid(axis='y', alpha=0.3)
ax_tx.grid(False)

plt.tight_layout()
plt.show()

## 4. Scatter Plots

Scatter plots reveal relationships and correlations between two continuous variables.

In [None]:
# Basic scatter plot: revenue against advertising spend, one marker per row
marker_style = dict(alpha=0.6, s=50, color='purple', edgecolors='black', linewidth=0.5)

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(scatter_df['advertising'], scatter_df['revenue'], **marker_style)

ax.set_title('Advertising Spend vs Revenue', fontsize=16, fontweight='bold')
ax.set_xlabel('Advertising Spend ($)', fontsize=12)
ax.set_ylabel('Revenue ($)', fontsize=12)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Scatter plot with a least-squares regression line
plt.figure(figsize=(10, 6))
plt.scatter(scatter_df['advertising'], scatter_df['revenue'], 
            alpha=0.6, s=50, color='green', edgecolors='black', linewidth=0.5, label='Data points')

# Fit a degree-1 polynomial: z[0] is the slope, z[1] the intercept
z = np.polyfit(scatter_df['advertising'], scatter_df['revenue'], 1)
p = np.poly1d(z)

# A straight line needs only its two end points. Drawing it through every
# (unsorted) x value makes the line retrace itself, so the overlapping
# segments smear the dash pattern.
x_line = np.array([scatter_df['advertising'].min(), scatter_df['advertising'].max()])

# `:+.2f` always prints the intercept's sign, so a negative intercept is
# shown as "x-12.34" rather than "x+-12.34"
plt.plot(x_line, p(x_line),
         "r--", linewidth=2, label=f'Fit: y={z[0]:.2f}x{z[1]:+.2f}')

plt.title('Advertising Spend vs Revenue (with Trend Line)', fontsize=16, fontweight='bold')
plt.xlabel('Advertising Spend ($)', fontsize=12)
plt.ylabel('Revenue ($)', fontsize=12)
plt.legend(fontsize=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 5. Histograms and Distributions

Histograms show the distribution of a continuous variable.

In [None]:
# Basic histogram of the sales column, with dashed vertical markers at the
# mean and the median
sales_values = corr_df['sales']
mean_sales = sales_values.mean()
median_sales = sales_values.median()

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(sales_values, bins=30, color='teal', edgecolor='black', alpha=0.7)

ax.set_title('Distribution of Sales', fontsize=16, fontweight='bold')
ax.set_xlabel('Sales Value', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

# Reference lines; the legend entries carry the numeric values
ax.axvline(mean_sales, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_sales:.2f}')
ax.axvline(median_sales, color='orange', linestyle='--', linewidth=2, label=f'Median: {median_sales:.2f}')

ax.legend(fontsize=10)
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Seaborn histogram with a KDE (Kernel Density Estimate) curve overlaid
plt.figure(figsize=(10, 6))
sns.histplot(corr_df['sales'], bins=30, kde=True, color='purple', edgecolor='black', alpha=0.6)
plt.title('Sales Distribution with KDE', fontsize=16, fontweight='bold')
plt.xlabel('Sales Value', fontsize=12)
# histplot's default statistic is the per-bin count (the KDE curve is scaled
# to match), so the axis shows frequencies, not a probability density
plt.ylabel('Frequency', fontsize=12)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Multiple overlapping histograms: one translucent histogram per season
plt.figure(figsize=(10, 6))

# Compute one set of 20 bin edges from the full sales column and reuse it for
# every season. With bins=20 passed per call, each subset would get its own
# edges and bar widths, so the overlaid bars would not be comparable.
bin_edges = np.histogram_bin_edges(corr_df['sales'], bins=20)

for season in corr_df['season'].unique():
    season_data = corr_df.loc[corr_df['season'] == season, 'sales']
    plt.hist(season_data, bins=bin_edges, alpha=0.5, label=season)

plt.title('Sales Distribution by Season', fontsize=16, fontweight='bold')
plt.xlabel('Sales Value', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.legend(fontsize=10)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

## 6. Box Plots and Violin Plots

These plots show distribution, quartiles, and outliers.

In [None]:
# Box plot of sales per season.
# `hue` is tied to the same column as `x` so the palette is applied through
# an explicit hue variable; passing `palette` without `hue` is deprecated in
# seaborn 0.13. `dodge=False` keeps each box centred on its tick.
plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=corr_df, x='season', y='sales', hue='season',
                 palette='Set2', dodge=False)

# A hue legend would only repeat the x tick labels, so drop it if one was drawn
if ax.get_legend() is not None:
    ax.get_legend().remove()

plt.title('Sales Distribution by Season (Box Plot)', fontsize=16, fontweight='bold')
plt.xlabel('Season', fontsize=12)
plt.ylabel('Sales Value', fontsize=12)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Violin plot (combines box plot and KDE) of sales per season.
# `hue` is tied to the same column as `x` so the palette is applied through
# an explicit hue variable; passing `palette` without `hue` is deprecated in
# seaborn 0.13. `dodge=False` keeps each violin centred on its tick.
plt.figure(figsize=(10, 6))
ax = sns.violinplot(data=corr_df, x='season', y='sales', hue='season',
                    palette='muted', inner='quartile', dodge=False)

# A hue legend would only repeat the x tick labels, so drop it if one was drawn
if ax.get_legend() is not None:
    ax.get_legend().remove()

plt.title('Sales Distribution by Season (Violin Plot)', fontsize=16, fontweight='bold')
plt.xlabel('Season', fontsize=12)
plt.ylabel('Sales Value', fontsize=12)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Side-by-side comparison: the same per-season sales data as a box plot
# (left panel) and a violin plot (right panel)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# `hue` is tied to the same column as `x` so the palette is applied through
# an explicit hue variable; passing `palette` without `hue` is deprecated in
# seaborn 0.13. `dodge=False` keeps each shape centred on its tick.
sns.boxplot(data=corr_df, x='season', y='sales', hue='season',
            palette='Set2', dodge=False, ax=axes[0])
sns.violinplot(data=corr_df, x='season', y='sales', hue='season',
               palette='muted', dodge=False, ax=axes[1])

# Both panels get identical labelling; only the title differs
for ax, panel_title in zip(axes, ['Box Plot', 'Violin Plot']):
    # A hue legend would only repeat the x tick labels, so drop it if drawn
    if ax.get_legend() is not None:
        ax.get_legend().remove()
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Season', fontsize=12)
    ax.set_ylabel('Sales Value', fontsize=12)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Heatmaps (Correlation Matrices)

Heatmaps visualize correlation matrices and patterns in data.

In [None]:
# Correlation matrix heatmap over the numeric columns only
# (the string 'season' column is excluded by the dtype filter)
numerical_cols = corr_df.select_dtypes(include=[np.number])
correlation_matrix = numerical_cols.corr()

plt.figure(figsize=(10, 8))
# Pin the colour scale to the full correlation range [-1, 1]. Without this
# the diverging 'coolwarm' map is stretched over the observed min/max, so
# its neutral midpoint no longer corresponds to zero correlation.
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            vmin=-1, vmax=1,
            square=True, linewidths=0.5, cbar_kws={'shrink': 0.8})
plt.title('Correlation Matrix Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

In [None]:
# Heatmap of a pivot table: mean sales for each season x price band

# Split the price range into three equal-width bands
price_band = pd.cut(corr_df['price'], bins=3, labels=['Low', 'Medium', 'High'])

# `price_band` is categorical, so pandas groups on a categorical key here.
# `observed` is passed explicitly: relying on its implicit default is
# deprecated in recent pandas, and False keeps every band as a column.
pivot_data = corr_df.pivot_table(values='sales', index='season',
                                 columns=price_band,
                                 aggfunc='mean', observed=False)

plt.figure(figsize=(8, 6))
sns.heatmap(pivot_data, annot=True, fmt='.1f', cmap='YlGnBu', linewidths=0.5)
plt.title('Average Sales by Season and Price Range', fontsize=16, fontweight='bold', pad=20)
plt.xlabel('Price Range', fontsize=12)
plt.ylabel('Season', fontsize=12)
plt.tight_layout()
plt.show()

## 8. Pie Charts

Pie charts show proportions and percentages.

In [None]:
# Basic pie chart: each category's share of total sales
fig, ax = plt.subplots(figsize=(10, 8))

# One colour per category, taken from the Set3 colormap
colors = plt.cm.Set3(range(len(categories)))
# Pull only the first slice slightly away from the centre
explode = (0.05,) + (0,) * (len(categories) - 1)

ax.pie(
    category_df['sales'],
    labels=category_df['category'],
    autopct='%1.1f%%',
    startangle=90,
    colors=colors,
    explode=explode,
    shadow=True,
)
ax.set_title('Sales Distribution by Category', fontsize=16, fontweight='bold', pad=20)
ax.axis('equal')  # equal axis scaling so the pie is drawn as a circle
fig.tight_layout()
plt.show()

In [None]:
# Donut chart: a pie chart whose wedges are drawn as a ring
plt.figure(figsize=(10, 8))
colors = plt.cm.Pastel1(range(len(categories)))

# wedgeprops 'width' draws each wedge as a ring segment 0.30 wide (inner
# radius 0.70), so no white circle has to be painted over the centre.
# pctdistance=0.85 places the percentage labels in the middle of that ring;
# at the default of 0.6 they would sit inside the hole.
wedges, texts, autotexts = plt.pie(category_df['sales'], labels=category_df['category'],
                                   autopct='%1.1f%%', startangle=90, colors=colors,
                                   pctdistance=0.85, wedgeprops={'width': 0.30})

plt.title('Sales Distribution (Donut Chart)', fontsize=16, fontweight='bold', pad=20)
plt.axis('equal')
plt.tight_layout()
plt.show()

## 9. Seaborn Advanced Plots

Seaborn provides high-level statistical visualizations.

In [None]:
# Pairplot: a grid of pairwise scatter plots for the numeric columns, with a
# KDE curve of each variable on the diagonal
pair_columns = ['price', 'advertising', 'competitor_price', 'sales']

sns.pairplot(
    corr_df[pair_columns],
    diag_kind='kde',
    plot_kws=dict(alpha=0.6),
)
# y is set slightly above 1 to lift the title above the grid
plt.suptitle('Pairplot of Numerical Variables', y=1.02, fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Jointplot: scatter plot with a fitted regression line, plus the marginal
# distribution of each variable along the top and right edges.
# With kind='reg' the joint panel is drawn by sns.regplot, which has no
# `alpha` parameter of its own. Marker transparency must therefore be passed
# through regplot's `scatter_kws`; a bare {'alpha': ...} in joint_kws raises
# TypeError.
sns.jointplot(data=scatter_df, x='advertising', y='revenue',
              kind='reg', height=8, color='purple',
              joint_kws={'scatter_kws': {'alpha': 0.5}})
plt.suptitle('Joint Plot: Advertising vs Revenue', y=1.02, fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

## 10. Subplots and Figure Composition

Creating multiple plots in a single figure for comprehensive analysis.

In [None]:
# 2x2 subplot grid: four different chart types combined in one figure
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_line, ax_bar), (ax_scatter, ax_hist) = axes

# Top-left: line plot of the first 90 rows of the time series
first_quarter = ts_df.iloc[:90]
ax_line.plot(first_quarter['date'], first_quarter['sales'], color='steelblue', linewidth=2)
ax_line.set_title('Daily Sales (Q1)', fontsize=12, fontweight='bold')
ax_line.set_xlabel('Date', fontsize=10)
ax_line.set_ylabel('Sales ($)', fontsize=10)
ax_line.grid(True, alpha=0.3)
ax_line.tick_params(axis='x', rotation=45)

# Top-right: bar chart of total sales per category
ax_bar.bar(category_df['category'], category_df['sales'], color='coral')
ax_bar.set_title('Sales by Category', fontsize=12, fontweight='bold')
ax_bar.set_xlabel('Category', fontsize=10)
ax_bar.set_ylabel('Total Sales ($)', fontsize=10)
ax_bar.tick_params(axis='x', rotation=45)
ax_bar.grid(axis='y', alpha=0.3)

# Bottom-left: scatter of revenue against advertising spend
ax_scatter.scatter(scatter_df['advertising'], scatter_df['revenue'], alpha=0.6, color='green')
ax_scatter.set_title('Advertising vs Revenue', fontsize=12, fontweight='bold')
ax_scatter.set_xlabel('Advertising ($)', fontsize=10)
ax_scatter.set_ylabel('Revenue ($)', fontsize=10)
ax_scatter.grid(True, alpha=0.3)

# Bottom-right: histogram of the sales column
ax_hist.hist(corr_df['sales'], bins=25, color='purple', edgecolor='black', alpha=0.7)
ax_hist.set_title('Sales Distribution', fontsize=12, fontweight='bold')
ax_hist.set_xlabel('Sales Value', fontsize=10)
ax_hist.set_ylabel('Frequency', fontsize=10)
ax_hist.grid(axis='y', alpha=0.3)

# Figure-level title above all four panels
plt.suptitle('Comprehensive Sales Analysis Dashboard', fontsize=18, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

In [None]:
# Complex layout with different panel sizes, built on a 3x3 GridSpec
fig = plt.figure(figsize=(14, 10))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# Large plot spanning the top-left 2x2 cells: full-year sales, with the area
# under the line shaded
ax1 = fig.add_subplot(gs[0:2, 0:2])
ax1.plot(ts_df['date'], ts_df['sales'], linewidth=2, color='steelblue')
ax1.fill_between(ts_df['date'], ts_df['sales'], alpha=0.3)
ax1.set_title('Full Year Sales Trend', fontsize=14, fontweight='bold')
ax1.set_xlabel('Date', fontsize=11)
ax1.set_ylabel('Sales ($)', fontsize=11)
ax1.grid(True, alpha=0.3)
ax1.tick_params(axis='x', rotation=45)

# Top right plot: pie of the three best-selling categories.
# Select them by value rather than by row position so the panel stays true
# to its title even if category_df is reordered.
top_categories = category_df.nlargest(3, 'sales')
ax2 = fig.add_subplot(gs[0, 2])
ax2.pie(top_categories['sales'], labels=top_categories['category'], autopct='%1.1f%%')
ax2.set_title('Top 3 Categories', fontsize=12, fontweight='bold')

# Middle right plot: transactions per category, drawn at integer positions
# with the category names applied as tick labels
ax3 = fig.add_subplot(gs[1, 2])
ax3.bar(range(len(categories)), category_df['transactions'], color='orange')
ax3.set_title('Transactions', fontsize=12, fontweight='bold')
ax3.set_xticks(range(len(categories)))
ax3.set_xticklabels(categories, rotation=45, ha='right', fontsize=8)
ax3.grid(axis='y', alpha=0.3)

# Bottom row - 3 small plots
# Bottom left: histogram of price
ax4 = fig.add_subplot(gs[2, 0])
ax4.hist(corr_df['price'], bins=20, color='green', alpha=0.7)
ax4.set_title('Price Distribution', fontsize=10, fontweight='bold')
ax4.set_xlabel('Price', fontsize=9)

# Bottom middle: histogram of advertising
ax5 = fig.add_subplot(gs[2, 1])
ax5.hist(corr_df['advertising'], bins=20, color='red', alpha=0.7)
ax5.set_title('Advertising Distribution', fontsize=10, fontweight='bold')
ax5.set_xlabel('Advertising', fontsize=9)

# Bottom right: a single box plot of the whole sales column
ax6 = fig.add_subplot(gs[2, 2])
sns.boxplot(data=corr_df, y='sales', ax=ax6, color='skyblue')
ax6.set_title('Sales Box Plot', fontsize=10, fontweight='bold')

plt.suptitle('Advanced Multi-Panel Dashboard', fontsize=18, fontweight='bold')
plt.show()

## Summary

In this notebook, we covered comprehensive data visualization techniques:

### Matplotlib:
1. **Line Plots**: Trends over time, multiple series, moving averages
2. **Bar Charts**: Vertical, horizontal, grouped bars with labels
3. **Scatter Plots**: Relationships, correlations, regression lines
4. **Histograms**: Distributions, frequency analysis
5. **Pie Charts**: Proportions, donut charts
6. **Subplots**: Complex multi-panel layouts

### Seaborn:
1. **Statistical Plots**: histplot with KDE, box plots, violin plots
2. **Heatmaps**: Correlation matrices, pivot tables
3. **Advanced Plots**: pairplot, jointplot

### Customization:
- Titles, labels, legends
- Colors, styles, transparency
- Grid lines, annotations
- Figure size and layout

These visualization techniques are essential for exploratory data analysis and communicating insights effectively.