# Data Analysis Template

This notebook provides a template for analyzing datasets with descriptions and visualizations.

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path

# Plot appearance: seaborn's white grid theme and a wide default figure size
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 6))

# pandas display limits: never truncate columns, show at most 100 rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

## 2. Load Data

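Load your dataset from the `data/raw/` directory (referenced as `../data/raw/` relative to this notebook). Until you point the cell at a real file, it generates a reproducible synthetic sample so the rest of the notebook can run.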

In [None]:
# Example: Load a CSV file
# df = pd.read_csv('../data/raw/your_data.csv')

# Example: Create sample data
# Seeding the global NumPy generator makes the synthetic sample reproducible.
np.random.seed(42)
n_rows = 100

# The three draws share one global random stream, so their order
# (category -> value -> amount) determines the generated numbers.
sample_categories = np.random.choice(['A', 'B', 'C', 'D'], n_rows)
sample_values = np.random.randn(n_rows).cumsum()  # running sum of normal draws
sample_amounts = np.random.randint(10, 100, n_rows)  # integers in [10, 100)

df = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=n_rows),
    'category': sample_categories,
    'value': sample_values,
    'amount': sample_amounts,
})

print(f"Dataset shape: {df.shape}")
df.head()

## 3. Data Description

Get statistical summary and information about the dataset.

In [None]:
# Dataset information
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called on its own; wrapping it in print() would append a stray
# "None" line after the report.
print("Dataset Info:")
df.info()
print("\n" + "="*50 + "\n")

# Statistical description
# Left as the cell's last expression so the notebook renders it as a table.
print("Statistical Description:")
df.describe()

In [None]:
# Check for missing values
# Count nulls per column, then keep only the columns that actually have some.
missing_data = df.isnull().sum()
columns_with_missing = missing_data[missing_data > 0]

print("Missing Values:")
if columns_with_missing.empty:
    # Report the clean case explicitly instead of printing an empty Series
    # repr ("Series([], dtype: int64)") ahead of the message.
    print("No missing values found!")
else:
    print(columns_with_missing)

In [None]:
# Per-column overview: dtype and number of distinct values
print("Data Types and Unique Values:")
for column_name in df.columns:
    column = df[column_name]
    print(f"{column_name}: {column.dtype}, {column.nunique()} unique values")

## 4. Data Visualization

Create various plots to understand the data distribution and patterns.

### 4.1 Distribution Plots

In [None]:
# Distribution of numerical columns
# One (column, axis label, bar colour) entry per panel; a colour of None
# leaves matplotlib's default colour in place.
histogram_panels = [
    ('value', 'Value', None),
    ('amount', 'Amount', 'orange'),
]

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

for panel_ax, (column, label, bar_color) in zip(axes, histogram_panels):
    panel_ax.hist(df[column], bins=30, edgecolor='black', alpha=0.7, color=bar_color)
    panel_ax.set(
        title=f'Distribution of {label}',
        xlabel=label,
        ylabel='Frequency',
    )

plt.tight_layout()
plt.show()

### 4.2 Category Analysis

In [None]:
# Count plot for categories
# Tally rows per category first; the same counts feed the chart and the
# text summary printed below it.
category_counts = df['category'].value_counts()

count_fig, count_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=category_counts.index, y=category_counts.values, ax=count_ax)
count_ax.set_title('Distribution by Category')
count_ax.set_xlabel('Category')
count_ax.set_ylabel('Count')
plt.show()

print("\nCategory Distribution:", category_counts, sep="\n")

### 4.3 Time Series Analysis

In [None]:
# Time series plot
line_style = dict(marker='o', linestyle='-', linewidth=2, markersize=4)

ts_fig, ts_ax = plt.subplots(figsize=(15, 6))
ts_ax.plot(df['date'], df['value'], **line_style)
ts_ax.set_title('Value Over Time')
ts_ax.set_xlabel('Date')
ts_ax.set_ylabel('Value')
# Tilt the date tick labels (applies to the current axes, i.e. ts_ax)
plt.xticks(rotation=45)
ts_ax.grid(True, alpha=0.3)
ts_fig.tight_layout()
plt.show()

### 4.4 Interactive Plots with Plotly

In [None]:
# Interactive scatter plot
# Points are coloured by category; the date is added to each hover tooltip.
scatter_options = dict(
    x='value',
    y='amount',
    color='category',
    title='Value vs Amount by Category',
    hover_data=['date'],
)
fig = px.scatter(df, **scatter_options)
fig.show()

In [None]:
# Interactive line plot
# One trace per category, drawn against the date axis.
line_options = dict(
    x='date',
    y='value',
    color='category',
    title='Value Over Time by Category',
)
fig = px.line(df, **line_options)
fig.show()

### 4.5 Box Plots and Violin Plots

In [None]:
# Box plot (left panel) and violin plot (right panel), both split by category
# Each entry: (seaborn plotting function, column to plot, axis label, plot kind)
category_panels = [
    (sns.boxplot, 'value', 'Value', 'Box Plot'),
    (sns.violinplot, 'amount', 'Amount', 'Violin Plot'),
]

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

for panel_ax, (draw_panel, column, label, plot_kind) in zip(axes, category_panels):
    draw_panel(data=df, x='category', y=column, ax=panel_ax)
    panel_ax.set(
        title=f'{label} Distribution by Category ({plot_kind})',
        xlabel='Category',
        ylabel=label,
    )

plt.tight_layout()
plt.show()

### 4.6 Correlation Analysis

In [None]:
# Correlation matrix for numerical columns
# Restrict to numeric dtypes before computing pairwise correlations.
numerical_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numerical_cols].corr()

heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,  # centre the colour scale on zero correlation
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, ax=heatmap_ax, **heatmap_options)
heatmap_ax.set_title('Correlation Matrix')
heatmap_fig.tight_layout()
plt.show()

## 5. Summary Statistics by Category

In [None]:
# Group by category and calculate statistics
# The same set of statistics is computed for every summarised column.
summary_stats = ['mean', 'median', 'std', 'min', 'max']
summary_columns = ('value', 'amount')

aggregation_spec = {column: list(summary_stats) for column in summary_columns}
summary = df.groupby('category').agg(aggregation_spec)

print("Summary Statistics by Category:")
summary

## 6. Save Processed Data

In [None]:
# Save processed data
# df.to_csv('../data/processed/processed_data.csv', index=False)
# print("Processed data saved successfully!")

## 7. Conclusions

Write your conclusions and insights here:

- Key finding 1
- Key finding 2
- Key finding 3