# Data Analysis Practice Notebook

This notebook covers data analysis, visualization, and best practices using Python.

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Data Generation and Loading
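Generate a 100-row synthetic dataset (category, sales, profit, discount, quantity) that stands in for a real business dataset; nothing is loaded from disk in this notebook.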

In [None]:
# Generating sample data
# A dedicated, seeded generator makes the dataset (and every plot, statistic
# and CSV derived from it) identical on each Restart & Run All.
SEED = 42
N_ROWS = 100
rng = np.random.default_rng(SEED)

print("Generating sample data...")
data = {
    "Category": rng.choice(["A", "B", "C", "D"], size=N_ROWS),
    # Upper bound is exclusive, so Sales falls in 100..999.
    "Sales": rng.integers(100, 1000, size=N_ROWS),
    "Profit": rng.uniform(10, 500, size=N_ROWS),
    "Discount": rng.uniform(0, 0.5, size=N_ROWS),
    # Upper bound is exclusive, so Quantity falls in 1..9.
    "Quantity": rng.integers(1, 10, size=N_ROWS),
}
df = pd.DataFrame(data)
print("Sample Data:")
print(df.head())

## 2. Data Cleaning
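Count the missing values in each column. This notebook only reports the counts; if any were non-zero, they would need to be handled (for example, dropped or imputed) before the analysis below.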

In [None]:
# Count the null entries in each column and print the per-column totals
print("\nChecking for missing values...")
missing_per_column = df.isna().sum()
print(missing_per_column)

## 3. Summary Statistics
Understanding dataset characteristics with descriptive statistics.

In [None]:
# Descriptive statistics as produced by DataFrame.describe() with its defaults
print("\nSummary Statistics:")
summary_stats = df.describe()
print(summary_stats)

## 4. Data Visualization
Using various plots to analyze and interpret data.

In [None]:
# Apply seaborn's "whitegrid" theme; this is global and also styles later figures
sns.set_theme(style="whitegrid")

# Sales Distribution - 20-bin histogram of Sales with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(data=df, x="Sales", bins=20, kde=True, color="blue", ax=ax)
ax.set_title("Sales Distribution")
ax.set_xlabel("Sales")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Profit vs. Sales Scatter Plot - one point per row, coloured by Category
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    data=df,
    x="Sales",
    y="Profit",
    hue="Category",
    palette="viridis",
    ax=ax,
)
ax.set_title("Profit vs. Sales by Category")
ax.set_xlabel("Sales")
ax.set_ylabel("Profit")
plt.show()

In [None]:
# Boxplot for Discount by Category - one box per Category value
# NOTE(review): recent seaborn releases reportedly warn when `palette` is passed
# without `hue` - confirm against the installed seaborn version.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x="Category", y="Discount", palette="pastel", ax=ax)
ax.set_title("Discount Distribution by Category")
ax.set_xlabel("Category")
ax.set_ylabel("Discount")
plt.show()

## 5. Best Practices for Data Analysis
Guidelines to ensure effective and efficient analysis.

In [None]:
# Listing best practices: the messages live in one tuple and are printed in order
best_practices = (
    "1. Always check for missing values and outliers before analysis.",
    "2. Use appropriate visualizations to explore data distributions and relationships.",
    "3. Normalize or scale data when necessary for better model performance.",
    "4. Ensure data consistency and remove duplicates before analysis.",
    "5. Document assumptions and findings throughout the process for reproducibility.",
)

print("\nBest Practices:")
for practice in best_practices:
    print(practice)

## 6. Saving Processed Data
Write the DataFrame to `processed_data.csv` (relative to the current working directory, without the index column) so it can be reused later.

In [None]:
# Save processed data to CSV in the current working directory
output_file = "processed_data.csv"
df.to_csv(output_file, index=False)  # index=False leaves the row index out of the file
print("Processed data saved to processed_data.csv")