In [None]:
# -------------------------------
# Data Analysis & Visualization Project
# Using the Iris Dataset
# -------------------------------

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

# Task 1: Load and Explore the Dataset

# Load the Iris dataset into `df`. Only the loading step is guarded, so the
# "loading" error messages are not printed for unrelated failures. After
# reporting, the error is re-raised so the later analysis and plotting steps
# never run against a missing (or stale, in a notebook session) `df`.
try:
    # as_frame=True makes sklearn return pandas objects; `.frame` is the
    # DataFrame with the feature columns plus the integer `target` column.
    iris = load_iris(as_frame=True)
    df = iris.frame
    # Translate the integer class codes into readable species names.
    df['species'] = df['target'].map(dict(enumerate(iris.target_names)))
except FileNotFoundError:
    print("Error: File not found. Please check the dataset path.")
    raise
except Exception as e:
    print("An error occurred while loading the dataset:", str(e))
    raise

# Display first few rows
print("First 5 rows of the dataset:\n")
print(df.head(), "\n")

# Check structure of dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
print("Dataset Info:\n")
df.info()
print()

# Check for missing values
print("Missing values:\n")
print(df.isnull().sum(), "\n")

# Clean dataset (no missing values in Iris, but example code)
df = df.dropna()

# Task 2: Basic Data Analysis

# Descriptive statistics for the numeric columns of the dataset
print("Basic Statistics of Numerical Columns:\n")
summary_stats = df.describe()
print(summary_stats, "\n")

# Per-species mean of each of the four flower measurements
measurement_columns = [
    "sepal length (cm)",
    "sepal width (cm)",
    "petal length (cm)",
    "petal width (cm)",
]
by_species = df.groupby("species")
group_means = by_species[measurement_columns].mean()
print("Mean values per species:\n")
print(group_means, "\n")

# Hand-written takeaway from the table above (not computed from the data)
print("Observation: Iris-setosa generally has much smaller petal length/width compared to other species.\n")


# Task 3: Data Visualization

# a. Line chart: sepal length of the first 30 rows, plotted against row index
first_30_sepal_length = df["sepal length (cm)"].head(30)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(first_30_sepal_length.index, first_30_sepal_length, marker="o")
ax.set_title("Line Chart: Sepal Length of First 30 Samples")
ax.set_xlabel("Sample Index")
ax.set_ylabel("Sepal Length (cm)")
ax.grid(True)
plt.show()

# b. Bar chart (average petal length per species)
# sns.barplot aggregates the rows of each species with its default estimator
# (the mean), so each bar height is that species' average petal length.
# `errorbar=None` disables the error bars; it replaces the `ci=None` spelling
# that seaborn deprecated in v0.12 (and which now emits a FutureWarning).
plt.figure(figsize=(8,5))
sns.barplot(x="species", y="petal length (cm)", data=df, errorbar=None)
plt.title("Average Petal Length per Species")
plt.xlabel("Species")
plt.ylabel("Average Petal Length (cm)")
plt.show()

# c. Histogram: how sepal width is distributed across all samples (15 bins)
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(
    df["sepal width (cm)"],
    bins=15,
    color="skyblue",
    edgecolor="black",
)
ax.set(
    title="Histogram: Sepal Width Distribution",
    xlabel="Sepal Width (cm)",
    ylabel="Frequency",
)
plt.show()

# d. Scatter plot: sepal length against petal length, one colour per species
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=df,
    x="sepal length (cm)",
    y="petal length (cm)",
    hue="species",
    s=70,
    ax=ax,
)
ax.set_title("Scatter Plot: Sepal Length vs Petal Length")
ax.set_xlabel("Sepal Length (cm)")
ax.set_ylabel("Petal Length (cm)")
# Rebuild the legend so it carries a capitalised title
ax.legend(title="Species")
plt.show()
