In [1]:
# Task 1: Load and Explore the Dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris

# Load the Iris dataset and build a DataFrame: one column per feature plus a
# 'species' column holding the class name looked up from each integer target.
iris_data = load_iris()
df = pd.DataFrame(
    data=iris_data.data,
    columns=iris_data.feature_names,
).assign(species=iris_data.target_names[iris_data.target])

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(df.head())

# Dataset structure (column dtypes and non-null counts).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nDataset info (Data types and missing values):")
df.info()

# Count the missing values in each column
print("\nMissing values in the dataset:")
print(df.isnull().sum())

# Task 2: Basic Data Analysis
# Summary statistics for the numerical columns (header and table in one call;
# sep="\n" puts the table on the line after the header).
print("\nBasic statistics of numerical columns:", df.describe(), sep="\n")

# Mean of every measurement column, computed separately for each species
print("\nGroup by 'species' and compute mean:", df.groupby('species').mean(), sep="\n")

# Task 3: Data Visualization

# 1. Line Chart: sepal length plotted against the row index (for demonstration purposes)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df.index, df['sepal length (cm)'], label='Sepal Length')
ax.set_title('Line Chart: Sepal Length Trend')
ax.set_xlabel('Index')
ax.set_ylabel('Sepal Length (cm)')
ax.legend()
plt.show()

# 2. Bar Chart: petal length aggregated per species by seaborn's barplot
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x='species', y='petal length (cm)', ax=ax)
ax.set(
    title='Bar Chart: Average Petal Length per Species',
    xlabel='Species',
    ylabel='Average Petal Length (cm)',
)
plt.show()

# 3. Histogram: distribution of sepal width across all samples, in 15 bins
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(
    df['sepal width (cm)'],
    bins=15,
    edgecolor='black',
    color='skyblue',
)
ax.set_title('Histogram: Distribution of Sepal Width')
ax.set_xlabel('Sepal Width (cm)')
ax.set_ylabel('Frequency')
plt.show()

# 4. Scatter Plot: sepal length against petal length, colored by species
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='sepal length (cm)',
    y='petal length (cm)',
    hue='species',
    ax=ax,
)
ax.set(
    title='Scatter Plot: Sepal Length vs Petal Length',
    xlabel='Sepal Length (cm)',
    ylabel='Petal Length (cm)',
)
# Redraw the legend so its title is capitalized
ax.legend(title='Species')
plt.show()

# Task 4: Summary of Findings
# Observations:
# - The petal length is the most distinguishing feature between species.
# - The distribution of sepal width is single-peaked and roughly symmetric (not flat), with no major outliers.
# - There is a clear relationship between sepal length and petal length, especially when grouped by species.


ModuleNotFoundError: No module named 'sklearn'