In [1]:
# 1. Import all the required Python libraries
import pandas as pd           # For data manipulation and analysis
import numpy as np            # For numerical operations
import seaborn as sns         # For data visualization (optional)
import matplotlib.pyplot as plt  # For plotting (optional)

# 2. Load the dataset
# Dataset source: https://www.kaggle.com/datasets/uciml/iris
# "iris.csv" is a relative path, so the file must be reachable from the
# current working directory.
df = pd.read_csv("iris.csv")

# 3. Data Preprocessing
# Each entry pairs a heading with a callable that builds the report shown
# under it. The callable runs only after its heading has been printed, so
# the output order is the same as a plain sequence of print statements.
inspection_reports = (
    # Count of null (missing) values per column
    ("🔍 Missing values in each column:", lambda: df.isnull().sum()),
    # count, mean, std, min, quartiles and max of the numeric columns
    ("\n📊 Summary statistics of numeric columns:", lambda: df.describe()),
    # (rows, columns)
    ("\n📐 Shape of the dataset:", lambda: df.shape),
    # dtype of every column
    ("\n🧾 Column names and data types:", lambda: df.dtypes),
)
for heading, build_report in inspection_reports:
    print(heading)
    print(build_report())

# 4. Data Formatting and Normalization
# No conversion is applied in this step; the dtypes are printed a second
# time purely as a confirmation.
# A cast would look like: df['sepal.length'] = df['sepal.length'].astype(float)
print("\n✅ Data types after checking:")
column_types = df.dtypes
print(column_types)

# 5. Categorical Encoding
# Turn the text labels in 'variety' into integer category codes and keep
# them in a new column, leaving the original labels in place.
variety_categories = df['variety'].astype('category')
df['variety_encoded'] = variety_categories.cat.codes

# Put the label and its code next to each other for a quick visual check
print("\n🌸 Original and encoded 'variety' values:")
comparison_columns = ['variety', 'variety_encoded']
print(df[comparison_columns].head())

# 6. Final Preview
# First 5 rows of the DataFrame, now including 'variety_encoded'
print("\n🔚 First 5 rows of the final DataFrame:")
final_preview = df.head()
print(final_preview)

# (Optional) Visualize with pairplot if desired
# sns.pairplot(df, hue='variety')
# plt.show()


🔍 Missing values in each column:
sepal.length    0
sepal.width     0
petal.length    0
petal.width     0
variety         0
dtype: int64

📊 Summary statistics of numeric columns:
       sepal.length  sepal.width  petal.length  petal.width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000

📐 Shape of the dataset:
(150, 5)

🧾 Column names and data types:
sepal.length    float64
sepal.width     float64
petal.length    float64
petal.width     float64
variety          object
dtype: object

✅ Data types after checking:
sepal.length    float64
sepal.width     float64
peta