In [1]:
# preprocessing_iris.py

import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Load dataset
# Build one frame holding the four measurements plus a categorical
# `species` column whose categories are the Iris target names.
iris = load_iris()
species_labels = pd.Categorical.from_codes(iris.target, iris.target_names)
df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(
    species=species_labels
)

# 2. Preprocessing

# Report the number of missing entries in every column.
missing_counts = df.isnull().sum()
print("Missing values per column:")
print(missing_counts)

# Iris ships without gaps; the mean-imputation is kept so the same
# script still works on a dataset that does have missing numerics.
numeric_means = df.mean(numeric_only=True)
df.fillna(numeric_means, inplace=True)

# Rescale every measurement column with MinMaxScaler (default range [0, 1]).
feature_columns = iris.feature_names
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[feature_columns])
df[feature_columns] = scaled_values

# One-hot encode the class label and swap it in for the original column.
encoder = OneHotEncoder(sparse_output=False)
encoded_labels = encoder.fit_transform(df[['species']])
one_hot_names = encoder.get_feature_names_out(['species'])
encoded_df = pd.DataFrame(encoded_labels, columns=one_hot_names)
features_only = df.drop(columns=['species'])
df = pd.concat([features_only, encoded_df], axis=1)

print("\nFirst 5 rows after preprocessing:")
print(df.head())

# 3. Exploration

# Summary statistics of the preprocessed (scaled + one-hot) frame.
print("\nSummary statistics:")
summary = df.describe()
print(summary)

# All plots below use the raw, unscaled measurements; build that frame once.
raw_features = pd.DataFrame(iris.data, columns=iris.feature_names)

# Pairplot, coloured by species name.
labelled_raw = raw_features.assign(species=iris.target_names[iris.target])
sns.pairplot(labelled_raw, hue="species")
plt.savefig("pairplot.png")
plt.close()

# Correlation heatmap of the four measurements.
corr = raw_features.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(corr, annot=True, cmap="coolwarm", ax=heatmap_ax)
heatmap_ax.set_title("Correlation Heatmap")
heatmap_fig.savefig("heatmap.png")
plt.close(heatmap_fig)

# Boxplots for outlier detection, one box per measurement.
box_fig, box_ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=raw_features, ax=box_ax)
box_ax.set_title("Boxplot for Outlier Detection")
box_fig.savefig("boxplots.png")
plt.close(box_fig)

# 4. Split function
def train_test_split_custom(df, test_size=0.2, random_state=None):
    """Shuffle *df* and split it into train and test partitions.

    Args:
        df: DataFrame to split. It is not modified.
        test_size: Fraction of rows assigned to the test partition, between
            0 and 1 inclusive. The test row count is
            ``int(len(df) * test_size)``, i.e. truncated toward zero.
        random_state: Seed forwarded to ``DataFrame.sample`` for a
            reproducible shuffle. ``None`` gives a non-reproducible shuffle.

    Returns:
        A ``(train_df, test_df)`` tuple. Both are slices of the shuffled,
        re-indexed frame: the test partition holds the first rows and the
        train partition the remainder.

    Raises:
        ValueError: If *test_size* is outside the range [0, 1].
    """
    if not 0 <= test_size <= 1:
        # A negative fraction would slice from the wrong end and silently
        # invert the split; a fraction above 1 would leave train empty.
        raise ValueError(
            f"test_size must be between 0 and 1, got {test_size!r}"
        )

    # Seed only this shuffle. Calling np.random.seed() here would reset the
    # process-wide NumPy RNG as a side effect of splitting.
    shuffled_df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    test_len = int(len(df) * test_size)
    test_df = shuffled_df.iloc[:test_len]
    train_df = shuffled_df.iloc[test_len:]
    return train_df, test_df

# Example usage: hold out 20% of the preprocessed rows with a fixed seed
# and report the size of each partition.
split_options = {"test_size": 0.2, "random_state": 42}
train_df, test_df = train_test_split_custom(df, **split_options)
print(f"\nTrain size: {len(train_df)}, Test size: {len(test_df)}")



Missing values per column:
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
species              0
dtype: int64

First 5 rows after preprocessing:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0           0.222222          0.625000           0.067797          0.041667   
1           0.166667          0.416667           0.067797          0.041667   
2           0.111111          0.500000           0.050847          0.041667   
3           0.083333          0.458333           0.084746          0.041667   
4           0.194444          0.666667           0.067797          0.041667   

   species_setosa  species_versicolor  species_virginica  
0             1.0                 0.0                0.0  
1             1.0                 0.0                0.0  
2             1.0                 0.0                0.0  
3             1.0                 0.0                0.0  
4             1.0                 0.0   