In [29]:
# Import required libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [30]:
# Put the parent directory on the import path so the `src` package resolves from this notebook
sys.path.append('../')
from src.data_utils import (
    load_and_clean_data,
    prepare_features_and_target,
    split_and_scale_data,
)

In [31]:
# Base data directory (relative to the notebook's working directory); the processed
# outputs are later written beneath it.
DATA_DIR = '../Data'
# Create it up front; exist_ok=True makes this a no-op when the directory already exists.
os.makedirs(DATA_DIR, exist_ok=True)

In [32]:
# Fetch the Iris dataset bundled with scikit-learn and persist it as a raw CSV.
# pandas and os are already imported in the first cell, so only load_iris is imported here.
from sklearn.datasets import load_iris

# NOTE(review): this path is relative to the notebook's working directory and is NOT under
# DATA_DIR ('../Data'), where the processed outputs are written — confirm this is intentional.
data_path="data/raw/iris.csv"

# as_frame=True exposes the data as a DataFrame (.frame) that includes the 'target' column.
iris = load_iris(as_frame=True)
df = iris.frame

# Ensure the destination folder exists before writing; index=False keeps the row index out of the file.
os.makedirs(os.path.dirname(data_path), exist_ok=True)
df.to_csv(data_path, index=False)
print(f"Iris dataset saved to {data_path}")
print(f"Dataset shape: {df.shape}")

Iris dataset saved to data/raw/iris.csv
Dataset shape: (150, 5)


In [37]:
# Reload the dataset from the CSV written earlier, so the rest of the notebook
# works from the on-disk copy rather than the in-memory frame.
df = pd.read_csv(data_path)

# Preview the first rows (head() defaults to 5); as the cell's last expression
# the result is rendered as a table.
print("First 5 rows of the dataset:")
df.head()

First 5 rows of the dataset:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [13]:
# Dataset summary: info() prints column names, non-null counts and dtypes straight to stdout.
print("\nDataset summary:")
df.info()

# Statistical summary: describe() returns per-column count, mean, std, min, quartiles and max;
# it is displayed because it is the last expression in the cell.
print("\nStatistical summary:")
df.describe()


Dataset summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB

Statistical summary:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [14]:
# Count null (NaN/None) entries per column; a column of zeros means there is nothing to impute.
print("\nMissing values per column:")
print(df.isnull().sum())


Missing values per column:
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64


In [15]:
# A zero is not a valid value for any of these measurements, so zeros are treated as
# potentially missing data and counted per column.
measurement_columns = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]
zero_counts = {name: (df[name] == 0).sum() for name in measurement_columns}

print("\nZero values (potentially missing data):")
n_rows = len(df)
for name, n_zero in zero_counts.items():
    # Report the raw count alongside its share of all rows.
    print(f"{name}: {n_zero} zeros ({n_zero/n_rows*100:.2f}% of data)")


Zero values (potentially missing data):
sepal length (cm): 0 zeros (0.00% of data)
sepal width (cm): 0 zeros (0.00% of data)
petal length (cm): 0 zeros (0.00% of data)
petal width (cm): 0 zeros (0.00% of data)


In [16]:
# Class balance: share of rows per target value, expressed in percent.
outcome_counts = df['target'].value_counts(normalize=True) * 100

# Display label for each integer class code, in the order they are reported.
class_labels = {
    0: "Iris setosa (0)",
    1: "Iris-versicolor(1)",
    2: "Iris-virginica(2)",
}
for class_code, label in class_labels.items():
    print(f"Percentage of {label}: {outcome_counts[class_code]:.2f}%")

Percentage of Iris setosa (0): 33.33%
Percentage of Iris-versicolor(1): 33.33%
Percentage of Iris-virginica(2): 33.33%


In [17]:
# Visualize feature correlations.
# `corr` is kept as a notebook-level variable because the next cell reads corr['target'].
corr = df.corr()

# Draw the heatmap with matplotlib (already imported) so the figure is never left empty.
fig, ax = plt.subplots(figsize=(12, 10))
# Pin the colour scale to the full correlation range so 0 sits at the midpoint of the colormap.
im = ax.imshow(corr.to_numpy(), cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(im, ax=ax)

# Label both axes with the column names.
ax.set_xticks(range(len(corr.columns)))
ax.set_xticklabels(corr.columns, rotation=45, ha='right')
ax.set_yticks(range(len(corr.index)))
ax.set_yticklabels(corr.index)

# Annotate every cell with its coefficient, rounded to two decimals.
for i in range(corr.shape[0]):
    for j in range(corr.shape[1]):
        ax.text(j, i, f"{corr.iat[i, j]:.2f}", ha='center', va='center')

ax.set_title('Feature Correlations')
plt.show()

<Figure size 1200x1000 with 0 Axes>

In [19]:
# Rank every column by its correlation with the target, largest signed value first.
# The target's own entry (1.0) therefore comes first and negative correlations come last.
# Note: the target is an integer class code, so this treats the class label as an ordered number.
target_corr = corr['target'].sort_values(ascending=False)
print("\nFeature correlations with species outcome:")
print(target_corr)


Feature correlations with species outcome:
target               1.000000
petal width (cm)     0.956547
petal length (cm)    0.949035
sepal length (cm)    0.782561
sepal width (cm)    -0.426658
Name: target, dtype: float64


In [20]:
# Run the project's cleaning helper on the saved CSV.
# NOTE(review): load_and_clean_data is defined in src/data_utils and not shown here — presumably
# it returns a cleaned DataFrame; confirm which cleaning steps it applies.
# NOTE(review): the feature-engineering and split cells further down operate on `df`,
# not `df_cleaned` — confirm that is intended.
df_cleaned = load_and_clean_data(data_path)

In [21]:
# Statistical summary of the cleaned data, for comparison with the pre-cleaning summary above.
# Describes df_cleaned (not df) so that the effect of load_and_clean_data is actually visible.
# NOTE(review): assumes load_and_clean_data returns a pandas DataFrame — confirm.
print("\nStatistical summary:")
df_cleaned.describe()


Statistical summary:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [38]:
# Feature engineering: switch to snake_case column names and add two area features.
#
# Written as rename-then-assign on a new frame (no inplace mutation) so the cell can be
# re-run safely: rename() ignores source names that are no longer present, and assign()
# overwrites SepalArea/PetalArea in place if they already exist. Column order and values
# are the same as computing the areas first and renaming afterwards.
# NOTE(review): this operates on `df`, not on `df_cleaned` — confirm that is intended.
COLUMN_RENAMES = {
    "sepal length (cm)": "sepal_length",
    "sepal width (cm)": "sepal_width",
    "petal length (cm)": "petal_length",
    "petal width (cm)": "petal_width",
}

df = (
    df.rename(columns=COLUMN_RENAMES)
    .assign(
        # length * width of each flower part
        SepalArea=lambda frame: frame["sepal_length"] * frame["sepal_width"],
        PetalArea=lambda frame: frame["petal_length"] * frame["petal_width"],
    )
)

In [39]:
# Preview the frame after feature engineering: the measurement columns are now snake_case
# and SepalArea / PetalArea appear as the last two columns.
print("Dataset with new features:")
df.head()

Dataset with new features:


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target,SepalArea,PetalArea
0,5.1,3.5,1.4,0.2,0,17.85,0.28
1,4.9,3.0,1.4,0.2,0,14.7,0.28
2,4.7,3.2,1.3,0.2,0,15.04,0.26
3,4.6,3.1,1.5,0.2,0,14.26,0.3
4,5.0,3.6,1.4,0.2,0,18.0,0.28


In [40]:
# Separate the predictor columns from the 'target' label column
X, y = prepare_features_and_target(df, target_column='target')

# Hold out 20% of the rows for testing (fixed seed for a repeatable split) and scale the features;
# the scaler used is returned alongside the splits.
X_train_scaled, X_test_scaled, y_train, y_test, scaler = split_and_scale_data(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows landed in each split
n_train, n_test = X_train_scaled.shape[0], X_test_scaled.shape[0]
print(f"Training set: {n_train} samples")
print(f"Testing set: {n_test} samples")

Training set: 120 samples
Testing set: 30 samples


In [41]:
# Directory for processed outputs, nested under DATA_DIR.
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, 'processed')
# exist_ok=True makes this a no-op when the directory already exists.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

In [42]:
# Persist the prepared datasets and the objects needed to reuse them
import joblib

# Destination of each artifact inside the processed-data directory
train_data_path = os.path.join(PROCESSED_DATA_DIR, 'train_data.npz')
test_data_path = os.path.join(PROCESSED_DATA_DIR, 'test_data.npz')
scaler_path = os.path.join(PROCESSED_DATA_DIR, 'scaler.joblib')
feature_names_path = os.path.join(PROCESSED_DATA_DIR, 'feature_names.joblib')

# One .npz archive per split, with the arrays stored under the keys 'X' and 'y'
np.savez(train_data_path, X=X_train_scaled, y=y_train)
np.savez(test_data_path, X=X_test_scaled, y=y_test)

# Keep the scaler returned by split_and_scale_data for later use
joblib.dump(scaler, scaler_path)

# Keep the feature column names for reference
joblib.dump(X.columns.tolist(), feature_names_path)

# Confirm where everything was written
saved_artifacts = (
    ("processed training data", train_data_path),
    ("processed testing data", test_data_path),
    ("scaler", scaler_path),
    ("feature names", feature_names_path),
)
for description, location in saved_artifacts:
    print(f"Saved {description} to {location}")

Saved processed training data to ../Data\processed\train_data.npz
Saved processed testing data to ../Data\processed\test_data.npz
Saved scaler to ../Data\processed\scaler.joblib
Saved feature names to ../Data\processed\feature_names.joblib


In [43]:
# Also save the feature-engineered dataframe for reference. This is `df` itself (renamed columns
# plus SepalArea/PetalArea, including 'target'), i.e. the unscaled values — the scaled splits
# are stored separately in the .npz files.
processed_data_path = os.path.join(PROCESSED_DATA_DIR, 'processed_iris.csv')
# index=False keeps the row index out of the CSV.
df.to_csv(processed_data_path, index=False)
print(f"Saved processed dataframe to {processed_data_path}")

Saved processed dataframe to ../Data\processed\processed_iris.csv
