In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv("IRIS.csv")
print("Dataset Shape:", df.shape)
print("\nFirst 5 Rows:\n", df.head())

Dataset Shape: (129, 5)

First 5 Rows:
    sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


In [3]:
# Check number of classes 
pd.unique(df["species"])

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [4]:
# Preprocessing
# Encode the target variable
from sklearn.preprocessing import StandardScaler, LabelEncoder

if df["species"].dtype == 'object':
    le = LabelEncoder()
    df["species"] = le.fit_transform(df["species"])

pd.unique(df["species"])

array([0, 1, 2])

In [5]:
# Check for dataset imbalance
print("Number of samples in each class:", df["species"].value_counts())


Number of samples in each class: species
0    50
1    50
2    29
Name: count, dtype: int64


In [8]:
# Now balance the dataset
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

# compute class weights for each instance
class_weights = compute_class_weight(class_weight = 'balanced', classes = np.unique(df["species"]), y = df["species"])
class_weights_dict = {cls: weight for cls, weight in zip(np.unique(df["species"]), class_weights)}

# compute sample weights for each instance
sample_weights = compute_sample_weight(class_weight = class_weights_dict, y = df["species"])

class_weights_dict

{np.int64(0): np.float64(0.86),
 np.int64(1): np.float64(0.86),
 np.int64(2): np.float64(1.4827586206896552)}

In [9]:
# Split features and target variable
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
 
X = df.drop("species", axis = 1)
y = df["species"]

# feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [10]:
# train-test split
X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split(X_scaled, y, sample_weights, test_size = 0.3, stratify = y, random_state = 42)

#### Assignment 1 - perform EDA on the dataset

In [None]:
# ASSIGNMENT BEGIN
# correlation analysis, time series analysis, anomaly detection, null/missing values 


In [12]:
#ASSIGNMENT END
# Multiclass Classification without dataset balancing
# LogReg (OVR)
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score, log_loss, roc_auc_score, roc_curve

lr_ovr = OneVsRestClassifier(LogisticRegression())
lr_ovr.fit(X_train, y_train)
y_pred_lrovr = lr_ovr.predict(X_test)

print("Logistic Regression (OVR) Performance: ")
print(classification_report(y_test, y_pred_lrovr))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lrovr))


Logistic Regression (OVR) Performance: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.93      0.87      0.90        15
           2       0.80      0.89      0.84         9

    accuracy                           0.92        39
   macro avg       0.91      0.92      0.91        39
weighted avg       0.93      0.92      0.92        39

Confusion Matrix:
 [[15  0  0]
 [ 0 13  2]
 [ 0  1  8]]


In [15]:
from sklearn import set_config

# Enable metadata routing for OVRClassifier
set_config(enable_metadata_routing = True)

# train model with sample weights using OVR with Logistic Regression
clf = OneVsRestClassifier(LogisticRegression(class_weight = class_weights_dict, max_iter=10000))
clf.estimator.set_fit_request(sample_weight = True)  # Explicitly request sample_weight
clf.fit(X_train, y_train, sample_weight = sw_train)

y_pred = clf.predict(X_test)

# Classification Report and Confusion Matrix
print("\nLogistic Regression (One-vs-Rest) Performance balanced weights:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Logistic Regression (One-vs-Rest) Performance balanced weights:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      0.60      0.75        15
           2       0.60      1.00      0.75         9

    accuracy                           0.85        39
   macro avg       0.87      0.87      0.83        39
weighted avg       0.91      0.85      0.85        39

Confusion Matrix:
[[15  0  0]
 [ 0  9  6]
 [ 0  0  9]]
