In [99]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
import pickle

In [78]:
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = iris.target

In [79]:
df.head

<bound method NDFrame.head of      sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                  5.1               3.5                1.4               0.2   
1                  4.9               3.0                1.4               0.2   
2                  4.7               3.2                1.3               0.2   
3                  4.6               3.1                1.5               0.2   
4                  5.0               3.6                1.4               0.2   
..                 ...               ...                ...               ...   
145                6.7               3.0                5.2               2.3   
146                6.3               2.5                5.0               1.9   
147                6.5               3.0                5.2               2.0   
148                6.2               3.4                5.4               2.3   
149                5.9               3.0                5.1               1.8  

In [80]:
df.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
species              0
dtype: int64

In [81]:
df.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'species'],
      dtype='object')

In [82]:
df.species.value_counts()

species
0    50
1    50
2    50
Name: count, dtype: int64

In [83]:
# # Map target values to species names
# df['species'] = df['species'].map({1: 'Setosa', 2: 'Versicolor', 3: 'Virginica'})

# Encode target variable

In [84]:
le = LabelEncoder()
df['species'] = le.fit_transform(df['species'])

In [85]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


#  Splitting Data into Training and Testing Sets

In [86]:
X = df.drop(columns=['species'])  # Features
y = df['species']  # Encoded Target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [87]:
X_train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
22,4.6,3.6,1.0,0.2
15,5.7,4.4,1.5,0.4
65,6.7,3.1,4.4,1.4
11,4.8,3.4,1.6,0.2
42,4.4,3.2,1.3,0.2


In [88]:
X_test.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
73,6.1,2.8,4.7,1.2
18,5.7,3.8,1.7,0.3
118,7.7,2.6,6.9,2.3
78,6.0,2.9,4.5,1.5
76,6.8,2.8,4.8,1.4


In [89]:
print("Train-Test Split Completed!")
print(f"Training Data Regression: {X_train.shape}, Testing Data Regression: {X_test.shape},")

Train-Test Split Completed!
Training Data Regression: (120, 4), Testing Data Regression: (30, 4),


In [90]:
# Apply StandardScaler only to Logistic Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Model

In [107]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}


In [108]:
accuracies = {}

In [109]:
for name, model in models.items():
    if name == "Logistic Regression":
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    accuracies[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")


Logistic Regression Accuracy: 1.0000
Random Forest Accuracy: 1.0000
Decision Tree Accuracy: 1.0000


In [110]:
# Find the best model
best_model_name = max(accuracies, key=accuracies.get)
best_model = models[best_model_name]
print(f"\nBest Model: {best_model_name} with Accuracy: {accuracies[best_model_name]:.4f}")



Best Model: Logistic Regression with Accuracy: 1.0000


# save the model

In [111]:
# Save models using pickle
with open("logistic_regression.pkl", "wb") as f:
    pickle.dump(log_reg_model, f)

with open("random_forest.pkl", "wb") as f:
    pickle.dump(rf_model, f)

with open("decision_tree.pkl", "wb") as f:
    pickle.dump(dt_model, f)

with open("scaler.pkl", "wb") as f:  # Save the scaler for later use
    pickle.dump(scaler, f)

print("Models saved successfully!")

Models saved successfully!


# Save the best model

In [112]:
# Save the best model
with open("best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

print("Best model saved as best_model.pkl")


Best model saved as best_model.pkl
