# Pandas + ML — Part 3
End-to-end mini project:
- Load cleaned Iris data
- Train/test split
- Simple ML model (Logistic Regression)
- Evaluate with accuracy & confusion matrix
- Inspect the model coefficients as a rough indication of feature importance


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

%matplotlib inline

# Input for this notebook: the Iris CSV that part 2 produced.
# NOTE(review): the path is relative, so it resolves against the kernel's
# working directory — confirm where the notebook is launched from.
CSV_PATH = "ml_projects/data/iris_cleaned_engineered.csv"

df = pd.read_csv(filepath_or_buffer=CSV_PATH)

# Last expression in the cell: renders a preview of the first rows.
df.head()

In [None]:
# Separate the model inputs (X) from the label to predict (y).
# `species` is the target. `species_index` is removed from the inputs as
# well — presumably an encoded copy of the label that would leak the target;
# confirm against part 2.
X = df.drop(['species', 'species_index'], axis=1)
y = df['species']

print(f"Feature shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Last expression in the cell: renders a preview of the feature columns.
X.head()

In [None]:
# Hold out 30% of the rows for evaluation.
# - stratify=y keeps the class proportions the same in both partitions.
# - random_state=42 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
print(f"Train size: {X_train.shape} Test size: {X_test.shape}")

In [None]:
# Baseline classifier: logistic regression fitted on the training partition.
# max_iter=200 raises the solver's iteration cap above scikit-learn's
# default of 100. `fit` returns the estimator itself, so it can be chained.
model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Score the held-out rows: fraction of test labels predicted exactly.
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)

In [None]:
# Confusion matrix on the test partition.
# The same label order (the classes the model learned) is used both to build
# the matrix and to label its axes, so rows/columns and tick labels agree.
class_labels = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=class_labels,
)
disp.plot(cmap='Blues')

# Title must be set after plot(), which is what creates the figure/axes.
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Tabulate the fitted coefficients: one row per class, one column per feature.
# NOTE(review): no feature scaling is visible in this notebook, so coefficient
# magnitudes depend on each feature's units — treat them as a rough indication
# of importance, not a ranking.
coef_df = pd.DataFrame(
    data=model.coef_,
    index=model.classes_,
    columns=X.columns,
)

# Last expression in the cell: show features as rows and classes as columns.
coef_df.transpose()

## Next steps
- Try other models: DecisionTreeClassifier, RandomForestClassifier
- Add cross-validation
- Compare models with accuracy and F1-score
- Export trained model with joblib
