<a href="https://colab.research.google.com/github/gj0210/CMP7239/blob/main/Welcome_to_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
import joblib
import streamlit as st

class ModelTrainer:
    """Tune, keep, persist and summarise three scikit-learn classifiers.

    Every ``train_*`` method runs an accuracy-scored ``GridSearchCV`` over a
    default or caller-supplied parameter grid, then records the best
    estimator in ``self.models`` and its parameters in ``self.best_params``
    under the keys ``'KNN'``, ``'Decision_Tree'`` and ``'Random_Forest'``.
    Progress and save notifications are emitted through Streamlit (``st``).
    """

    # Depth assumed for a forest whose depth is unbounded and whose fitted
    # trees cannot be inspected (kept from the original approximation).
    _FALLBACK_DEPTH = 10

    def __init__(self):
        # name -> best estimator found by the grid search
        self.models = {}
        # name -> parameter dict of that best estimator
        self.best_params = {}

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _feature_importance(X_train, importances):
        """Map feature names to importances.

        Uses ``X_train.columns`` when the input has that attribute (e.g. a
        DataFrame); otherwise falls back to ``feature_<i>`` names so array or
        list inputs do not fail after the search has already completed.
        """
        names = getattr(X_train, 'columns', None)
        if names is None:
            names = [f"feature_{i}" for i in range(len(importances))]
        return dict(zip(names, importances))

    @classmethod
    def _forest_depth(cls, forest):
        """Return the depth used in the Random Forest complexity score.

        The configured ``max_depth`` wins when it is set. For an unbounded
        forest the depth of the deepest fitted tree is used; if the fitted
        trees are not available the historical constant is returned.
        """
        if forest.max_depth:
            return forest.max_depth
        trees = getattr(forest, 'estimators_', None)
        if trees:
            return max(tree.get_depth() for tree in trees)
        return cls._FALLBACK_DEPTH

    def _tune(self, name, estimator, X_train, y_train, param_grid, cv):
        """Grid-search ``estimator``, store the winner under ``name``.

        Returns the result dict shared by all ``train_*`` methods with the
        keys ``model``, ``best_params``, ``best_score``, ``cv_scores``,
        ``cv_mean`` and ``cv_std``.
        """
        search = GridSearchCV(
            estimator, param_grid, cv=cv, scoring='accuracy', n_jobs=-1
        )
        search.fit(X_train, y_train)

        best_model = search.best_estimator_
        self.models[name] = best_model
        self.best_params[name] = search.best_params_

        # Second cross-validation pass on the selected estimator, using the
        # estimator's default scorer, to expose the per-fold scores.
        cv_scores = cross_val_score(best_model, X_train, y_train, cv=cv)

        return {
            'model': best_model,
            'best_params': search.best_params_,
            'best_score': search.best_score_,
            'cv_scores': cv_scores,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std()
        }

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------
    def train_knn(self, X_train, y_train, param_grid=None, cv=5):
        """
        Train K-Nearest Neighbors classifier with hyperparameter tuning.

        Args:
            X_train: training features.
            y_train: training labels.
            param_grid: grid passed to ``GridSearchCV``; a default grid over
                ``n_neighbors``, ``weights`` and ``metric`` is used when None.
            cv: cross-validation setting forwarded to scikit-learn.

        Returns:
            dict with ``model``, ``best_params``, ``best_score``,
            ``cv_scores``, ``cv_mean`` and ``cv_std``.
        """
        if param_grid is None:
            param_grid = {
                'n_neighbors': [3, 5, 7, 9, 11],
                'weights': ['uniform', 'distance'],
                'metric': ['euclidean', 'manhattan']
            }

        return self._tune(
            'KNN', KNeighborsClassifier(), X_train, y_train, param_grid, cv
        )

    def train_decision_tree(self, X_train, y_train, param_grid=None, cv=5):
        """
        Train Decision Tree classifier with hyperparameter tuning.

        Args:
            X_train: training features; a DataFrame supplies the feature
                names, any other input gets ``feature_<i>`` names.
            y_train: training labels.
            param_grid: grid passed to ``GridSearchCV``; a default grid over
                depth, split/leaf sizes and criterion is used when None.
            cv: cross-validation setting forwarded to scikit-learn.

        Returns:
            The common result dict plus ``feature_importance`` (feature name
            -> importance of the best tree).
        """
        if param_grid is None:
            param_grid = {
                'max_depth': [3, 5, 7, 10, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'criterion': ['gini', 'entropy']
            }

        result = self._tune(
            'Decision_Tree', DecisionTreeClassifier(random_state=42),
            X_train, y_train, param_grid, cv
        )
        result['feature_importance'] = self._feature_importance(
            X_train, result['model'].feature_importances_
        )
        return result

    def train_random_forest(self, X_train, y_train, param_grid=None, cv=5):
        """
        Train Random Forest classifier with hyperparameter tuning.

        Args:
            X_train: training features; a DataFrame supplies the feature
                names, any other input gets ``feature_<i>`` names.
            y_train: training labels.
            param_grid: grid passed to ``GridSearchCV``; a default grid over
                forest size, depth, split/leaf sizes and ``max_features`` is
                used when None.
            cv: cross-validation setting forwarded to scikit-learn.

        Returns:
            The common result dict plus ``feature_importance`` (feature name
            -> importance of the best forest).
        """
        if param_grid is None:
            param_grid = {
                'n_estimators': [50, 100, 200],
                'max_depth': [3, 5, 7, 10, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': ['sqrt', 'log2']
            }

        result = self._tune(
            'Random_Forest', RandomForestClassifier(random_state=42),
            X_train, y_train, param_grid, cv
        )
        result['feature_importance'] = self._feature_importance(
            X_train, result['model'].feature_importances_
        )
        return result

    def train_all_models(self, X_train, y_train, cv=5):
        """
        Train all models with optimized hyperparameters.

        Runs the three ``train_*`` methods with their default grids, posting a
        Streamlit info message before each one.

        Returns:
            dict keyed by ``'KNN'``, ``'Decision_Tree'`` and
            ``'Random_Forest'`` holding each method's result dict.
        """
        results = {}

        st.info("Training K-Nearest Neighbors...")
        results['KNN'] = self.train_knn(X_train, y_train, cv=cv)

        st.info("Training Decision Tree...")
        results['Decision_Tree'] = self.train_decision_tree(X_train, y_train, cv=cv)

        st.info("Training Random Forest...")
        results['Random_Forest'] = self.train_random_forest(X_train, y_train, cv=cv)

        return results

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------
    def save_models(self, filepath_prefix='model'):
        """
        Save trained models to disk.

        Each stored model is written to
        ``<filepath_prefix>_<lower-cased name>.joblib`` and a Streamlit
        success message is shown per file.
        """
        for name, model in self.models.items():
            filename = f"{filepath_prefix}_{name.lower()}.joblib"
            joblib.dump(model, filename)
            st.success(f"Model {name} saved as {filename}")

    def load_model(self, filepath):
        """
        Load a model from disk.

        The loaded object is returned to the caller; it is not added to
        ``self.models``.
        """
        # SECURITY: joblib files are pickle-based, so loading one can execute
        # arbitrary code. Only load files from a trusted source.
        return joblib.load(filepath)

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------
    def get_model_complexity(self):
        """
        Get complexity information for each model.

        Returns:
            dict with one entry per trained model, holding its key
            hyperparameters and a ``complexity_score``. The scores use a
            different scale per model family and are not comparable across
            families.
        """
        complexity = {}

        if 'KNN' in self.models:
            knn = self.models['KNN']
            complexity['KNN'] = {
                'n_neighbors': knn.n_neighbors,
                'weights': knn.weights,
                'metric': knn.metric,
                'complexity_score': knn.n_neighbors  # Lower is more complex
            }

        if 'Decision_Tree' in self.models:
            dt = self.models['Decision_Tree']
            n_leaves = dt.get_n_leaves()
            complexity['Decision_Tree'] = {
                'max_depth': dt.max_depth,
                'min_samples_split': dt.min_samples_split,
                'min_samples_leaf': dt.min_samples_leaf,
                'n_leaves': n_leaves,
                'complexity_score': n_leaves  # More leaves = more complex
            }

        if 'Random_Forest' in self.models:
            rf = self.models['Random_Forest']
            complexity['Random_Forest'] = {
                'n_estimators': rf.n_estimators,
                'max_depth': rf.max_depth,
                'min_samples_split': rf.min_samples_split,
                'min_samples_leaf': rf.min_samples_leaf,
                # Approximation: number of trees times (configured or
                # observed) depth.
                'complexity_score': rf.n_estimators * self._forest_depth(rf)
            }

        return complexity


ModuleNotFoundError: No module named 'streamlit'

In [2]:
%pip install streamlit

Collecting streamlit
  Downloading streamlit-1.48.0-py3-none-any.whl.metadata (9.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.48.0-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m67.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInst

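The error `ModuleNotFoundError: No module named 'streamlit'` occurs because the `streamlit` library, which is imported in the first code cell, is not installed in the current Python environment. To fix this, install the library: the `%pip install streamlit` command in the previous cell does so inside the running kernel's environment. After the installation finishes, re-run the first code cell so that the import succeeds and `ModelTrainer` is defined. For the notebook to work with "Restart and run all", the install cell must be placed before the cell that imports `streamlit`.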