In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report, r2_score
from xgboost import  XGBClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report, r2_score
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
from itertools import cycle
from sklearn.metrics import auc, confusion_matrix, f1_score, precision_score, recall_score, roc_curve
from sklearn.model_selection import learning_curve
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import RocCurveDisplay
from datetime import date

In [None]:
class EnhancedMLModel:
    def __init__(self):
        """Set up the label mappings and an empty, untrained state.

        Nothing is loaded or fitted here; ``train_models`` fills in the feature
        lists and the selected estimators.
        """
        # Numeric code returned to callers for each priority label.
        self.priority_mapping = {'High': 2, 'Medium': 1, 'Low': 0}
        self.priority_reverse_mapping = {2: 'High', 1: 'Medium', 0: 'Low'}
        # Fitted by Preprocessing_Features the first time it processes data.
        self.priority_encoder = LabelEncoder()
        self.user_tasks = None
        self.duration_features = None
        self.priority_features = None
        # Declared up front so that reading them before training yields None
        # instead of raising AttributeError.
        self.duration_model = None
        self.priority_model = None
        self._is_fitted = False

    def Preprocessing_Features(self, data, calendar_events, reminders, event_schedule, user_settings, workload_insights):
        """Build the numeric feature table used for training and prediction.

        Task rows are enriched with per-user workload averages, an integration
        count, calendar conflicts and a reminder count; derived ratio and
        interaction columns are then added. Numeric columns are mean-imputed and
        standardised, and non-numeric columns are dropped from the result.

        Fitted state (the priority label encoder, the imputation means and the
        scaler) is created on the first call and reused on later calls, so a
        single task is transformed with the statistics of the data seen first.
        None of the input frames is modified.

        Args:
            data: Task table with 'User ID', 'Priority', 'Status', 'Deadline',
                'Created At' and 'Estimated Duration (min)'. 'Deadline' and
                'Created At' are parsed as "<total minutes>:<seconds>" strings;
                anything else becomes NaT and the row is dropped.
            calendar_events: Table with 'User ID', 'Start Time', 'End Time'.
            reminders: Table with 'User ID'; rows are counted per user.
            event_schedule: Accepted for interface compatibility; not used here.
            user_settings: Table with 'User ID', 'Trello Linked', 'Email Linked'
                ('Yes' marks a linked integration).
            workload_insights: Table with 'User ID', 'Total Tasks',
                'Completed Tasks', 'Avg. Task Duration', 'Productivity Score'.

        Returns:
            DataFrame restricted to float64/int64/int32/datetime64[ns] columns.
            'actual_duration' (minutes), 'priority_encoded' (label-encoder codes)
            and 'User ID' are left unscaled; other float64/int64 columns are
            standardised.

        Raises:
            ValueError: If no row has a parseable, positive actual duration.
        """
        def parse_clock_string(raw_value):
            """Convert "<total minutes>:<seconds>" to a Timestamp, or NaT on failure."""
            try:
                total_minutes, seconds = map(float, raw_value.split(':'))
                hours = int(total_minutes // 60)
                minutes = int(total_minutes % 60)
                time_str = f"{hours:02}:{minutes:02}:{int(seconds):02}"
                return pd.to_datetime(time_str, format='%H:%M:%S')
            except (AttributeError, TypeError, ValueError, OverflowError):
                # Non-string input, wrong number of parts, non-numeric parts or
                # an out-of-range clock value: treat the cell as missing.
                return pd.NaT

        def minutes_since_midnight(stamps):
            """Return the time-of-day of each timestamp as fractional minutes."""
            return stamps.dt.hour * 60 + stamps.dt.minute + stamps.dt.second / 60

        features = data.copy()

        features['Priority'] = features['Priority'].astype(str)
        features['Status'] = features['Status'].astype(str)

        # pd.to_datetime pins the dtype to datetime64 even when every value is NaT.
        for column in ('Deadline', 'Created At'):
            features[column] = pd.to_datetime(features[column].apply(parse_clock_string))

        # The estimate is a number of minutes and must stay numeric: the derived
        # features below divide and multiply by it.
        features['Estimated Duration (min)'] = pd.to_numeric(
            features['Estimated Duration (min)'], errors='coerce'
        )

        # Actual duration = clock time from creation to deadline, in minutes.
        elapsed = (minutes_since_midnight(features['Deadline'])
                   - minutes_since_midnight(features['Created At']))
        # A negative gap means the deadline falls after midnight: add one day.
        features['actual_duration'] = elapsed.mask(elapsed < 0, elapsed + 24 * 60)

        # Rows with unparseable times (NaN) or a zero gap are discarded.
        features = features[features['actual_duration'] > 0].copy()
        if features.empty:
            raise ValueError(
                "No task has a parseable 'Deadline'/'Created At' pair with a positive duration"
            )

        # Minutes from the current time of day to the deadline's time of day.
        # The parsed deadlines carry only a clock time (the parser puts them on
        # a placeholder date), so only the time-of-day parts are compared.
        # NOTE(review): this makes the features depend on when the code runs.
        now = pd.Timestamp.now()
        deadline_clock = features['Deadline'] - features['Deadline'].dt.normalize()
        features['time_until_deadline'] = (
            (deadline_clock - (now - now.normalize())).dt.total_seconds() / 60
        )

        print(" actual duration minutes ",features["actual_duration"].head())

        # User workload features: one averaged row per user.
        user_workload = workload_insights.groupby('User ID').agg({
            'Total Tasks': 'mean',
            'Completed Tasks': 'mean',
            'Avg. Task Duration': 'mean',
            'Productivity Score': 'mean'
        }).reset_index()
        features = features.merge(user_workload, on='User ID', how='left')

        # User settings features. Work on a copy: rewriting the caller's frame
        # would turn 'Yes' into 1 and make a second call count zero integrations.
        integration_columns = ['Trello Linked', 'Email Linked']
        settings = user_settings[['User ID'] + integration_columns].copy()
        settings['has_integrations'] = (settings[integration_columns] == 'Yes').sum(axis=1)
        features = features.merge(
            settings[['User ID', 'has_integrations']],
            on='User ID', how='left'
        )

        # Calendar conflict features: events whose span contains the deadline.
        # Event times are parsed once here rather than once per task row.
        events = calendar_events[['User ID', 'Start Time', 'End Time']].copy()
        events['Start Time'] = pd.to_datetime(events['Start Time'])
        events['End Time'] = pd.to_datetime(events['End Time'])

        def count_conflicts(row):
            own_events = events[events['User ID'] == row['User ID']]
            deadline = row['Deadline']
            overlapping = (own_events['Start Time'] <= deadline) & (own_events['End Time'] >= deadline)
            return int(overlapping.sum())

        features['calendar_conflicts'] = features.apply(count_conflicts, axis=1)

        # Reminder features.
        reminders_per_user = reminders.groupby('User ID').size().reset_index(name='reminder_count')
        features = features.merge(reminders_per_user, on='User ID', how='left')
        features['reminder_count'] = features['reminder_count'].fillna(0)

        # Priority encoding: fit on first use, afterwards map unseen labels to 'Medium'.
        if not hasattr(self.priority_encoder, 'classes_'):
            features['priority_encoded'] = self.priority_encoder.fit_transform(features['Priority'])
        else:
            features['Priority'] = features['Priority'].apply(
                lambda x: 'Medium' if x not in self.priority_encoder.classes_ else x
            )
            features['priority_encoded'] = self.priority_encoder.transform(features['Priority'])

        # The weekend flag is read below but was never computed. Honour a
        # caller-supplied column, otherwise derive it from the deadline's weekday.
        # NOTE(review): clock-only deadlines sit on a placeholder date, so the
        # derived flag is constant until deadlines carry real dates.
        if 'is_weekend' not in features.columns:
            features['is_weekend'] = (features['Deadline'].dt.dayofweek >= 5).astype(int)

        est_minutes = features['Estimated Duration (min)']
        until_deadline = features['time_until_deadline']
        productivity = features['Productivity Score'].fillna(0.5)

        # train_models lists these two among its priority features.
        features['est_duration_per_productivity'] = est_minutes / productivity
        features['duration_ratio'] = est_minutes / until_deadline

        # NOTE(review): time_until_deadline is in minutes while 24/72 read like
        # hour thresholds; kept as written - confirm the intended unit.
        features['deadline_urgency'] = np.where(
            until_deadline < 24, 3,
            np.where(until_deadline < 72, 2, 1)
        )

        features['workload_pressure'] = features['Total Tasks'] / features['Completed Tasks'].replace(0, 1)
        features['normalized_workload'] = features['Total Tasks'] / features['Completed Tasks'].max()
        features['completion_rate'] = features['Completed Tasks'] / features['Total Tasks'].replace(0, 1)
        features['urgency_score'] = features['deadline_urgency'] * features['workload_pressure']
        features['complexity_score'] = est_minutes * features['calendar_conflicts']
        features['time_pressure'] = est_minutes / until_deadline
        features['workload_complexity'] = features['Total Tasks'] * features['Avg. Task Duration']
        features['productivity_adjusted_duration'] = est_minutes / productivity
        features['tasks_per_hour'] = features['Total Tasks'] / 24
        features['available_time_ratio'] = until_deadline / est_minutes
        features['weekend_workload'] = features['is_weekend'] * features['Total Tasks']
        features['high_urgency'] = (features['deadline_urgency'] >= 2).astype(int)
        features['critical_deadline'] = (until_deadline < 24).astype(int)

        # Handle missing values and infinities (divisions above can yield inf).
        features = features.replace([np.inf, -np.inf], np.nan)
        numerical_columns = features.select_dtypes(include=['float64', 'int64']).columns

        # Targets and the identifier must keep their real values: the duration
        # target stays in minutes and the priority codes stay valid class labels.
        unscaled = {'actual_duration', 'priority_encoded', 'User ID'}

        # Imputation means and the scaler are fitted once and then reused.
        # Refitting on every call would standardise a single-row prediction
        # frame to all zeros.
        scaling_state = getattr(self, '_scaling_state', None)
        if scaling_state is None:
            fill_values = features[numerical_columns].mean()
            scaled_columns = [col for col in numerical_columns if col not in unscaled]
            features[numerical_columns] = features[numerical_columns].fillna(fill_values)
            scaler = StandardScaler()
            features[scaled_columns] = scaler.fit_transform(features[scaled_columns])
            self._scaling_state = {
                'fill_values': fill_values,
                'columns': scaled_columns,
                'scaler': scaler,
            }
        else:
            features[numerical_columns] = features[numerical_columns].fillna(scaling_state['fill_values'])
            scaled_columns = scaling_state['columns']
            features[scaled_columns] = scaling_state['scaler'].transform(features[scaled_columns])

        # Verify no non-numeric data remains
        non_numeric_cols = features.select_dtypes(exclude=['float64', 'int64','int32','datetime64[ns]']).columns
        if len(non_numeric_cols) > 0:
            print(f"Warning: Non-numeric columns found: {non_numeric_cols}")
            features = features.select_dtypes(include=['float64', 'int64','int32','datetime64[ns]'])

        # Print feature names for debugging
        print("\nProcessed features:")
        for col in features.columns:
            print(f"- {col},{features[col].dtype}")

        return features

    def train_models(self, data, calendar_events, reminders, event_schedule, user_settings, workload_insights):
        """Prepare features, then train and select the duration and priority models.

        The six arguments are passed unchanged to ``Preprocessing_Features``.
        The caller's ``data`` frame is not modified.

        Returns:
            Tuple ``(best_duration_model, best_duration_score, best_duration_name,
            best_priority_model, best_priority_score, best_priority_name)``. The
            scores are whatever ``_train_regression_models`` and
            ``_train_classification_models`` report for their best model.

        Raises:
            ValueError: If none of the requested features for a task exists in
                the prepared data.
        """
        prepared_data = self.Preprocessing_Features(data, calendar_events, reminders,
                                                    event_schedule, user_settings, workload_insights)

        def keep_available(requested, purpose):
            """Drop requested columns the preprocessing did not produce, with a warning."""
            available = [name for name in requested if name in prepared_data.columns]
            skipped = [name for name in requested if name not in prepared_data.columns]
            if skipped:
                print(f"Warning: {purpose} features missing from prepared data, skipped: {skipped}")
            if not available:
                raise ValueError(f"No {purpose} features are available in the prepared data")
            return available

        # Enhanced feature sets
        duration_features = keep_available([
            'time_until_deadline',
            'Estimated Duration (min)',
            'workload_pressure',
            'Productivity Score'
        ], 'duration')

        # Expanded priority features
        priority_features = keep_available([
            # Task-specific features
            'Estimated Duration (min)',

            # Time-based features
            'deadline_urgency',
            'time_until_deadline',

            # Workload features
            'Total Tasks',
            'Completed Tasks',
            'Avg. Task Duration',
            'workload_pressure',

            # User-specific features
            'Productivity Score',
            'calendar_conflicts',
            'reminder_count',
            'has_integrations',

            # Derived features
            'est_duration_per_productivity',
            'duration_ratio',

            # Interaction features
            'time_pressure',
            'workload_complexity',
            'productivity_adjusted_duration',

            # Time management features
            'tasks_per_hour',
            'available_time_ratio',

            # Weekend effect
            'weekend_workload',

            # Urgency indicators
            'high_urgency',
            'critical_deadline'
        ], 'priority')

        # Store feature lists
        self.duration_features = duration_features
        self.priority_features = priority_features

        # Prepare feature matrices
        X_duration = prepared_data[duration_features]
        y_duration = prepared_data['actual_duration']

        X_priority = prepared_data[priority_features]
        # Labels must come from the prepared frame: preprocessing filters and
        # re-indexes rows, so the raw `data` no longer lines up with X_priority.
        # They must also be the label-encoder codes, because
        # predict_task_priority decodes predictions with priority_encoder.
        # Ranking the distinct values yields 0..k-1 in encoder order even if the
        # column was standardised (an order-preserving transform); this assumes
        # every encoder class occurs in the prepared data.
        priority_codes, _ = pd.factorize(prepared_data['priority_encoded'], sort=True)
        y_priority = pd.Series(priority_codes, index=prepared_data.index, name='Priority')

        # Split data with stratification for priority
        X_duration_train, X_duration_test, y_duration_train, y_duration_test = train_test_split(
            X_duration, y_duration, test_size=0.2, random_state=42
        )

        X_priority_train, X_priority_test, y_priority_train, y_priority_test = train_test_split(
            X_priority, y_priority, test_size=0.2, random_state=42, stratify=y_priority
        )

        # Enhanced Duration Models with regularization
        duration_models = {
            'RandomForest': RandomForestRegressor(
                n_estimators=200,
                max_depth=10,
                min_samples_split=5,
                min_samples_leaf=4,
                random_state=42
            )
        }

        priority_models = {
            'RandomForest': RandomForestClassifier(
                n_estimators=500,
                max_depth=5,  # Shallower to reduce overfitting
                min_samples_split=10,
                class_weight='balanced',
                random_state=42
            ),
            'XGBoost': XGBClassifier(
                n_estimators=300,
                learning_rate=0.05,
                max_depth=6,
                random_state=42
            )
        }

        # Train and evaluate models with cross-validation
        print("=== Duration Prediction Models ===")
        best_duration_model, best_duration_score, best_duration_name = self._train_regression_models(
            duration_models, X_duration_train, y_duration_train, X_duration_test, y_duration_test
        )

        print("\n=== Priority Classification Models ===")
        best_priority_model, best_priority_score, best_priority_name = self._train_classification_models(
            priority_models, X_priority_train, y_priority_train, X_priority_test, y_priority_test
        )

        # Store best models
        self.duration_model = best_duration_model
        self.priority_model = best_priority_model
        self._is_fitted = True

        # Perform feature importance analysis
        self._analyze_feature_importance(best_duration_model, duration_features, 'Duration')
        self._analyze_feature_importance(best_priority_model, priority_features, 'Priority')

        # Learning curves analysis
        self._plot_learning_curves(
            best_duration_model, X_duration, y_duration,
            f"Learning Curves - {best_duration_name} (Duration Prediction)",
            'r2'
        )

        self._plot_learning_curves(
            best_priority_model, X_priority, y_priority,
            f"Learning Curves - {best_priority_name} (Priority Prediction)",
            'accuracy'
        )

        return (best_duration_model, best_duration_score, best_duration_name,
                best_priority_model, best_priority_score, best_priority_name)

    def _analyze_feature_importance(self, model, feature_names, model_type):
        """Analyze and visualize feature importance"""
        if hasattr(model, 'feature_importances_'):
            importances = pd.Series(model.feature_importances_, index=feature_names)
            
            plt.figure(figsize=(12, 6))
            importances.sort_values(ascending=True).plot(kind='barh')
            plt.title(f'Feature Importance for {model_type} Prediction')
            plt.xlabel('Importance Score')
            plt.tight_layout()
            plt.show()
            
            print(f"\nTop 5 Most Important Features for {model_type} Prediction:")
            print(importances.sort_values(ascending=False).head())


    def analyze_model_performance(self, X, y_true, model, model_name, feature_names):
        """Print regression metrics for ``model`` on ``X`` and plot its errors.

        Reports MSE, R2 and RMSE, then shows a two-panel figure: a histogram of
        the residuals (``y_true - prediction``) and an actual-vs-predicted
        scatter with the identity line. ``feature_names`` is not used.
        """
        predictions = model.predict(X)

        # Basic metrics
        squared_error = mean_squared_error(y_true, predictions)
        explained = r2_score(y_true, predictions)

        for line in (
            f"\nModel Performance Analysis - {model_name}",
            "-" * 50,
            f"MSE: {squared_error:.2f}",
            f"R2 Score: {explained:.2f}",
            f"RMSE: {np.sqrt(squared_error):.2f}",
        ):
            print(line)

        residuals = y_true - predictions
        _, (hist_ax, scatter_ax) = plt.subplots(1, 2, figsize=(15, 5))

        # Left panel: how the residuals are distributed.
        hist_ax.hist(residuals, bins=50)
        hist_ax.set_title('Error Distribution')
        hist_ax.set_xlabel('Prediction Error')
        hist_ax.set_ylabel('Count')

        # Right panel: predictions against truth, with the y = x reference line.
        lowest, highest = y_true.min(), y_true.max()
        scatter_ax.scatter(y_true, predictions, alpha=0.5)
        scatter_ax.plot([lowest, highest], [lowest, highest], 'r--')
        scatter_ax.set_title('Actual vs Predicted')
        scatter_ax.set_xlabel('Actual Values')
        scatter_ax.set_ylabel('Predicted Values')

        plt.tight_layout()
        plt.show()

    def predict_task_duration(self, task, calendar_events, reminders, event_schedule, user_settings, workload_insights):
        """Predict task duration with enhanced features"""
        if not self._is_fitted:
            raise ValueError("Model must be trained before making predictions")
            
        # Prepare single task features
        task_df = pd.DataFrame([task])
        prepared_features = self._prepare_features(task_df, calendar_events, reminders,
                                                 event_schedule, user_settings, workload_insights)
        
        X = prepared_features[self.duration_features]
        return float(self.duration_model.predict(X)[0])


    def predict_task_priority(self, task, calendar_events, reminders, event_schedule, user_settings, workload_insights):
        """Predict task priority with enhanced features"""
        if not self._is_fitted:
            raise ValueError("Model must be trained before making predictions")
            
        # Prepare single task features
        task_df = pd.DataFrame([task])
        prepared_features = self._prepare_features(
            task_df,
            calendar_events,
            reminders,
            event_schedule,
            user_settings,
            workload_insights
        )
        
        X = prepared_features[self.priority_features]
        priority_pred = self.priority_model.predict(X)[0]
        
        # Return numerical value instead of string
        return self.priority_mapping.get(
            self.priority_encoder.inverse_transform([priority_pred])[0], 2
        )  # Default to Medium (2) if mapping fails

    def _train_regression_models(self, models, X_train, y_train, X_test, y_test):
        """Train and evaluate regression models"""
        best_model = None
        best_score = float('-inf')
        best_model_name = ""
        
        print("\nDuration Model Performance:")
        print("-" * 50)
        
        for name, model in models.items():
            # Train model
            model.fit(X_train, y_train)
            
            # Evaluate
            y_pred = model.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            
            # Cross-validation
            cv_scores = cross_val_score(model, X_train, y_train, 
                                    cv=5, scoring='r2')
            cv_r2 = cv_scores.mean()
            
            print(f"\n{name}:")
            print(f"  MSE: {mse:.2f}")
            print(f"  R2: {r2:.2f}")
            print(f"  CV R2: {cv_r2:.2f} (±{cv_scores.std()*2:.2f})")
            
            # Track best model
            if r2 > best_score:
                best_score = r2
                best_model = model
                best_model_name = name
        
        print("\n" + "=" * 50)
        print(f"Best Duration Model: {best_model_name}")
        print(f"Best R2 Score: {best_score:.2f}")
        print("=" * 50)
        
        return best_model, best_score, best_model_name
    

    def _train_classification_models(self, models, X_train, y_train, X_test, y_test):
        """Grid-search several classifier families and return the best one.

        A built-in set of classifiers is tuned with 5-fold ``GridSearchCV``
        (refit on weighted F1). Entries of ``models`` replace built-in entries
        of the same name or are added to the set; names without a parameter
        grid are fitted with their given settings. SVM, logistic regression and
        k-NN are wrapped in a pipeline with a ``StandardScaler`` so the returned
        estimator scales its own input. A model that fails to train is reported
        and skipped; a plotting failure does not discard a trained model.

        Args:
            models: Optional mapping ``name -> estimator``.
            X_train, X_test: Feature matrices (DataFrames are converted to arrays).
            y_train, y_test: Class labels.

        Returns:
            ``(best_estimator, test_weighted_f1, name)`` for the model with the
            highest weighted F1 on the test split.

        Raises:
            ValueError: If no model could be trained.
        """
        # Local import: the file has no module-level import of Pipeline.
        from sklearn.pipeline import Pipeline

        candidates = {
            'RandomForest': RandomForestClassifier(random_state=42),
            'XGBoost': XGBClassifier(random_state=42),
        }
        # LGBMClassifier is referenced by this class but never imported in this
        # file; include it only when the name has been brought into scope
        # elsewhere (for example by another notebook cell).
        try:
            candidates['LightGBM'] = LGBMClassifier(random_state=42)
        except NameError:
            print("Skipping LightGBM: LGBMClassifier is not imported")
        candidates.update({
            'SVM': SVC(random_state=42),
            'LogisticRegression': LogisticRegression(random_state=42),
            'KNeighbors': KNeighborsClassifier(),
        })
        # Honour caller-supplied estimators instead of discarding the argument.
        if models:
            candidates.update(models)

        # Define parameter grids for each model
        param_grids = {
            'RandomForest': {
                'n_estimators': [300, 500],
                'max_depth': [5, 7, None],
                'min_samples_split': [5, 10],
                'min_samples_leaf': [2, 4],
                'class_weight': ['balanced', 'balanced_subsample']
            },
            'XGBoost': {
                'n_estimators': [300, 500],
                'max_depth': [5, 7],
                'learning_rate': [0.01, 0.1],
                'subsample': [0.8, 1.0],
                'colsample_bytree': [0.8, 1.0]
            },
            'LightGBM': {
                'n_estimators': [300, 500],
                'max_depth': [5, 7, -1],
                'learning_rate': [0.01, 0.1],
                'num_leaves': [31, 127],
                'subsample': [0.8, 1.0]
            },
            'SVM': {
                'C': [0.1, 1, 10],
                'kernel': ['rbf', 'linear'],
                'gamma': ['scale', 'auto']
            },
            'LogisticRegression': {
                'C': [0.1, 1, 10],
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear', 'saga']
            },
            'KNeighbors': {
                'n_neighbors': [3, 5, 7],
                'weights': ['uniform', 'distance'],
                'metric': ['euclidean', 'manhattan']
            }
        }

        # These algorithms are sensitive to feature scale.
        scale_sensitive = ('SVM', 'LogisticRegression', 'KNeighbors')
        step_prefix = 'model__'

        best_model = None
        best_score = float('-inf')
        best_model_name = ""
        results_dict = {}

        # Convert X_train and X_test to numpy arrays if they're DataFrames
        if isinstance(X_train, pd.DataFrame):
            X_train = X_train.values
        if isinstance(X_test, pd.DataFrame):
            X_test = X_test.values

        for name, model in candidates.items():
            print(f"\n{'-'*50}")
            print(f"Training {name} model...")

            grid = param_grids.get(name, {})
            if name in scale_sensitive:
                # Keep the scaler inside the estimator: callers predict on
                # unscaled data, and each CV fold is then scaled on its own.
                estimator = Pipeline([('scaler', StandardScaler()), ('model', model)])
                grid = {step_prefix + key: values for key, values in grid.items()}
            else:
                estimator = model

            try:
                grid_search = GridSearchCV(
                    estimator=estimator,
                    param_grid=grid,
                    scoring=['accuracy', 'f1_weighted', 'precision_weighted', 'recall_weighted'],
                    refit='f1_weighted',
                    cv=5,
                    n_jobs=-1,
                    verbose=1
                )
                grid_search.fit(X_train, y_train)

                best_estimator = grid_search.best_estimator_
                # Report parameters under their plain names, without the pipeline prefix.
                best_params = {
                    (key[len(step_prefix):] if key.startswith(step_prefix) else key): value
                    for key, value in grid_search.best_params_.items()
                }
                y_pred = best_estimator.predict(X_test)

                # Calculate metrics
                accuracy = accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred, average='weighted')
                precision = precision_score(y_test, y_pred, average='weighted')
                recall = recall_score(y_test, y_pred, average='weighted')
            except Exception as e:
                # Best effort: one failing family must not stop the comparison.
                print(f"Error training {name} model: {str(e)}")
                continue

            # Store results
            results_dict[name] = {
                'model': best_estimator,
                'best_params': best_params,
                'accuracy': accuracy,
                'f1_score': f1,
                'precision': precision,
                'recall': recall
            }

            # Print results
            print(f"\n{name} Results:")
            print("Best Parameters:", best_params)
            print(f"Best CV Score (F1): {grid_search.best_score_:.3f}")
            print(f"Test Set Metrics:")
            print(f"  Accuracy:  {accuracy:.3f}")
            print(f"  F1 Score:  {f1:.3f}")
            print(f"  Precision: {precision:.3f}")
            print(f"  Recall:    {recall:.3f}")

            # Print classification report
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))

            # Diagnostics are optional: a plotting error is reported but the
            # trained model still takes part in the selection below.
            try:
                plt.figure(figsize=(8, 6))
                cm = confusion_matrix(y_test, y_pred)
                sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
                plt.title(f'Confusion Matrix - {name}')
                plt.ylabel('True Label')
                plt.xlabel('Predicted Label')
                plt.show()

                if hasattr(best_estimator, "predict_proba"):
                    # RocCurveDisplay.from_estimator handles binary problems
                    # only; the class helper draws one curve per class.
                    # It is called through the class so it receives exactly
                    # (model, X, y).
                    type(self).plot_roc_curve(best_estimator, X_test, y_test)
                    plt.title(f'ROC Curve - {name}')
                    plt.show()
            except Exception as e:
                print(f"Could not plot diagnostics for {name}: {str(e)}")

            # Track best model
            if best_model is None or f1 > best_score:
                best_score = f1
                best_model = best_estimator
                best_model_name = name

        if best_model is None:
            raise ValueError("No models were successfully trained")

        # Print comparative analysis
        print("\nComparative Analysis of All Models:")
        comparison_df = pd.DataFrame({
            model_name: {
                'Accuracy': results['accuracy'],
                'F1 Score': results['f1_score'],
                'Precision': results['precision'],
                'Recall': results['recall']
            }
            for model_name, results in results_dict.items()
        }).T

        print("\nModel Performance Comparison:")
        print(comparison_df)

        # DataFrame.plot opens its own figure, so the size is passed to it
        # rather than to a separate plt.figure() that would stay empty.
        comparison_df.plot(kind='bar', width=0.8, figsize=(12, 6))
        plt.title('Model Performance Comparison')
        plt.xlabel('Models')
        plt.ylabel('Score')
        plt.xticks(rotation=45)
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.show()

        print("\n" + "=" * 50)
        print(f"Best Model: {best_model_name}")
        print(f"Best F1 Score: {best_score:.3f}")
        print("=" * 50)

        return best_model, best_score, best_model_name

    @staticmethod
    def plot_roc_curve(model, X_test, y_test):
        """Draw one-vs-rest ROC curves, one per class, on a new figure.

        Declared static because it takes no instance: without the decorator a
        call on an instance would pass the instance as ``model``. The figure is
        left open; the caller adds a title if wanted and calls ``plt.show()``.

        Args:
            model: Fitted classifier exposing ``predict_proba``.
            X_test: Features to score.
            y_test: True labels for ``X_test``.
        """
        y_pred_proba = model.predict_proba(X_test)
        y_true = np.asarray(y_test)

        # scikit-learn orders predict_proba columns like model.classes_; fall
        # back to the sorted labels seen in y_test when the attribute is absent.
        class_labels = getattr(model, 'classes_', None)
        if class_labels is None:
            class_labels = np.unique(y_true)

        plt.figure(figsize=(8, 6))
        colors = cycle(['blue', 'red', 'green'])
        for (column, label), color in zip(enumerate(class_labels), colors):
            # One-vs-rest: the current class is positive, every other negative.
            fpr, tpr, _ = roc_curve(y_true == label, y_pred_proba[:, column])
            plt.plot(fpr, tpr, color=color, lw=2,
                     label=f'ROC curve of class {label} (area = {auc(fpr, tpr):0.2f})')

        # Chance-level reference line.
        plt.plot([0, 1], [0, 1], 'k--', lw=2)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc="lower right")

    # def _train_classification_models(self, models, X_train, y_train, X_test, y_test):
    #     """Train and evaluate classification models"""
    #     best_model = None
    #     best_score = 0
    #     best_model_name = ""
        
    #     for name, model in models.items():
    #         # Train model
    #         model.fit(X_train, y_train)
            
    #         # Evaluate
    #         y_pred = model.predict(X_test)
    #         accuracy = accuracy_score(y_test, y_pred)
            
    #         # Cross-validation
    #         cv_scores = cross_val_score(model, X_train, y_train, 
    #                                 cv=5, scoring='accuracy')
    #         cv_accuracy = cv_scores.mean()
    #         # Add F1 score evaluation
    #         f1 = f1_score(y_test, y_pred, average='weighted')
    #         print(f"  F1 Score: {f1:.2f}")
            
    #         # Add confusion matrix
    #         print("  Confusion Matrix:")
    #         print(confusion_matrix(y_test, y_pred))
            
    #         print(f"\n{name}:")
    #         print(f"  Accuracy: {accuracy:.2f}")
    #         print(f"  CV Accuracy: {cv_accuracy:.2f} (±{cv_scores.std()*2:.2f})")
    #         print("  Classification Report:")
    #         print(classification_report(y_test, y_pred))
            
    #         # Print class distribution
    #         unique, counts = np.unique(y_train, return_counts=True)
    #         print("  Class Distribution in Training Data:")
    #         for u, c in zip(unique, counts):
    #             priority_label = self.priority_encoder.classes_[u]
    #             print(f"    Class {u} (Priority {priority_label}): {c} samples")
            
    #         # Track best model
    #         if accuracy > best_score:
    #             best_score = accuracy
    #             best_model = model
    #             best_model_name = name
        
    #     print("\n" + "=" * 50)
    #     print(f"Best Priority Model: {best_model_name}")
    #     print(f"Best Accuracy Score: {best_score:.2f}")
    #     print("=" * 50)
        
    #     return best_model, best_score, best_model_name
    
    def _plot_learning_curves(self, model, X, y, title, scoring):
        """Show training and cross-validation scores against training-set size.

        Runs ``learning_curve`` with 5-fold CV at ten sizes from 10% to 100% of
        the data, then plots the mean score of each curve with a band of one
        standard deviation on either side.
        """
        sizes, fit_scores, cv_scores = learning_curve(
            model, X, y,
            train_sizes=np.linspace(0.1, 1.0, 10),
            cv=5,
            scoring=scoring,
            n_jobs=-1
        )

        # (legend label, mean per size, std per size) for each curve
        curves = [
            (legend, np.mean(fold_scores, axis=1), np.std(fold_scores, axis=1))
            for legend, fold_scores in (
                ('Training score', fit_scores),
                ('Cross-validation score', cv_scores),
            )
        ]

        plt.figure(figsize=(10, 6))
        # Lines first, bands second.
        for legend, centre, _ in curves:
            plt.plot(sizes, centre, label=legend)
        for _, centre, spread in curves:
            plt.fill_between(sizes, centre - spread, centre + spread, alpha=0.1)

        plt.title(title)
        plt.xlabel('Training Examples')
        plt.ylabel('Score')
        plt.legend(loc='best')
        plt.grid(True)
        plt.show()

In [None]:

# Read the six CSV exports in one pass (replace the paths with your own files).
(calendar_events,
 reminders,
 event_schedule,
 user_settings,
 workload_insights,
 user_tasks) = map(pd.read_csv, (
    "calendar_events.csv",
    "reminders_followups.csv",
    "event_schedule.csv",
    "user_settings.csv",
    "workload_insights.csv",
    "user_tasks.csv",
))

# Training table: only the task columns the model consumes, as an independent copy.
training_columns = ['User ID', 'Priority', 'Estimated Duration (min)', 'Deadline', 'Created At', 'Status']
ml_training_data = user_tasks.loc[:, training_columns].copy()

# def convert_minutes_to_datetime(minutes_str):
#     try:
#         # Split the string and convert to float
#         total_minutes, seconds = map(float, minutes_str.split(':'))
        
#         # Convert total minutes to hours and minutes
#         hours = int(total_minutes // 60)
#         minutes = int(total_minutes % 60)
        
#         # Combine with seconds
#         time_str = f"{hours:02}:{minutes:02}:{int(seconds):02}"
        
#         # Convert to datetime
#         return pd.to_datetime(time_str, format='%H:%M:%S')
#     except:
#         # If conversion fails, return NaT
#         return pd.NaT



# # Convert 'Deadline' and 'Created At' columns to datetime
# ml_training_data['Deadline'] = ml_training_data['Deadline'].apply(convert_minutes_to_datetime)
# ml_training_data['Created At'] = ml_training_data['Created At'].apply(convert_minutes_to_datetime)

# ml_training_data['Deadline'] = ml_training_data['Deadline'].dt.time
# ml_training_data['Created At'] = ml_training_data['Created At'].dt.time


# # Calculate 'actual_duration' in minutes
# ml_training_data['actual_duration'] = (ml_training_data['Deadline'] - ml_training_data['Created At']).dt.total_seconds() / 60

# # Clean data
# ml_training_data = ml_training_data.dropna(subset=['actual_duration', 'Priority', 'Estimated Duration (min)'])
# ml_training_data = ml_training_data[ml_training_data['actual_duration'] > 0]



# Initialize and train the model
ml_model = EnhancedMLModel()

# train_models returns (model, score, name) for the duration regressor,
# followed by the same triple for the priority classifier.
(best_duration_model,
best_duration_score,
best_duration_name,
best_priority_model,
best_priority_score,
best_priority_name) = ml_model.train_models(
ml_training_data,
calendar_events,
reminders,
event_schedule,
user_settings,
workload_insights
)

# Print final results
print("\n" + "="*50)
print("Final Model Selection Results")
print("="*50)
print(f"\nBest Duration Prediction Model: {best_duration_name}")
print(f"R2 Score: {best_duration_score:.4f}")
print(f"\nBest Priority Classification Model: {best_priority_name}")
# The classifier is selected and scored by weighted F1 on the test split, so
# the value is labelled as such rather than as accuracy.
print(f"Weighted F1 Score: {best_priority_score:.4f}")
print("\n" + "="*50)

# train_models already stores these on the instance; the explicit assignment
# is kept so this cell also works with models obtained some other way.
ml_model.duration_model = best_duration_model
ml_model.priority_model = best_priority_model
ml_model._is_fitted = True

# # Example of making predictions with the trained models
# sample_task = {
#     'User ID': user_settings['User ID'].iloc[0],
#     'Priority': 'High',
#     'Estimated Duration (min)': 120,
#     'Deadline': pd.Timestamp.now() + pd.Timedelta(hours=2),
#     'Created At': pd.Timestamp.now(),
#     'Status': 'Pending'
# }

# try:
#     predicted_duration = ml_model.predict_task_duration(
#         sample_task,
#         calendar_events,
#         reminders,
#         event_schedule,
#         user_settings,
#         workload_insights
#     )
#     print(f"\nSample Task Duration Prediction: {predicted_duration:.2f} minutes")

#     predicted_priority = ml_model.predict_task_priority(
#         sample_task,
#         calendar_events,
#         reminders,
#         event_schedule,
#         user_settings,
#         workload_insights
#     )
#     print(f"Sample Task Priority Prediction: {ml_model.priority_reverse_mapping[predicted_priority]}")
# except Exception as e:
#     print(f"\nPrediction error: {str(e)}")