In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os

: 

In [None]:






# User loader for Flask-Login
@login_manager.user_loader
def load_user(user_id):
    """Return the ``User`` looked up by ``int(user_id)``.

    Registered as the Flask-Login user loader. ``user_id`` is converted with
    ``int()`` before the lookup, so a non-numeric value raises ``ValueError``.
    """
    numeric_id = int(user_id)
    return User.query.get(numeric_id)

# Initialize database and models on first run.
# Everything below runs at import time inside an application context, so the
# order matters: the database helper is called first, then the model step.
with app.app_context():
    # NOTE(review): presumably creates/migrates tables — confirm in init_database.
    init_database(app)
    
    # Load existing models or train new ones.
    # A falsy return from load_models() is treated as "nothing to load".
    if not model.load_models():
        print("No existing models found. Training on sample data...")
        DATA_FILE_PATH = 'data/credit_data.csv'
        # Only generate the sample CSV when it is not already on disk.
        if not os.path.exists(DATA_FILE_PATH):
            generate_and_save_data(DATA_FILE_PATH)
        df = pd.read_csv(DATA_FILE_PATH)
        # The training run is tagged with the source label 'initial_sample'.
        model.train(df, source='initial_sample')

# --- CONFIGURATION ---
# NOTE(review): presumably the identifiers of the supporting documents an
# application is expected to include; not referenced in this part of the
# file — confirm against the code that consumes it.
REQUIRED_DOCS = ['doc_id', 'doc_salary', 'doc_bank_statement', 'doc_tax_return', 'doc_property_docs']

# --- LANGUAGE & CONTEXT ---
@app.before_request
def before_request():
    """Give the session a default language of ``'en'`` when none is set."""
    if 'language' in session:
        # A language was already chosen; leave the session untouched.
        return
    session['language'] = 'en'

@app.context_processor
def inject_get_text():
    """Expose a ``get_text(key)`` helper to templates.

    The helper passes the session's language (``'en'`` when unset) and the
    key to ``_get_text``.
    """
    def get_text(key):
        active_language = session.get('language', 'en')
        return _get_text(active_language, key)

    return {'get_text': get_text}

@app.route('/change_language/<lang>')
def change_language(lang):
    """Store ``lang`` in the session if it is supported, then redirect back.

    Only ``'en'`` and ``'es'`` are accepted; any other value leaves the
    session unchanged. The redirect goes to the referring page, or to the
    dashboard when there is no referrer.
    """
    supported_languages = ('en', 'es')
    if lang in supported_languages:
        session['language'] = lang

    destination = request.referrer or url_for('dashboard')
    return redirect(destination)

# --- MAIN ROUTES ---
@app.route('/')
def index():
    """Send authenticated users to the dashboard and everyone else to login."""
    endpoint = 'dashboard' if current_user.is_authenticated else 'auth.login'
    return redirect(url_for(endpoint))

@app.route('/dashboard')
@login_required
def dashboard():
    """Render the main dashboard with KPIs, charts, model info and recent apps.

    Data source, in order of preference:
      1. Applications from the database (all of them for admins, otherwise
         only the current user's).
      2. ``data/credit_data.csv`` when there are no applications.
      3. Three built-in sample rows when the CSV cannot be read.

    Chart or model-info failures never abort the request: charts fall back
    to ``None`` and ``model_info`` stays ``None``.

    Returns:
        The rendered ``dashboard.html`` template.
    """
    # Get applications for current user (or all if admin)
    if current_user.role.value == 'admin':
        applications = Application.query.all()
    else:
        applications = current_user.applications.all()

    # Convert to DataFrame for analysis
    if applications:
        # The loop variable must NOT be named ``app``: binding ``app`` anywhere
        # in this function makes it a local and breaks ``app.logger`` below.
        df_active = pd.DataFrame([
            {
                'Application_ID': application.application_id,
                'Application_Date': application.application_date or datetime.utcnow(),
                'Age': application.age or 35,
                'Gender': application.gender or 'Male',
                'Monthly_Income': application.monthly_income or 35000,
                'Credit_Score': application.credit_score or 650,
                'DTI_Ratio': application.dti_ratio or 0.35,
                'Employment_Status': application.employment_status or 'Employed',
                'Processing_Time_Days': application.processing_time_days or 15,
                'Status': application.status or 'In-Process',
            }
            for application in applications
        ])
    else:
        # Use sample data if no real data exists
        try:
            df_active = pd.read_csv('data/credit_data.csv')
            if 'Application_Date' in df_active.columns:
                df_active['Application_Date'] = pd.to_datetime(df_active['Application_Date'])
        except Exception as e:
            # Best-effort fallback: keep the page working, but say why the
            # CSV was skipped instead of swallowing the error silently.
            app.logger.warning(f"Could not load sample data, using built-in rows: {e}")
            # Create minimal sample data
            df_active = pd.DataFrame({
                'Application_ID': ['SAMPLE-001', 'SAMPLE-002', 'SAMPLE-003'],
                'Application_Date': [datetime.utcnow() - timedelta(days=i*10) for i in range(3)],
                'Age': [35, 42, 28],
                'Gender': ['Male', 'Female', 'Male'],
                'Monthly_Income': [45000, 62000, 38000],
                'Credit_Score': [720, 680, 750],
                'DTI_Ratio': [0.32, 0.28, 0.45],
                'Employment_Status': ['Employed', 'Employed', 'Self-Employed'],
                'Processing_Time_Days': [15, 22, 18],
                'Status': ['Approved', 'In-Process', 'Approved']
            })

    # Calculate KPIs; every ratio is guarded against an empty frame.
    total_apps = len(df_active)
    approved_count = len(df_active[df_active['Status'] == 'Approved'])
    declined_count = len(df_active[df_active['Status'] == 'Declined'])

    kpis = {
        'total_apps': f"{total_apps:,}",
        'approval_rate': f"{(approved_count / total_apps * 100) if total_apps > 0 else 0:.1f}%",
        'rejection_rate': f"{(declined_count / total_apps * 100) if total_apps > 0 else 0:.1f}%",
        'avg_processing_time': f"{df_active['Processing_Time_Days'].mean() if total_apps > 0 else 0:.1f} days"
    }

    # Generate graphs. ``empty_graphs`` is the shape the template receives
    # when there is no data or when any chart fails to build.
    empty_graphs = dict.fromkeys(
        ('trends', 'funnel', 'heatmap', 'box_plot', 'sunburst')
    )
    graphs = {}
    try:
        if total_apps > 0:
            graphs['trends'] = pio.to_json(create_trends_chart(df_active))
            graphs['funnel'] = pio.to_json(create_funnel_chart(df_active))

            if total_apps > 5:  # Need minimum data for correlation
                graphs['heatmap'] = pio.to_json(create_correlation_heatmap(df_active))
            else:
                graphs['heatmap'] = None

            graphs['box_plot'] = pio.to_json(create_box_plot(df_active))
            graphs['sunburst'] = pio.to_json(create_sunburst_chart(df_active))
        else:
            graphs = dict(empty_graphs)
    except Exception as e:
        print(f"ERROR generating charts: {str(e)}")
        # Discard any partially built charts so the template sees a
        # consistent "no charts" state.
        graphs = dict(empty_graphs)

    # Get model information
    model_info = None
    try:
        info = model.get_model_info()
        if info.get('last_metrics'):
            history = info.get('training_history')
            # Most recent training session, or an empty dict when none exist.
            latest = history[-1] if history else {}
            model_info = {
                'last_trained': latest.get('timestamp', 'Never'),
                'accuracy': round(info['last_metrics'].get('approval', {}).get('accuracy', 0) * 100, 1),
                'records_used': latest.get('records', 0)
            }
    except Exception as e:
        app.logger.warning(f"Could not load model info: {e}")

    # Get recent applications for current user
    recent_apps = current_user.applications.order_by(Application.application_date.desc()).limit(5).all()

    return render_template('dashboard.html',
                         kpis=kpis,
                         graphs=graphs,
                         recent_apps=recent_apps,
                         user=current_user,
                         model_info=model_info)

@app.route('/my_clients')
@login_required

# Training Data Management

## Overview
The system now automatically saves all training data used for model training sessions. This ensures data provenance, model reproducibility, and compliance with audit requirements.

## Data Storage Strategy

### Automatic Data Saving
- Every training session saves the complete dataset to the `data/` directory
- Files are named using the format `training_data_{source}_{timestamp}.csv`
- Training history includes references to the specific data files used

### File Naming Convention
- `training_data_database_20250911_143052.csv` - Database training session
- `training_data_csv_20250911_143052.csv` - CSV upload training session  
- `training_data_sample_20250911_143052.csv` - Generated data training session

### Benefits
1. **Reproducibility**: Exact training data can be retrieved for any model version
2. **Audit Trail**: Complete record of what data was used for training
3. **Data Quality**: Preserved datasets for quality analysis and debugging
4. **Compliance**: Helps meet regulatory requirements for model documentation

In [None]:
# Example: Accessing Training Data History
import sys
sys.path.append('../')

from model_pipeline import model
import pandas as pd
import os
from datetime import datetime

# Fetch the recorded training sessions from the model pipeline
history = model.get_model_info()['training_history']

print("Training History:")
print("=" * 50)
for number, entry in enumerate(history, start=1):
    print(f"Session {number}:")
    print(f"  Timestamp: {entry['timestamp']}")
    print(f"  Source: {entry['source']}")
    print(f"  Records: {entry['records']}")
    print(f"  Data File: {entry.get('training_data_file', 'Not saved')}")
    # Accuracy figures are only printed for sessions that stored metrics
    if 'metrics' in entry:
        metrics = entry['metrics']
        print(f"  Approval Accuracy: {metrics['approval']['accuracy']:.3f}")
        print(f"  Withdrawal Accuracy: {metrics['withdrawal']['accuracy']:.3f}")
    print()

# Show every saved training data file together with its size on disk
data_dir = '../data'
if not os.path.exists(data_dir):
    print("Data directory not found")
else:
    training_files = [
        name for name in os.listdir(data_dir)
        if name.startswith('training_data_')
    ]
    print(f"Available Training Data Files ({len(training_files)}):")
    print("=" * 50)
    for name in sorted(training_files):
        size_bytes = os.path.getsize(os.path.join(data_dir, name))
        size_mb = size_bytes / (1024*1024)
        print(f"  {name} ({size_mb:.2f} MB)")

In [None]:
# Example: Loading and Analyzing Training Data
def analyze_training_data(filename):
    """Print a summary report for one training data CSV and return its rows.

    ``filename`` is joined onto ``'../data'``. The report covers the frame's
    shape, the min/max of ``Application_Date``, the ``Status`` distribution
    and the means of ``Credit_Score``, ``Monthly_Income``, ``DTI_Ratio`` and
    ``Loan_Amount``.

    Returns:
        The loaded DataFrame, or ``None`` (after printing a message) when the
        file does not exist.
    """
    csv_path = os.path.join('../data', filename)
    if os.path.exists(csv_path):
        frame = pd.read_csv(csv_path)
    else:
        print(f"File not found: {filename}")
        return None

    # Header and overall shape
    row_count, column_count = frame.shape
    print(f"Training Data Analysis: {filename}")
    print("=" * 60)
    print(f"Shape: {row_count} rows, {column_count} columns")
    print(f"Date Range: {frame['Application_Date'].min()} to {frame['Application_Date'].max()}")
    print()

    # Share of each status value, as a count and a percentage of all rows
    print("Status Distribution:")
    total_rows = len(frame)
    for label, tally in frame['Status'].value_counts().items():
        share = (tally / total_rows) * 100
        print(f"  {label}: {tally} ({share:.1f}%)")
    print()

    # Column averages; each one is computed right before it is printed
    print("Key Statistics:")
    avg_credit_score = frame['Credit_Score'].mean()
    print(f"  Average Credit Score: {avg_credit_score:.0f}")
    avg_income = frame['Monthly_Income'].mean()
    print(f"  Average Monthly Income: Q{avg_income:,.0f}")
    avg_dti = frame['DTI_Ratio'].mean()
    print(f"  Average DTI Ratio: {avg_dti:.3f}")
    avg_loan = frame['Loan_Amount'].mean()
    print(f"  Average Loan Amount: Q{avg_loan:,.0f}")
    print()

    return frame

# Example usage - replace with actual filename
# df = analyze_training_data('training_data_sample_20250911_143052.csv')