# HABITUS FAITH - ML Training Pipeline (Optimized)

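An improved version of the training pipeline with clearer separation between the synthetic user archetypes and a dropout-regularized dense network as the model architecture.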

## Expected outputs
- predictor.tflite
- scaler_params.json
- feature_config.json

## Estimated time: 5-10 minutes

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import json
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def generate_synthetic_training_data(n_records=5000, seed=42):
    """Generate synthetic habit records drawn from five user archetypes.

    For every record an archetype is picked according to its ``weight`` and
    each feature is then sampled from that archetype's ranges. The
    ``abandoned`` label is a Bernoulli draw whose probability is the
    archetype's ``abandonment_rate``; no individual feature value feeds
    into the label directly.

    Args:
        n_records: Number of records to generate. Zero or a negative value
            yields an empty frame that still carries every column.
        seed: Value passed to ``np.random.seed`` before sampling so repeated
            calls return identical data. Note that this reseeds NumPy's
            global generator. Pass ``None`` to leave the global state as is.

    Returns:
        A ``pandas.DataFrame`` with the columns ``hourOfDay``, ``dayOfWeek``,
        ``streakAtTime``, ``failuresLast7Days``, ``hoursFromReminder`` and
        ``abandoned`` (0 or 1).
    """
    columns = [
        'hourOfDay',
        'dayOfWeek',
        'streakAtTime',
        'failuresLast7Days',
        'hoursFromReminder',
        'abandoned',
    ]

    if seed is not None:
        np.random.seed(seed)

    archetypes = {
        'excellent_morning': {
            'weight': 0.20,
            'hour_range': (6, 9),
            'hour_std': 0.5,
            'preferred_days': [1, 2, 3, 4, 5],
            'streak_range': (30, 90),
            'failure_range': (0, 1),
            'reminder_adherence': 0.95,
            'abandonment_rate': 0.02
        },
        'consistent_afternoon': {
            'weight': 0.20,
            'hour_range': (13, 16),
            'hour_std': 1.0,
            'preferred_days': [1, 2, 3, 4, 5, 6, 7],
            'streak_range': (15, 45),
            'failure_range': (1, 2),
            'reminder_adherence': 0.75,
            'abandonment_rate': 0.15
        },
        'moderate_evening': {
            'weight': 0.20,
            'hour_range': (18, 21),
            'hour_std': 1.5,
            'preferred_days': [1, 2, 3, 4, 5, 6, 7],
            'streak_range': (8, 25),
            'failure_range': (2, 4),
            'reminder_adherence': 0.55,
            'abandonment_rate': 0.40
        },
        'struggling_night': {
            'weight': 0.20,
            'hour_range': (20, 23),
            'hour_std': 1.5,
            'preferred_days': [5, 6, 7],
            'streak_range': (1, 10),
            'failure_range': (4, 6),
            'reminder_adherence': 0.30,
            'abandonment_rate': 0.75
        },
        'inconsistent_weekend': {
            'weight': 0.20,
            'hour_range': (8, 14),
            'hour_std': 3.0,
            'preferred_days': [6, 7],
            'streak_range': (2, 12),
            'failure_range': (3, 5),
            'reminder_adherence': 0.35,
            'abandonment_rate': 0.65
        }
    }

    # Loop-invariant: build the choice population and its probabilities once.
    archetype_names = list(archetypes)
    archetype_weights = [archetypes[name]['weight'] for name in archetype_names]

    records = []
    # The order of the random draws below is kept fixed so that a given seed
    # always reproduces the same dataset.
    for _ in range(n_records):
        archetype_name = np.random.choice(archetype_names, p=archetype_weights)
        archetype = archetypes[archetype_name]

        # Hour is normally distributed around the middle of the archetype's
        # window and clipped to a valid hour-of-day value.
        hour_mean = np.mean(archetype['hour_range'])
        hour = np.clip(np.random.normal(hour_mean, archetype['hour_std']), 0, 23)

        day = np.random.choice(archetype['preferred_days'])
        # randint's upper bound is exclusive, hence the +1 to include it.
        streak = np.random.randint(archetype['streak_range'][0], archetype['streak_range'][1] + 1)
        failures = np.random.randint(archetype['failure_range'][0], archetype['failure_range'][1] + 1)

        # Adherent records land within 2 hours of the reminder, others 2-12 hours.
        if np.random.random() < archetype['reminder_adherence']:
            hours_from_reminder = np.random.uniform(0, 2)
        else:
            hours_from_reminder = np.random.uniform(2, 12)

        abandoned = 1 if np.random.random() < archetype['abandonment_rate'] else 0

        records.append({
            'hourOfDay': hour,
            'dayOfWeek': day,
            'streakAtTime': streak,
            'failuresLast7Days': failures,
            'hoursFromReminder': hours_from_reminder,
            'abandoned': abandoned
        })

    # Passing the column list keeps the schema stable even when no records
    # were generated.
    return pd.DataFrame(records, columns=columns)

In [None]:
# Build the synthetic dataset and report its size and class balance.
print('Generating 5000 synthetic training records...')
df = generate_synthetic_training_data(5000)
total_records = len(df)
print(f'Generated {total_records} records')

# `abandoned` is a 0/1 label: count each class once and reuse the counts.
completed_count = (~df['abandoned'].astype(bool)).sum()
abandoned_count = df['abandoned'].sum()

print(f'\nClass distribution:')
print(f"   - Completed (0): {completed_count} ({completed_count/total_records*100:.1f}%)")
print(f"   - Abandoned (1): {abandoned_count} ({abandoned_count/total_records*100:.1f}%)")

print(f'\nFirst 5 records:')
print(df.head())

In [None]:
# Exploratory summary: descriptive statistics, then one histogram per feature
# split by outcome, plus a correlation heatmap in the last panel.
print('\nFeature Statistics:')
print(df.describe())

fig, axes = plt.subplots(2, 3, figsize=(15, 10))

completed_rows = df[df['abandoned'] == 0]
abandoned_rows = df[df['abandoned'] == 1]

# (axis, column, number of bins, panel title, x-axis label)
histogram_specs = [
    (axes[0, 0], 'hourOfDay', 24, 'Completion Hour Distribution', 'Hour of Day'),
    (axes[0, 1], 'dayOfWeek', 7, 'Day of Week Distribution', 'Day (1=Mon, 7=Sun)'),
    (axes[0, 2], 'streakAtTime', 20, 'Streak Distribution', 'Streak Length'),
    (axes[1, 0], 'failuresLast7Days', 8, 'Failures Last 7 Days', 'Number of Failures'),
    (axes[1, 1], 'hoursFromReminder', 12, 'Hours From Reminder', 'Hours'),
]

for panel, column, bin_count, title, x_label in histogram_specs:
    panel.hist(
        [completed_rows[column], abandoned_rows[column]],
        bins=bin_count,
        label=['Completed', 'Abandoned'],
        alpha=0.7,
    )
    panel.set_title(title)
    panel.set_xlabel(x_label)
    panel.legend()

correlation_panel = axes[1, 2]
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm', ax=correlation_panel)
correlation_panel.set_title('Feature Correlation')

plt.tight_layout()
plt.show()

In [None]:
# Train the abandonment-risk classifier: split, scale, fit a small dense
# network with dropout, evaluate on the held-out set and plot the curves.
print('='*60)
print('TRAINING IMPROVED MODEL')
print('='*60)

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks
# and batch shuffling are repeatable between runs, matching the seeded data
# generation and the seeded train/test split.
keras.utils.set_random_seed(42)

feature_cols = ['hourOfDay', 'dayOfWeek', 'streakAtTime', 'failuresLast7Days', 'hoursFromReminder']
X = df[feature_cols].values
y = df['abandoned'].values

# Stratify so both splits keep the same completed/abandoned ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f'\nData split:')
print(f'   Training: {len(X_train)} samples')
print(f'   Testing: {len(X_test)} samples')

# Fit the scaler on the training split only; the test split reuses its
# statistics so no information leaks from the held-out data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'\nBuilding Neural Network with Dropout...')
keras_model = Sequential([
    # Input width follows the feature list instead of a separate constant.
    Dense(32, activation='relu', input_shape=(len(feature_cols),)),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dropout(0.2),
    # Single sigmoid unit: the output is the predicted abandonment probability.
    Dense(1, activation='sigmoid')
])

keras_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
)

keras_model.summary()

# Stop once validation loss has not improved for 15 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
    verbose=1
)

print(f'\nTraining...')
# validation_split holds out the trailing 20% of the training arrays; the
# rows were already shuffled by train_test_split above.
history = keras_model.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)

# evaluate() returns the loss followed by the compiled metrics in order.
test_loss, test_accuracy, test_auc = keras_model.evaluate(X_test_scaled, y_test, verbose=0)
print(f'\n{"="*60}')
print(f'Test Accuracy: {test_accuracy:.2%}')
print(f'Test AUC: {test_auc:.4f}')
print(f'Test Loss: {test_loss:.4f}')
print(f'{"="*60}')

# Training curves: loss on the left, accuracy on the right.
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Hand-written feature rows, one per archetype, paired with a readable label.
# Feature order: hourOfDay, dayOfWeek, streakAtTime, failuresLast7Days,
# hoursFromReminder.
test_scenarios = [
    ([7.0, 1, 45, 0, 1], "Excellent morning (low risk)"),
    ([14.0, 3, 20, 1, 2], "Consistent afternoon (low risk)"),
    ([19.0, 5, 10, 3, 4], "Moderate evening (medium risk)"),
    ([21.0, 6, 5, 5, 8], "Struggling night (high risk)"),
    ([11.0, 7, 3, 4, 10], "Inconsistent weekend (high risk)")
]

banner_line = "=" * 60
print('\n' + banner_line)
print('TEST PREDICTIONS (Keras Model):')
print(banner_line)

# Standardise every scenario in one call, then score them one row at a time.
scenario_rows = [row for row, _ in test_scenarios]
scenario_labels = [label for _, label in test_scenarios]
scaled_scenarios = scaler.transform(scenario_rows)

for position, label in enumerate(scenario_labels):
    single_row = scaled_scenarios[position:position + 1]
    risk = keras_model.predict(single_row, verbose=0)[0][0]
    print(f'{label:40} → {risk:.4f}')

In [None]:
# Convert the trained Keras model to TFLite, persist the preprocessing
# metadata the mobile client needs, then re-run the scenarios through the
# TFLite interpreter as a sanity check.
banner = '=' * 60

print('\n' + banner)
print('EXPORTING TO TFLITE')
print(banner)

converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()

with open('predictor.tflite', 'wb') as f:
    f.write(tflite_model)

tflite_size_kb = len(tflite_model) / 1024
print(f'✅ TFLite model saved: {tflite_size_kb:.2f} KB')

# The client must apply the same standardisation, so export the fitted
# mean/scale alongside the feature order.
scaler_params = {
    'mean': scaler.mean_.tolist(),
    'scale': scaler.scale_.tolist()
}
feature_config = {
    'features': feature_cols
}

json_artifacts = [
    ('scaler_params.json', scaler_params, f'✅ Scaler params saved'),
    ('feature_config.json', feature_config, f'✅ Feature config saved'),
]
for artifact_path, payload, saved_message in json_artifacts:
    with open(artifact_path, 'w') as f:
        json.dump(payload, f, indent=2)
    print(saved_message)

# Load the converted model from memory and inspect its tensors.
interpreter = tf.lite.Interpreter(model_content=tflite_model)
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
input_index = input_details[0]['index']
output_index = output_details[0]['index']

print(f'\nTFLite Input shape: {input_details[0]["shape"]}')
print(f'TFLite Output shape: {output_details[0]["shape"]}')

print('\n' + banner)
print('TFLITE VERIFICATION:')
print(banner)
for scenario_features, scenario_label in test_scenarios:
    # The input is cast to float32 before being handed to the interpreter.
    scaled_input = scaler.transform([scenario_features]).astype(np.float32)
    interpreter.set_tensor(input_index, scaled_input)
    interpreter.invoke()
    output = interpreter.get_tensor(output_index)[0][0]
    print(f'{scenario_label:40} → {output:.4f}')

print('\n' + banner)
print('EXPORT COMPLETE!')
print(banner)
print(f'\n📦 Output files:')
print(f'   1. predictor.tflite ({tflite_size_kb:.2f} KB)')
print(f'   2. scaler_params.json')
print(f'   3. feature_config.json')

In [None]:
# Offer the exported artifacts as browser downloads when running in Google
# Colab. The helper module only exists there, so guard the import instead of
# letting the cell fail in other environments.
try:
    from google.colab import files
except ImportError:
    print('\nNot running in Google Colab; skipping browser download.')
    print('The exported files are in the current working directory.')
else:
    print('\nDownloading files...')
    for artifact_name in ('predictor.tflite', 'scaler_params.json', 'feature_config.json'):
        files.download(artifact_name)
    print('\n✅ Downloads initiated! Check your browser download folder.')

# Next Steps

## 1. Copy files to Flutter project

```
assets/ml_models/
├── predictor.tflite
├── scaler_params.json
└── feature_config.json
```

## 2. Run integration test

```bash
flutter test integration_test/ml/ml_prediction_flow_test.dart -d YOUR_DEVICE_ID
```

## 3. Expected test results

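The model output is the predicted abandonment probability (0–1). For the sample scenarios it should fall roughly within these ranges:

- Low-risk scenarios: 0.05-0.20
- Medium-risk scenarios: 0.35-0.55
- High-risk scenarios: 0.70-0.85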

## 4. Future retraining

When you have ≥500 real user completions in Firestore, export data and retrain for better accuracy.