# HABITUS FAITH - ML Training Pipeline (Google Colab)

Complete notebook for training the habit-abandonment prediction model.

## Requirements
- None, runs in Colab

## Expected outputs
- predictor.tflite
- scaler_params.json
- training_data.csv (reference copy of the synthetic training data)

## Estimated time: 5-10 minutes

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import tensorflow as tf
from tensorflow import keras
import json
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

In [None]:
def generate_synthetic_training_data(n_records=300, seed=42):
    """Generate a synthetic habit-completion dataset for model training.

    Every record is drawn from one of five behavioural archetypes. The
    archetype fixes the sampling ranges for the features and a baseline
    abandonment probability, which is then raised by situational risk
    factors (late hour, weekend, many recent failures, short streak, long
    delay since the reminder) before the binary label is sampled.

    Args:
        n_records: Number of rows to generate. Zero or a negative value
            yields an empty frame that still carries every column.
        seed: Value passed to ``np.random.seed``. The default reproduces the
            historical dataset; pass ``None`` for a non-deterministic draw.

    Returns:
        pandas.DataFrame with the columns ``hourOfDay``, ``dayOfWeek``,
        ``streakAtTime``, ``failuresLast7Days``, ``hoursFromReminder`` and
        the 0/1 label ``abandoned``.
    """
    # Seeds NumPy's *global* generator, exactly as before, so the default
    # call keeps producing the same records.
    np.random.seed(seed)

    columns = [
        'hourOfDay',
        'dayOfWeek',
        'streakAtTime',
        'failuresLast7Days',
        'hoursFromReminder',
        'abandoned',
    ]

    # All ranges are inclusive on both ends; weights must sum to 1.
    archetypes = {
        'successful_weekday_morning': {
            'weight': 0.25,
            'hour_range': (6, 9),
            'preferred_days': [1, 2, 3, 4, 5],
            'streak_range': (10, 30),
            'failure_range': (0, 2),
            'abandonment_rate': 0.10
        },
        'struggling_evening': {
            'weight': 0.20,
            'hour_range': (20, 23),
            'preferred_days': [1, 2, 3, 4, 5, 6, 7],
            'streak_range': (3, 8),
            'failure_range': (3, 5),
            'abandonment_rate': 0.60
        },
        'weekend_failer': {
            'weight': 0.15,
            'hour_range': (10, 22),
            'preferred_days': [6, 7],
            'streak_range': (5, 15),
            'failure_range': (2, 4),
            'abandonment_rate': 0.75
        },
        'inconsistent': {
            'weight': 0.25,
            'hour_range': (7, 21),
            'preferred_days': [1, 2, 3, 4, 5, 6, 7],
            'streak_range': (0, 5),
            'failure_range': (4, 7),
            'abandonment_rate': 0.50
        },
        'highly_motivated': {
            'weight': 0.15,
            'hour_range': (6, 8),
            'preferred_days': [1, 2, 3, 4, 5, 6, 7],
            'streak_range': (15, 30),
            'failure_range': (0, 1),
            'abandonment_rate': 0.05
        }
    }

    # Loop-invariant: the candidates and their probabilities never change.
    archetype_names = list(archetypes)
    archetype_weights = [arch['weight'] for arch in archetypes.values()]

    records = []
    for _ in range(n_records):
        # NOTE: the order of the random draws below is part of the contract;
        # reordering them would change the dataset produced for a given seed.
        archetype = archetypes[np.random.choice(archetype_names, p=archetype_weights)]

        hour_lo, hour_hi = archetype['hour_range']
        streak_lo, streak_hi = archetype['streak_range']
        fail_lo, fail_hi = archetype['failure_range']

        # randint's upper bound is exclusive, hence the +1 on inclusive ranges.
        hour = np.random.randint(hour_lo, hour_hi + 1)
        day = np.random.choice(archetype['preferred_days'])
        streak = np.random.randint(streak_lo, streak_hi + 1)
        failures = np.random.randint(fail_lo, fail_hi + 1)
        hours_from_reminder = np.random.randint(0, 12)  # 0..11

        base_risk = archetype['abandonment_rate']
        if hour >= 22:
            base_risk += 0.15
        if day in [6, 7]:
            base_risk += 0.10
        if failures >= 5:
            base_risk += 0.20
        if streak <= 2:
            base_risk += 0.15
        if hours_from_reminder > 8:
            base_risk += 0.10

        # base_risk may exceed 1.0; the comparison then always succeeds,
        # which is the same as clamping the probability to 1.
        abandoned = 1 if np.random.random() < base_risk else 0

        records.append({
            'hourOfDay': hour,
            'dayOfWeek': day,
            'streakAtTime': streak,
            'failuresLast7Days': failures,
            'hoursFromReminder': hours_from_reminder,
            'abandoned': abandoned
        })

    # Passing the columns explicitly keeps the schema intact even when no
    # records were generated (an empty list would otherwise give a frame
    # with no columns at all).
    return pd.DataFrame(records, columns=columns)

In [None]:
# Build the synthetic dataset, then report its size and class balance.
print('Generating 300 synthetic training records...')
df = generate_synthetic_training_data(300)
total_records = len(df)
print(f'Generated {total_records} records')

# Count each class once and reuse the numbers in the report lines below.
abandoned_count = df['abandoned'].sum()
completed_count = (~df['abandoned'].astype(bool)).sum()

print('\nClass distribution:')
print(f"   - Completed (0): {completed_count} ({completed_count/total_records*100:.1f}%)")
print(f"   - Abandoned (1): {abandoned_count} ({abandoned_count/total_records*100:.1f}%)")
print('\nFirst 5 records:')
print(df.head())

In [None]:
# Summary statistics plus a 2x3 figure: one histogram per feature, split by
# outcome, and a correlation heatmap in the last panel.
print('\nFeature Statistics:')
print(df.describe())

fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# (column, number of bins, panel title, x-axis label), in row-major panel order.
histogram_panels = [
    ('hourOfDay', 24, 'Completion Hour Distribution', 'Hour of Day'),
    ('dayOfWeek', 7, 'Day of Week Distribution', 'Day (1=Mon, 7=Sun)'),
    ('streakAtTime', 20, 'Streak Distribution', 'Streak Length'),
    ('failuresLast7Days', 8, 'Failures Last 7 Days', 'Number of Failures'),
    ('hoursFromReminder', 12, 'Hours From Reminder', 'Hours'),
]

completed_rows = df[df['abandoned'] == 0]
abandoned_rows = df[df['abandoned'] == 1]

# zip() stops after the five histogram specs, leaving the sixth panel
# (axes[1, 2]) free for the heatmap.
for panel, (column, n_bins, title, x_label) in zip(axes.flat, histogram_panels):
    panel.hist(
        [completed_rows[column], abandoned_rows[column]],
        bins=n_bins,
        label=['Completed', 'Abandoned'],
        alpha=0.7,
    )
    panel.set_title(title)
    panel.set_xlabel(x_label)
    panel.legend()

heatmap_panel = axes[1, 2]
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm', ax=heatmap_panel)
heatmap_panel.set_title('Feature Correlation')

plt.tight_layout()
plt.show()
print('\nData exploration complete!')

In [None]:
# Train a logistic-regression baseline and a small Keras network on the
# synthetic data, then plot the network's learning curves.
print('='*60)
print('TRAINING ML MODEL')
print('='*60)

# The column order here defines the model's input order; any client that
# feeds the exported model must supply features in exactly this order.
feature_cols = ['hourOfDay', 'dayOfWeek', 'streakAtTime', 'failuresLast7Days', 'hoursFromReminder']
X = df[feature_cols].values
y = df['abandoned'].values

# Stratified hold-out so both splits keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f'\nData split:')
print(f'   Training: {len(X_train)} samples')
print(f'   Testing: {len(X_test)} samples')

# Fit the scaler on training data only; the test set is merely transformed.
# This is the scaler whose parameters are exported for on-device inference.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'\nTraining Logistic Regression...')
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, lr_pred)

print(f'Logistic Regression Accuracy: {lr_accuracy:.2%}')
print(f'\nClassification Report:')
print(classification_report(y_test, lr_pred, target_names=['Completed', 'Abandoned']))

print(f'\nTraining Keras Neural Network...')
keras_model = keras.Sequential([
    # Input width follows the feature list so the two cannot drift apart.
    keras.layers.Input(shape=(len(feature_cols),)),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

keras_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

# Early stopping picks the best epoch by validation loss. Validating on the
# test set would tune the model on the very data used for the final score,
# so a separate validation split is carved out of the training data and the
# test set stays untouched until evaluate() below.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42, stratify=y_train
)

history = keras_model.fit(
    X_fit, y_fit,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=0
)

# Unbiased estimate: the test set took no part in training or model selection.
keras_loss, keras_accuracy = keras_model.evaluate(X_test_scaled, y_test, verbose=0)
print(f'Keras Model Accuracy: {keras_accuracy:.2%}')
print(f'   Loss: {keras_loss:.4f}')

# Learning curves: loss on the left, accuracy on the right.
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()
print('\nTraining complete!')

In [None]:
# Convert the trained Keras model to TFLite and write it, the scaler
# parameters, and a copy of the training data to the working directory.
print('='*60)
print('EXPORTING TO TFLITE')
print('='*60)

converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()

with open('predictor.tflite', 'wb') as f:
    f.write(tflite_model)

# The network is tiny, so a size in MB rounds to "0.00"; report KB instead.
# tflite_size_mb is still defined for any later cell that relies on it.
tflite_size_kb = len(tflite_model) / 1024
tflite_size_mb = tflite_size_kb / 1024
print(f'TFLite model saved: predictor.tflite')
print(f'   Size: {tflite_size_kb:.2f} KB')

# Per-feature mean and scale of the fitted StandardScaler, so a client can
# apply the same (x - mean) / scale normalization before inference.
scaler_params = {
    'mean': scaler.mean_.tolist(),
    'scale': scaler.scale_.tolist()
}

with open('scaler_params.json', 'w') as f:
    json.dump(scaler_params, f, indent=2)

print(f'Scaler params saved: scaler_params.json')

df.to_csv('training_data.csv', index=False)
print(f'Training data saved: training_data.csv')

print('\n' + '='*60)
print('EXPORT COMPLETE!')
print('='*60)
print(f'\nOutput files:')
print(f'   1. predictor.tflite ({tflite_size_kb:.2f} KB)')
print(f'   2. scaler_params.json')
print(f'   3. training_data.csv (reference)')

In [None]:
from google.colab import files

# Trigger a browser download for each exported artifact, in the same order
# they were written by the export step.
print('Downloading files...')
for artifact_name in ('predictor.tflite', 'scaler_params.json', 'training_data.csv'):
    files.download(artifact_name)
print('Downloads initiated! Check your browser download folder.')

# Flutter Integration Instructions

## 1. Add files to your Flutter project

Place the downloaded files in:

```
assets/ml_models/
├── predictor.tflite
└── scaler_params.json
```

## 2. Update pubspec.yaml

```yaml
flutter:
  assets:
    - assets/ml_models/predictor.tflite
    - assets/ml_models/scaler_params.json
```

## 3. Add dependency

```yaml
dependencies:
  tflite_flutter: ^0.10.4
```

## 4. Implementation Example

```dart
import 'package:tflite_flutter/tflite_flutter.dart';
import 'dart:convert';
import 'package:flutter/services.dart';

/// Runs the exported TFLite abandonment model on-device.
///
/// Call [initialize] once before [predictAbandonmentRisk], and [dispose]
/// when the predictor is no longer needed.
class AbandonmentPredictor {
  Interpreter? _interpreter;
  Map<String, dynamic>? _scalerParams;

  /// Loads the TFLite model and the scaler parameters from the asset bundle.
  Future<void> initialize() async {
    _interpreter = await Interpreter.fromAsset('assets/ml_models/predictor.tflite');
    final scalerJson = await rootBundle.loadString('assets/ml_models/scaler_params.json');
    _scalerParams = json.decode(scalerJson) as Map<String, dynamic>;
  }

  /// Returns the model output for [habit]; the training notebook uses a
  /// sigmoid output layer, so the value is a probability-like score.
  ///
  /// Throws a [StateError] if [initialize] has not completed (or [dispose]
  /// was already called), instead of failing on an opaque null check.
  Future<double> predictAbandonmentRisk(Habit habit) async {
    final interpreter = _interpreter;
    if (interpreter == null || _scalerParams == null) {
      throw StateError(
          'AbandonmentPredictor.initialize() must complete before predicting.');
    }

    // Feature order must match the training notebook's feature_cols:
    // hourOfDay, dayOfWeek, streakAtTime, failuresLast7Days, hoursFromReminder.
    // DateTime.weekday is 1 (Monday) .. 7 (Sunday), the same encoding the
    // training data uses for dayOfWeek.
    final features = <double>[
      (habit.preferredTime?.hour ?? 12).toDouble(),
      DateTime.now().weekday.toDouble(),
      habit.currentStreak.toDouble(),
      habit.failuresLast7Days.toDouble(),
      habit.hoursSinceLastReminder.toDouble()
    ];

    final normalized = _normalize(features);
    final output = List.filled(1, 0.0).reshape([1, 1]);
    interpreter.run([normalized], output);

    return (output[0][0] as num).toDouble();
  }

  /// Applies the exported StandardScaler: (x - mean) / scale per feature.
  List<double> _normalize(List<double> features) {
    final mean = _toDoubles(_scalerParams!['mean']);
    final scale = _toDoubles(_scalerParams!['scale']);
    if (mean.length != features.length || scale.length != features.length) {
      throw StateError(
          'scaler_params.json has ${mean.length} mean / ${scale.length} scale '
          'entries but ${features.length} features were supplied.');
    }
    return List.generate(features.length, (i) => (features[i] - mean[i]) / scale[i]);
  }

  /// Converts a decoded JSON number list to doubles.
  ///
  /// JSON numbers written without a fractional part decode as `int`, which
  /// a plain `.cast<double>()` rejects at access time; going through `num`
  /// accepts both.
  static List<double> _toDoubles(dynamic values) =>
      (values as List).map((v) => (v as num).toDouble()).toList();

  /// Releases the native interpreter. The predictor must be re-initialized
  /// before it can be used again.
  void dispose() {
    _interpreter?.close();
    _interpreter = null;
  }
}
```

## 5. Usage in BehavioralEngine

```dart
// Create the predictor and load its model/scaler assets before first use.
final predictor = AbandonmentPredictor();
await predictor.initialize();

// Model score for this habit; higher means abandonment is more likely.
final risk = await predictor.predictAbandonmentRisk(habit);

// NOTE(review): 0.75 / 0.50 are example thresholds — tune against real data.
if (risk > 0.75) {
  // High risk - intervene: make the habit easier and send encouragement.
  _reduceDifficulty(habit);
  _sendMotivationalNotification(habit);
} else if (risk > 0.50) {
  // Medium risk - gentle nudge towards a better time slot.
  _suggestOptimalTime(habit);
}
```

## 6. Next Steps

- Implement AbandonmentPredictor service
- Integrate with BehavioralEngine
- Add tests (6 realistic scenarios)
- In 2-3 weeks: Retrain with real user data
- Monthly retraining for continuous improvement

## 7. Retraining with Real Data (Future)

When you have ≥50 real user records in Firestore:

```bash
cd ml_pipeline
python export_firestore_data.py
python train_model.py
```

Then replace the files in `assets/ml_models/` and deploy a new app version.