# Sentio Model Training
#### (for testing outside the application)

Training notebook for mental health text classification models.

**Available models:**
- `logistic_regression` - Fast baseline, good for quick iteration
- `random_forest` - Ensemble baseline
- `lstm` - Bidirectional LSTM neural network
- `transformer` - Custom transformer encoder (no pretrained weights)

## Set paths depending on whether the notebook runs locally or in Google Colab

In [1]:
# Author: Marcus Berggren
import sys
from pathlib import Path
import nltk

# Fetch every NLTK resource used by the notebook, in a fixed order.
for nltk_resource in ('punkt_tab', 'punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# The Colab runtime registers the 'google.colab' module; a local kernel does not.
IN_COLAB = 'google.colab' in sys.modules

if not IN_COLAB:
    # Local run: the project root is two directory levels above the working directory.
    PROJECT_ROOT = Path.cwd().parent.parent
else:
    # On Colab, first upload both the 'sentio/data/' and 'sentio/ml_pipeline'
    # folders to the "Colab Notebooks" folder on Google Drive.
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT = Path('/content/drive/MyDrive/Colab Notebooks')

# Put the project root first on sys.path so the ml_pipeline package is importable.
sys.path.insert(0, str(PROJECT_ROOT))
from ml_pipeline.preprocessing.preprocessor import DataPreprocessingPipeline
preprocessor = DataPreprocessingPipeline()

DATA_DIR = PROJECT_ROOT.joinpath('data')
OUTPUT_DIR = PROJECT_ROOT.joinpath('ml_pipeline', 'sentio_results', 'increment')

print(
    f"Environment: {'Colab' if IN_COLAB else 'Local'}",
    f"Data dir: {DATA_DIR}",
    f"Output dir: {OUTPUT_DIR}",
    sep='\n',
)

[nltk_data] Downloading package punkt_tab to /home/mbx/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /home/mbx/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/mbx/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mbx/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/mbx/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Environment: Local
Data dir: /home/mbx/Documents/projects/gitlab/sentio/data
Output dir: /home/mbx/Documents/projects/gitlab/sentio/ml_pipeline/sentio_results/increment


In [None]:
import logging

# Configure the root logger at INFO level. force=True replaces handlers that
# are already attached to the root logger, so the settings below take effect
# even when logging was configured earlier in the session.
logging.basicConfig(
    force=True,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

## Configuration

**Change `MODEL_TYPE` to train different models.**

In [None]:
# --- Model selection -------------------------------------------------------
# One of: 'logistic_regression', 'random_forest', 'lstm', 'transformer'
MODEL_TYPE = 'random_forest'

# --- General training settings ---------------------------------------------
# NOTE(review): neither value is referenced by the cells visible in this
# notebook (a fixed test CSV is used instead) - confirm before removing.
TEST_SIZE = 0.2
RANDOM_STATE = 42

# --- Per-model hyperparameters, keyed by the MODEL_TYPE names above ---------
MODEL_CONFIGS = {
    # Linear baseline
    'logistic_regression': {
        'max_iter': 1000,
        'C': 1.0,
    },
    # Tree ensemble baseline, with its own TF-IDF settings
    'random_forest': {
        'n_estimators': 100,
        'max_depth': None,
        'tfidf': {
            'max_features': 10000,
        },
    },
    # Recurrent network
    'lstm': {
        'embed_dim': 128,
        'hidden_dim': 128,
        'num_layers': 2,
        'dropout': 0.2,
        'epochs': 7,
        'patience': 3,
        'batch_size': 64,
        'learning_rate': 1e-3,
        'max_seq_len': 256,
        'vocab_size': 30000,
    },
    # Transformer encoder
    'transformer': {
        'd_model': 256,
        'nhead': 8,
        'num_layers': 2,
        'dim_feedforward': 256,
        'dropout': 0.1,
        'epochs': 15,
        'patience': 3,
        'batch_size': 32,
        'learning_rate': 1e-5,
        'max_seq_len': 256,
        'vocab_size': 30000,
    },
}

# Echo the selected model and the output directory for this run.
print(
    f'Model: {MODEL_TYPE}',
    f'Output: {OUTPUT_DIR}',
    sep='\n',
)

## Load and Prepare Data

In [None]:
import pandas as pd

# The three CSV files below must be present in the 'sentio/data/' folder.
# In the Django application the data is taken from the database instead.


def _read_split(filename):
    """Read one CSV file from DATA_DIR into a DataFrame."""
    return pd.read_csv(f'{DATA_DIR}/{filename}')


# Base training split
df_train = _read_split('sentio-data-train.csv')
X_train, y_train = df_train['text_preprocessed'], df_train['label']

# Fixed test split
df_test = _read_split('sentio-data-test.csv')
X_test_fixed, y_test_fixed = df_test['text_preprocessed'], df_test['label']

# Extra rows used for incremental training
df_incremental = _read_split('sentio-data-increment.csv')
X_train_incremental, y_train_incremental = (
    df_incremental['text_preprocessed'],
    df_incremental['label'],
)



In [None]:
import numpy as np


def _print_label_distribution(name, texts, labels):
    """Print the sample count and the per-label share of one data split.

    Args:
        name: Display name of the split (e.g. 'Train').
        texts: The split's text column; only its length is used.
        labels: The split's label column.

    Each percentage is relative to the size of ``labels`` itself, so the
    shares of every split add up to 100%. The previous code divided the Test
    and Incremental counts by the size of the training split.
    """
    total = len(labels)
    print(f'{name}: {len(texts):,} samples')
    print(f'{name} label distribution:')
    unique, counts = np.unique(labels, return_counts=True)
    for label, count in zip(unique, counts):
        print(f'  {label}: {count:,} ({count / total * 100:.1f}%)')


_print_label_distribution('Train', X_train, y_train)
print()
_print_label_distribution('Test', X_test_fixed, y_test_fixed)
print()
_print_label_distribution('Incremental', X_train_incremental, y_train_incremental)

## Train Model

In [None]:
from ml_pipeline.training.trainer import ModelTrainer
from ml_pipeline.storage.handler import StorageHandler
from datetime import datetime

# Storage handler rooted at the output directory; the trainer is built on top of it.
storage = StorageHandler(OUTPUT_DIR)
trainer = ModelTrainer(storage)

# Job id = model type + creation time (YYYYMMDD_HHMMSS).
timestamp = f'{datetime.now():%Y%m%d_%H%M%S}'
job_id = '_'.join((MODEL_TYPE, timestamp))

print(f'Job ID: {job_id}')
print(f'Device: {trainer.device}')

In [None]:
# Use the selected model's settings; an unknown MODEL_TYPE yields an empty config.
config = MODEL_CONFIGS[MODEL_TYPE] if MODEL_TYPE in MODEL_CONFIGS else {}
print(f'Config for {MODEL_TYPE}:')
for option in config:
    print(f'  {option}: {config[option]}')

In [None]:
%%time
print(f'Training {MODEL_TYPE}...')

# The trainer receives the training split together with the fixed test split.
train_kwargs = {
    'model_name': MODEL_TYPE,
    'data': (X_train, y_train, X_test_fixed, y_test_fixed),
    'config': config,
    'job_id': job_id,
}
result = trainer.train(**train_kwargs)

print('Training complete!')

## Results

In [None]:
# Top-level fields of the training result.
for heading, field in (('Status', 'status'), ('Model', 'model_type'), ('Path', 'model_path')):
    print(f"{heading}: {result[field]}")

# Headline metrics, with the values aligned in one column.
print("Metrics:")
for heading, field in (
    ('Accuracy', 'accuracy'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('F1 Score', 'f1_score'),
):
    caption = f"{heading}:"
    print(f"  {caption:<10} {result['metrics'][field]:.4f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Confusion matrix and its class labels, as reported by the trainer.
cm = np.array(result['metrics']['confusion_matrix'])
labels = result['metrics']['confusion_matrix_labels']

# Draw the heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)
ax.set_title(f'Confusion Matrix - {MODEL_TYPE}')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.tight_layout()

# Save the figure next to the other outputs of this job before showing it.
cm_path = os.path.join(OUTPUT_DIR, f'{job_id}_confusion_matrix.png')
fig.savefig(cm_path, dpi=150)
plt.show()
print(f'Saved: {cm_path}')

In [None]:
print('Per-class metrics:')
report = result['metrics']['classification_report']
for label in labels:
    # Skip confusion-matrix labels that have no entry in the report.
    if label not in report:
        continue
    stats = report[label]
    print(f"{label:20} P: {stats['precision']:.3f}  R: {stats['recall']:.3f}  F1: {stats['f1-score']:.3f}  Support: {stats['support']}")

In [None]:
import json

# Write a JSON summary of this training run next to the model artefacts.
results_path = os.path.join(OUTPUT_DIR, f'{job_id}_results.json')

save_results = {
    'job_id': job_id,
    'model_type': MODEL_TYPE,
    'model_path': result['model_path'],
    'config': config,
    'data': {
        'train_samples': len(X_train),
        'test_samples': len(X_test_fixed),
        # Sorted so the class list is identical across runs (iterating a set
        # of strings depends on hash randomisation). .tolist() converts NumPy
        # scalars to plain Python values, which json.dump can serialise.
        'classes': sorted(y_train.unique().tolist()),
    },
    'metrics': result['metrics'],
    'timestamp': timestamp,
}

# Explicit encoding keeps the file independent of the platform default.
with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(save_results, f, indent=2)

print(f'Saved: {results_path}')

In [None]:
# Google colab does not have swifter installed
if IN_COLAB:
    !pip install swifter

## Test Inference

In [None]:
from ml_pipeline.inference.predictor import Predictor

# Hand-written sample sentences for a quick sanity check of the trained model.
test_texts = [
    "I feel so stressed and anxious about everything lately.",
    "Life is great, I'm feeling happy and content.",
    "I don't want to live anymore, everything is hopeless.",
    "I can't stop crying and I feel so empty inside.",
    "I am happy and stressed"
]

# Predictor backed by the model file written by the latest training run
predictor = Predictor(storage)
predictor.load(result['model_path'])

print('Sample predictions:')
print('=' * 70)

for text in test_texts:
    # Run the raw sentence through the preprocessor, then predict on the result
    processed = preprocessor._preprocess_single_text(text)
    pred = predictor.predict(processed)

    print(f'\nText: "{text}"')
    print('-' * 70)

    # Most probable label first; only that first row gets the >>> <<< markers
    ranked = sorted(pred['probabilities'].items(), key=lambda pair: -pair[1])
    for rank, (label, prob) in enumerate(ranked):
        lead, tail = ('  >>> ', ' <<<') if rank == 0 else ('      ', '')
        print(f'{lead}{label:20} {prob*100:5.1f}%{tail}')

print('\n' + '=' * 70)

## Summary

In [None]:
# Keep the accuracy of the full training run for the later comparison.
base_accuracy = result["metrics"]["accuracy"]

# One print call; each argument becomes its own output line.
print(
    '=' * 60,
    'TRAINING SUMMARY',
    '=' * 60,
    f'Model:      {MODEL_TYPE}',
    f'Job ID:     {job_id}',
    f'Accuracy:   {base_accuracy:.4f}',
    f'F1 Score:   {result["metrics"]["f1_score"]:.4f}',
    f'Model Path: {result["model_path"]}',
    '=' * 60,
    f'All outputs saved to: {OUTPUT_DIR}',
    sep='\n',
)


## Incremental training (fine-tune existing model)

In [None]:
%%time
TRAINING_MODE = 'incremental'

# Option 1: Use model from previous full training in this session
# Option 2: Specify a path to any existing model
USE_PREVIOUS_RESULT = True  # Set to False to use CUSTOM_MODEL_PATH

CUSTOM_MODEL_PATH = str(OUTPUT_DIR / 'transformer_transformer_20251130_003457.pt')  # Your best model

# `result` only exists once a training cell has run in this session. Look it
# up defensively so a fresh kernel falls back to CUSTOM_MODEL_PATH instead of
# raising NameError.
# Note: this cell overwrites `result` below, so re-running it uses the
# previous incremental result as the base model.
previous_result = globals().get('result')

if USE_PREVIOUS_RESULT and previous_result:
    BASE_MODEL_PATH = previous_result['model_path']
    base_accuracy = previous_result['metrics']['accuracy']
    print(f"Using model from this session: {BASE_MODEL_PATH}")
else:
    BASE_MODEL_PATH = CUSTOM_MODEL_PATH
    # The loaded checkpoint is not used further in this cell.
    # NOTE(review): confirm whether base metrics can be read from it to fill
    # base_accuracy for the comparison.
    checkpoint = storage.load_neural_model(BASE_MODEL_PATH)
    print(f"Using custom model: {BASE_MODEL_PATH}")
    print("Note: Run evaluation on base model first to get base_accuracy for comparison")
    base_accuracy = None  # Will need to evaluate separately

print(f"Base model: {BASE_MODEL_PATH}")

# Config for incremental
incremental_config = {
    'training_mode': TRAINING_MODE,
    'base_model_path': BASE_MODEL_PATH,
    'learning_rate': 1e-5,
    'epochs': 5,
    'patience': 3,
    'batch_size': 32,
    'max_seq_len': 256,
    'expand_vocab': True,
}

print(f'Training {MODEL_TYPE}...')

# Train incrementally, important to use fixed test data
result = trainer.train(
    model_name=MODEL_TYPE,  # Chosen in the Configuration cell
    data=(X_train_incremental, y_train_incremental, X_test_fixed, y_test_fixed),
    config=incremental_config,
    # Name the job after the model actually being trained; the prefix was
    # previously hard-coded to 'transformer'.
    job_id=f'{MODEL_TYPE}_incremental_{timestamp}',
)

print('Training complete!')

### Run the same tests again after incremental training

In [None]:
# Same sample sentences as in the earlier inference check.
test_texts = [
    "I feel so stressed and anxious about everything lately.",
    "Life is great, I'm feeling happy and content.",
    "I don't want to live anymore, everything is hopeless.",
    "I can't stop crying and I feel so empty inside.",
    "I am happy and stressed"
]

# Predictor backed by the model file written by the latest training run
predictor = Predictor(storage)
predictor.load(result['model_path'])

print('Sample predictions:')
print('=' * 70)

for text in test_texts:
    # Run the raw sentence through the preprocessor, then predict on the result
    processed = preprocessor._preprocess_single_text(text)
    pred = predictor.predict(processed)

    print(f'\nText: "{text}"')
    print('-' * 70)

    # Most probable label first; only that first row gets the >>> <<< markers
    ranked = sorted(pred['probabilities'].items(), key=lambda pair: -pair[1])
    for rank, (label, prob) in enumerate(ranked):
        lead, tail = ('  >>> ', ' <<<') if rank == 0 else ('      ', '')
        print(f'{lead}{label:20} {prob*100:5.1f}%{tail}')

print('\n' + '=' * 70)

In [None]:
# Headline metrics of the incremental run, one per output line.
print(
    f"Incremental model accuracy: {result['metrics']['accuracy']:.4f}",
    f"Incremental model F1: {result['metrics']['f1_score']:.4f}",
    sep='\n',
)

In [None]:
new_accuracy = result['metrics']['accuracy']

if base_accuracy is None:
    # No baseline was recorded, so only the new accuracy can be shown.
    print(f"Incremental model: {new_accuracy:.4f}")
    print("(Base accuracy not available - evaluate base model separately to compare)")
else:
    print(f"Base model:        {base_accuracy:.4f}")
    print(f"Incremental model: {new_accuracy:.4f}")
    print(f"Change:            {new_accuracy - base_accuracy:+.4f}")

    # Only a strict increase counts as an improvement.
    verdict = (
        "Model improved!"
        if new_accuracy > base_accuracy
        else f"Model did not improve with adding {len(X_train_incremental)} rows of data"
    )
    print(verdict)