<a href="https://colab.research.google.com/github/davis-j11-msdae/IE7374-Summer-2025-Group-5-Project/blob/main/colab_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/davis-j11-msdae/IE7374-Summer-2025-Group-5-Project.git

import os
curr=os.getcwd()
os.chdir(curr+'//IE7374-Summer-2025-Group-5-Project')
os.getcwd()

Cloning into 'IE7374-Summer-2025-Group-5-Project'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 34 (delta 2), reused 22 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (34/34), 46.19 KiB | 1.85 MiB/s, done.
Resolving deltas: 100% (2/2), done.


'/content/IE7374-Summer-2025-Group-5-Project'

In [2]:
!pip install dotenv
!pip install textstat
!pip install detoxify
!pip install deepspeed

Collecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting python-dotenv (from dotenv)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Downloading dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Downloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, dotenv
Successfully installed dotenv-0.9.9 python-dotenv-1.1.1
Collecting textstat
  Downloading textstat-0.7.7-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting cmudict (from textstat)
  Downloading cmudict-1.0.33-py3-none-any.whl.metadata (3.6 kB)
Downloading textstat-0.7.7-py3-none-any.whl (175 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cmudict-1.0.33-py3-none-any.whl (939 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.4/939.4 

In [3]:
#!/usr/bin/env python3
"""
Generate user accounts for the storytelling system.
Creates 5 users for each age group with standard naming.
"""

import os
from pathlib import Path


def generate_users():
    """Write the test user accounts to ``data/users/users.txt``.

    The file is a comma-separated listing with a ``username,age,password``
    header followed by one row per user. Usernames are ``<group>_<n>`` with
    ``n`` starting at 1 inside each age group, and every account uses the
    password ``test``. The target directory is created when missing and an
    existing file is overwritten. Prints a short confirmation and returns
    ``None``.
    """
    ages_by_group = {
        'child': [3, 4, 5, 5, 4],
        'kid': [7, 9, 11, 8, 10],
        'teen': [14, 16, 17, 15, 13],
        'adult': [25, 32, 28, 45, 38]
    }

    target = Path("data/users") / "users.txt"
    target.parent.mkdir(parents=True, exist_ok=True)

    # Assemble every line first, then write the file in one go.
    rows = ["username,age,password\n"]
    for group_name, group_ages in ages_by_group.items():
        rows.extend(
            f"{group_name}_{position},{age},test\n"
            for position, age in enumerate(group_ages, start=1)
        )

    with open(target, 'w') as handle:
        handle.writelines(rows)

    print(f"Generated users file: {target}")
    print("Created 20 users (5 per age group)")


# Run the generator when this code is executed directly (as a script or as a
# notebook cell) rather than imported as a module.
if __name__ == "__main__":
    generate_users()

Generated users file: data/users/users.txt
Created 20 users (5 per age group)


In [4]:
#!/usr/bin/env python3
"""
Main control script for the Personalized Storytelling System.
Provides a menu-driven interface for all system operations.
"""

import os
import sys
from pathlib import Path
from utils.helpers import log_operation_status
from google.colab import userdata

# Copy the Colab secrets into the process environment so that code reading
# os.environ can see them. `os` is imported here explicitly: the cell must
# not depend on an earlier cell having imported it.
for _secret_name in ("OPENAI_API_KEY", "HF_TOKEN", "KAGGLE_USERNAME", "KAGGLE_KEY"):
    os.environ[_secret_name] = userdata.get(_secret_name)

def main_menu():
    """Display the main menu in a loop and dispatch the selected operation.

    Reads the selection with ``input()``; surrounding whitespace is ignored.
    Options 1-9 call the matching handler and then show the menu again.
    Option 10 prints a goodbye message and returns ``None``. Any other input
    prints an error and re-prompts.

    The function returns instead of calling ``sys.exit(0)``: inside a notebook
    kernel ``SystemExit`` is reported as an error ("SystemExit: 0") instead of
    ending the cell cleanly. When run as a script the caller simply finishes,
    so the process still exits with status 0.
    """
    while True:
        print("\n" + "=" * 60)
        print("🎭 PERSONALIZED STORYTELLING SYSTEM")
        print("=" * 60)
        print("1. Check Environment")
        print("2. Download Data and Models")
        print("3. Process Raw Data")
        print("4. Tokenize Data")
        print("5. Evaluate Data")
        print("6. Train Model")
        print("7. Process Sample Stories")
        print("8. Interactive Story Creation")
        print("9. Generate Users File")
        print("10. Exit")
        print("=" * 60)

        choice = input("Select option (1-10): ").strip()

        if choice == "1":
            check_environment()
        elif choice == "2":
            download_data()
        elif choice == "3":
            process_data()
        elif choice == "4":
            tokenize_data()
        elif choice == "5":
            evaluate_data()
        elif choice == "6":
            train_model()
        elif choice == "7":
            process_samples()
        elif choice == "8":
            interactive_stories()
        elif choice == "9":
            generate_users()
        elif choice == "10":
            print("👋 Goodbye!")
            # Leave the loop normally; see the docstring for why this is not
            # sys.exit(0).
            return
        else:
            print("❌ Invalid choice. Please try again.")


def check_environment():
    """Run the project's environment check and print its report.

    Logs the operation, imports the checker from ``src.environment_check``
    on demand, runs it and hands the result to the report printer. Import
    errors and any other ``Exception`` are reported on stdout rather than
    propagated. Finally waits for the user to press Enter.
    """
    log_operation_status("Environment check")

    try:
        from src.environment_check import run_full_environment_check, print_environment_report

        print("\n🔍 Checking system environment...")
        print_environment_report(run_full_environment_check())

    except ImportError as exc:
        print(f"❌ Error importing environment check: {exc}")
    except Exception as exc:
        print(f"❌ Environment check failed: {exc}")

    input("\nPress Enter to continue...")


def download_data():
    """Describe the download step, ask for confirmation and run it.

    The downloader (``src.download_data.main``) is imported on demand and is
    only called when the user answers ``y`` (case-insensitive, surrounding
    whitespace ignored); any other answer cancels. Import errors and any
    other ``Exception`` are reported on stdout rather than propagated.
    Finally waits for the user to press Enter.
    """
    log_operation_status("Data download")

    try:
        from src.download_data import main as download_main

        print("\n".join((
            "\n📦 Starting data and model download...",
            "This will download:",
            "  - Children's stories dataset from Kaggle",
            "  - Sci-fi stories dataset from Kaggle",
            "  - Mixtral 8x7B base model from Hugging Face",
            "\nNote: This may take 30-60 minutes and requires ~50GB storage",
        )))

        answer = input("\nProceed with download? (y/N): ").strip().lower()
        if answer != 'y':
            print("Download cancelled.")
        else:
            download_main()

    except ImportError as exc:
        print(f"❌ Error importing download module: {exc}")
    except Exception as exc:
        print(f"❌ Download failed: {exc}")

    input("\nPress Enter to continue...")


def process_data():
    """Describe the raw-data processing step and run it.

    Imports ``src.data_loader.main`` on demand and calls it after printing a
    summary of the step. Import errors and any other ``Exception`` are
    reported on stdout rather than propagated. Finally waits for the user to
    press Enter.
    """
    log_operation_status("Data processing")

    try:
        from src.data_loader import main as loader_main

        print("\n".join((
            "\n⚙️ Processing raw data files...",
            "This will:",
            "  - Extract stories from downloaded text files",
            "  - Clean and filter content",
            "  - Assign age groups to stories",
            "  - Save processed datasets",
        )))

        loader_main()

    except ImportError as exc:
        print(f"❌ Error importing data loader: {exc}")
    except Exception as exc:
        print(f"❌ Data processing failed: {exc}")

    input("\nPress Enter to continue...")


def tokenize_data():
    """Describe the tokenization step and run it.

    Imports ``src.data_tokenizer.main`` on demand and calls it after printing
    a summary of the step. Import errors and any other ``Exception`` are
    reported on stdout rather than propagated. Finally waits for the user to
    press Enter.
    """
    log_operation_status("Data tokenization")

    try:
        from src.data_tokenizer import main as tokenizer_main

        print("\n".join((
            "\n🔤 Tokenizing processed datasets...",
            "This will:",
            "  - Load processed story datasets",
            "  - Format stories with age-appropriate instructions",
            "  - Tokenize using Mixtral tokenizer",
            "  - Create train/validation/test splits",
        )))

        tokenizer_main()

    except ImportError as exc:
        print(f"❌ Error importing tokenizer: {exc}")
    except Exception as exc:
        print(f"❌ Tokenization failed: {exc}")

    input("\nPress Enter to continue...")


def evaluate_data():
    """Describe the dataset evaluation step and run it.

    Imports ``src.eval.main`` on demand and calls it after printing a summary
    of the step. Import errors and any other ``Exception`` are reported on
    stdout rather than propagated. Finally waits for the user to press Enter.
    """
    log_operation_status("Data evaluation")

    try:
        from src.eval import main as eval_main

        print("\n".join((
            "\n📊 Evaluating processed datasets...",
            "This will:",
            "  - Analyze text quality (grammar, coherence)",
            "  - Calculate readability scores",
            "  - Check content safety (toxicity)",
            "  - Generate evaluation statistics",
            "\nNote: Requires OpenAI API key for grammar/coherence evaluation",
        )))

        eval_main()

    except ImportError as exc:
        print(f"❌ Error importing evaluation module: {exc}")
    except Exception as exc:
        print(f"❌ Evaluation failed: {exc}")

    input("\nPress Enter to continue...")


def train_model():
    """Describe the training step, ask for confirmation and run it.

    The trainer (``src.train.main``) is imported on demand and is only called
    when the user answers ``y`` (case-insensitive, surrounding whitespace
    ignored); any other answer cancels. Import errors and any other
    ``Exception`` are reported on stdout rather than propagated. Finally
    waits for the user to press Enter.
    """
    log_operation_status("Model training")

    try:
        from src.train import main as train_main

        print("\n".join((
            "\n🚀 Training storytelling model...",
            "This will:",
            "  - Load Mixtral 8x7B base model",
            "  - Fine-tune on processed story datasets",
            "  - Use DeepSpeed for memory optimization",
            "  - Save fine-tuned model",
            "\nNote: Requires significant GPU memory and time (1-3 hours)",
        )))

        answer = input("\nProceed with training? (y/N): ").strip().lower()
        if answer != 'y':
            print("Training cancelled.")
        else:
            train_main()

    except ImportError as exc:
        print(f"❌ Error importing training module: {exc}")
    except Exception as exc:
        print(f"❌ Training failed: {exc}")

    input("\nPress Enter to continue...")


def process_samples():
    """Describe the sample-story step and run it.

    Imports ``src.samples.main`` on demand and calls it after printing a
    summary of the step. Import errors and any other ``Exception`` are
    reported on stdout rather than propagated. Finally waits for the user to
    press Enter.
    """
    log_operation_status("Sample processing")

    try:
        from src.samples import main as samples_main

        print("\n".join((
            "\n📝 Processing sample stories...",
            "This will:",
            "  - Generate stories for 10 sample prompts",
            "  - Test all age groups (child, kid, teen, adult)",
            "  - Include story continuation examples",
            "  - Save stories to user history",
            "  - Generate comprehensive evaluation report",
        )))

        samples_main()

    except ImportError as exc:
        print(f"❌ Error importing samples module: {exc}")
    except Exception as exc:
        print(f"❌ Sample processing failed: {exc}")

    input("\nPress Enter to continue...")


def interactive_stories():
    """Describe the interactive story session and start it.

    Imports ``src.model_runner.main`` on demand and calls it after printing a
    summary of the session. Import errors and any other ``Exception`` are
    reported on stdout rather than propagated. Finally waits for the user to
    press Enter.
    """
    log_operation_status("Interactive story session")

    try:
        from src.model_runner import main as runner_main

        print("\n".join((
            "\n🎭 Starting interactive story creation...",
            "This will:",
            "  - Authenticate user credentials",
            "  - Generate personalized stories",
            "  - Support story continuation",
            "  - Manage story history",
        )))

        runner_main()

    except ImportError as exc:
        print(f"❌ Error importing model runner: {exc}")
    except Exception as exc:
        print(f"❌ Interactive session failed: {exc}")

    input("\nPress Enter to continue...")


def generate_users():
    """Menu wrapper: describe the user-generation step and run it.

    Imports ``generate_users`` from the ``generate_users`` module on demand
    and calls it after printing a summary. Import errors and any other
    ``Exception`` are reported on stdout rather than propagated. Finally
    waits for the user to press Enter.
    """
    log_operation_status("User generation")

    try:
        # Aliased so the imported worker is not confused with this wrapper,
        # which carries the same name.
        from generate_users import generate_users as write_users_file

        print("\n".join((
            "\n👥 Generating users file...",
            "This will create 20 users (5 per age group) with credentials:",
            "  - Usernames: child_1, kid_1, teen_1, adult_1, etc.",
            "  - Password: 'test' for all users",
            "  - Ages distributed across age groups",
        )))

        write_users_file()

    except ImportError as exc:
        print(f"❌ Error importing user generator: {exc}")
    except Exception as exc:
        print(f"❌ User generation failed: {exc}")

    input("\nPress Enter to continue...")


def setup_directories():
    """Ensure the directories named in the config, plus ``logs``, exist.

    Loads the configuration via ``load_config()``, looks up the entries of
    its ``paths`` section listed below and passes each one, followed by
    ``"logs"``, to ``ensure_dir_exists``. Any ``Exception`` raised while
    loading the config or handling a directory is reported as a warning on
    stdout rather than propagated.
    """
    from utils.helpers import ensure_dir_exists, load_config

    path_keys = (
        'data_root',
        'data_raw',
        'data_processed',
        'data_tokenized',
        'data_evaluated',
        'models',
        'outputs',
        'user_history',
        'samples',
        'users',
    )

    try:
        configured = load_config()['paths']

        # Resolve every configured path up front, then add the fixed log dir.
        targets = [configured[key] for key in path_keys]
        targets.append("logs")

        for target in targets:
            ensure_dir_exists(target)

    except Exception as exc:
        print(f"⚠️ Warning: Could not create directories: {exc}")


def display_welcome():
    """Print the welcome banner: title, short description and feature list."""
    rule = "=" * 60
    banner = (
        "🎭 PERSONALIZED STORYTELLING SYSTEM",
        rule,
        "An AI-powered storytelling system using Mixtral 8x7B",
        "Features:",
        "  • Age-appropriate story generation (child, kid, teen, adult)",
        "  • Story history and continuation",
        "  • Quality evaluation and safety filtering",
        "  • Interactive user sessions",
        rule,
    )
    print("\n".join(banner))


def main():
    """Main entry point: prepare directories, greet the user, run the menu.

    Returns early, with a message, when ``configs/model_config.yaml`` does
    not exist. A missing ``data/users/users.txt`` only produces a hint.
    ``KeyboardInterrupt`` raised from the menu ends the session with a
    goodbye message; any other ``Exception`` is reported together with its
    traceback.
    """
    setup_directories()
    display_welcome()

    # Stop before showing the menu when the config file is missing.
    if not Path("configs/model_config.yaml").exists():
        print("\n⚠️ Configuration file not found!")
        print("Please ensure configs/model_config.yaml exists.")
        return

    # A missing users file is not fatal; option 9 can create it.
    if not Path("data/users/users.txt").exists():
        print("\n💡 Users file not found. Run option 9 to generate users first.")

    try:
        main_menu()
    except KeyboardInterrupt:
        print("\n\n👋 System interrupted by user. Goodbye!")
    except Exception as exc:
        print(f"\n❌ Unexpected error: {exc}")
        import traceback
        traceback.print_exc()


# Start the menu-driven system when this code is executed directly (as a
# script or as a notebook cell) rather than imported as a module.
if __name__ == "__main__":
    main()

🎭 PERSONALIZED STORYTELLING SYSTEM
An AI-powered storytelling system using Mixtral 8x7B
Features:
  • Age-appropriate story generation (child, kid, teen, adult)
  • Story history and continuation
  • Quality evaluation and safety filtering
  • Interactive user sessions

🎭 PERSONALIZED STORYTELLING SYSTEM
1. Check Environment
2. Download Data and Models
3. Process Raw Data
4. Tokenize Data
5. Evaluate Data
6. Train Model
7. Process Sample Stories
8. Interactive Story Creation
9. Generate Users File
10. Exit
Select option (1-10): 1
[20:32:10] Environment check started

🔍 Checking system environment...
[20:32:11] Environment check started
[2025-07-18 20:32:14,658] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-18 20:32:19,104] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False

ENVIRONMENT CHECK REPORT

🔑 Environment Variables:
  ✅ OPENAI_API_KEY
  ✅ HF_TOKEN
  ✅ KAGGLE_USERNAME


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

  ✅ Tokenizer downloaded


config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/92.7k [00:00<?, ?B/s]

Fetching 19 files:   0%|          | 0/19 [00:00<?, ?it/s]

model-00008-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00006-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00001-of-00019.safetensors:   0%|          | 0.00/4.89G [00:00<?, ?B/s]

model-00007-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00004-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00005-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00009-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00010-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00011-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00012-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00013-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00014-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00015-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00016-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00017-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00018-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00019-of-00019.safetensors:   0%|          | 0.00/4.22G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

  ✅ Model downloaded
[20:48:28] Base model download completed

📋 Download Summary:
  ✅ dataset_children_stories
  ✅ dataset_scifi_stories
  ✅ base_model

✅ All downloads completed successfully!
[20:48:28] Data and model download completed

Press Enter to continue...

🎭 PERSONALIZED STORYTELLING SYSTEM
1. Check Environment
2. Download Data and Models
3. Process Raw Data
4. Tokenize Data
5. Evaluate Data
6. Train Model
7. Process Sample Stories
8. Interactive Story Creation
9. Generate Users File
10. Exit
Select option (1-10): 3
[20:48:41] Data processing started

⚙️ Processing raw data files...
This will:
  - Extract stories from downloaded text files
  - Clean and filter content
  - Assign age groups to stories
  - Save processed datasets
[20:48:41] Data loading and processing started

Processed datasets cache found at data/processed. Overwrite? (y/N): y
[20:48:52] Raw data processing started
[20:48:52] Loading children_stories started
  ✅ Loaded cleaned_merged_fairy_tales_without_eos.

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
