In [1]:
import sys
import os

# Get the current notebook's directory
notebook_dir = os.path.dirname(os.path.abspath('.'))

# Add to Python path
if notebook_dir not in sys.path:
    sys.path.append(notebook_dir)

In [2]:
from data_management.data_manager import DataManager
import json
import glob
import datetime

# Create Compeition Collections

In [15]:
def import_competitions_from_json(file_path: str):
    """Import competitions from JSON file using the new class structure"""
    
    # Initialize data manager
    with DataManager() as dm:
        # Load competitions data
        with open(file_path, 'r') as file:
            competitions_data = json.load(file)

        success_count = 0
        for competition_id, competition_info in competitions_data.items():
            # Prepare competition record
            competition_record = {
                'competition_id': competition_id,
                'title': competition_info.get('Title', competition_id.title()),
                'description': competition_info.get('Description', ''),
                'evaluation': competition_info.get('Evaluation', ''),
                'competition_host': competition_info.get('Competition Host', []),
                'price_award': competition_info.get('Prizes & Awards', []),
                'entrants': competition_info.get('Entrants', 0),
                'participants': competition_info.get('Participants', 0),
                'teams': competition_info.get('Teams', 0),
                'submissions': competition_info.get('Submissions', 0),
                'tags': competition_info.get('Tags', []),
                'competition_url': competition_info.get('competition_url', ''),
                'last_updated': datetime.datetime.now()  # Automatically set
            }
            
            # Add data description if available
            if 'data' in competition_info:
                data_info = competition_info['data']
                competition_record.update({
                    'data_description': data_info.get('Description', ''),
                    'data_files_num': data_info.get('Files', ''),
                    'data_size': data_info.get('Size', ''),
                    'data_type': data_info.get('Type', '')
                })
            
            # Use the CompetitionManager to create/update
            try:
                result = dm.competitions.create_or_update(competition_record)
                if result:
                    success_count += 1
                    print(f"Successfully imported competition: {competition_id}")
                else:
                    print(f"Failed to import competition: {competition_id}")
            except Exception as e:
                print(f"Error importing competition {competition_id}: {str(e)}")

        print(f"\nSummary: Successfully imported {success_count} of {len(competitions_data)} competitions")

# Example usage
if __name__ == "__main__":
    import_competitions_from_json(
        '/Users/zhongming/Local Docs/Github/Kaggle_RAG_dataset/data/competitions_metadata.json'
    )

Successfully imported competition: titanic

Summary: Successfully imported 1 of 1 competitions


# Create dataset collection for train and test set of competition

In [3]:
def import_datasets_from_directory(base_dir: str):
    """
    Import datasets from CSV files in competition directories using the new class structure
    
    Args:
        base_dir: Base directory containing competition folders with CSV files
    """
    # Initialize data manager
    with DataManager() as dm:
        # Get all competition directories
        competition_dirs = [d for d in os.listdir(base_dir) 
                          if os.path.isdir(os.path.join(base_dir, d))]

        total_datasets = 0
        total_imported = 0

        # Process each competition directory
        for competition_id in competition_dirs:
            competition_dir = os.path.join(base_dir, competition_id)
            
            # Check if competition exists using CompetitionManager
            if not dm.competitions.exists(competition_id):
                print(f"Competition {competition_id} does not exist in the database, skipping...")
                continue
            
            # Find all CSV files in the competition directory
            csv_pattern = os.path.join(competition_dir, "*.csv")
            csv_files = glob.glob(csv_pattern)
            
            if not csv_files:
                print(f"No CSV files found for competition {competition_id}")
                continue
            
            print(f"\nProcessing competition: {competition_id}")
            print(f"Found {len(csv_files)} CSV files")
            
            # Import each CSV file
            imported_count = 0
            for csv_file in csv_files:
                file_name = os.path.basename(csv_file)
                base_name = os.path.splitext(file_name)[0]
                
                # Determine dataset type based on filename
                dataset_type = 'unknown'
                if 'train' in base_name.lower():
                    dataset_type = 'train'
                elif 'test' in base_name.lower():
                    dataset_type = 'test'
                elif 'submission' in base_name.lower() or 'submission' in base_name.lower():
                    dataset_type = 'sample_submission'
                
                print(f"Importing {file_name} as {dataset_type} dataset...")
                
                # Use DatasetManager to import
                try:
                    if dm.datasets.import_csv(csv_file, competition_id, dataset_type):
                        imported_count += 1
                        print(f"Successfully imported {file_name}")
                    else:
                        print(f"Failed to import {file_name}")
                except Exception as e:
                    print(f"Error importing {file_name}: {str(e)}")
            
            print(f"Imported {imported_count} of {len(csv_files)} datasets for competition {competition_id}")
            
            total_datasets += len(csv_files)
            total_imported += imported_count

        print(f"\nFinal Summary:")
        print(f"Total datasets found: {total_datasets}")
        print(f"Total datasets imported: {total_imported}")
        print(f"Success rate: {(total_imported/total_datasets)*100:.2f}%")

# Example usage
if __name__ == "__main__":
    base_directory = '/Users/zhongming/Local Docs/Github/Kaggle_RAG_dataset/data/dataset'
    import_datasets_from_directory(base_directory)


Processing competition: titanic
Found 3 CSV files
Importing test.csv as test dataset...
Successfully imported test.csv
Importing train.csv as train dataset...
Successfully imported train.csv
Importing gender_submission.csv as sample_submission dataset...
Successfully imported gender_submission.csv
Imported 3 of 3 datasets for competition titanic

Final Summary:
Total datasets found: 3
Total datasets imported: 3
Success rate: 100.00%


# Create notebook collections

In [7]:
import os
import json
import datetime
from typing import Dict

def import_notebooks_from_directory(base_dir: str):
    """
    Import notebooks from competition directories using DataManager and NotebookManager
    Args:
        base_dir: Base directory containing competition folders
    """
    with DataManager() as dm:

        competition_dirs = [d for d in os.listdir(base_dir) 
                          if os.path.isdir(os.path.join(base_dir, d))]

        for competition_id in competition_dirs:
            notebook_folder_path = os.path.join(base_dir, competition_id)
            metadata_file = os.path.join(notebook_folder_path, "metadata", "all_notebooks_metadata.json")
            
            if not os.path.exists(metadata_file):
                print(f"⚠️ No metadata found for {competition_id}")
                continue

            with open(metadata_file, 'r') as f:
                metadata_dict = json.load(f)

            for url, notebook_metadata in metadata_dict.items():
                notebook_name = notebook_metadata.get('notebook_name')
                if not notebook_name:
                    print(f"⚠️ Missing notebook_name in {url}")
                    continue

                notebook_file = os.path.join(notebook_folder_path, f"{notebook_name}.ipynb")
                if not os.path.isfile(notebook_file):
                    print(f"⚠️ File not found: {notebook_name}.ipynb")
                    continue

                # Convert date string to datetime
                try:
                    input_str.split(" (")[0]
                    created_at = datetime.datetime.strptime(
                        notebook_metadata['date_created'].split(" (")[0],
                        "%a %b %d %Y %H:%M:%S GMT%z"
                    ).astimezone(datetime.timezone.utc)
                except (KeyError, ValueError):
                    created_at = None

                # Prepare metrics (as expected by NotebookManager)
                metrics = {
                    'score': float(notebook_metadata.get('score', 0)),
                    'votes': int(notebook_metadata.get('votes', 0)),
                    'comments': int(notebook_metadata.get('comments', 0))
                }

                # Prepare metadata
                metadata = {
                    'url': url,
                    'created_at': created_at,
                    'downloaded': notebook_metadata.get('downloaded', False)
                }

                # Import with error handling using NotebookManager's method
                try:
                    success = dm.notebooks.import_from_file(
                        file_path=notebook_file,
                        competition_id=competition_id,
                        metrics=metrics,
                        **metadata
                    )
                    
                    if success:
                        print(f"✓ Processed {notebook_name}")
                    else:
                        print(f"✗ Failed to import {notebook_name}")
                except Exception as e:
                    print(f"✗ Failed {notebook_name}: {str(e)}")

# Example usage
if __name__ == "__main__":
    notebooks_base_dir = '/Users/zhongming/Local Docs/Github/Kaggle_RAG_dataset/data/notebooks'
    import_notebooks_from_directory(notebooks_base_dir)

✓ Processed vladimirsydor_add-leak
✓ Processed yunishi0716_best-weight-searching3
✓ Processed aleksthegreat_public-blend
✓ Processed yamsam_ashrae-leak-validation-and-more
✓ Processed wuliaokaola_ashrae-maybe-this-can-make-public-lb-some-useful
✓ Processed vladimirsydor_bland-lgbm-on-leaks
✓ Processed vladimirsydor_bland-by-leak
✓ Processed rohanrao_ashrae-divide-and-conquer
✓ Processed teeyee314_best-single-half-half-lgbm-1-07
✓ Processed vladimirsydor_bland-lgbm-folds
✓ Processed mimoudata_ashrae-2-lightgbm-without-leak-data
✓ Processed aitude_ashrae-kfold-lightgbm-without-leak-1-08
✓ Processed purist1024_ashrae-simple-data-cleanup-lb-1-08-no-leaks
✓ Processed ragnar123_another-1-08-lb-no-leak
✓ Processed mimoudata_ashrae-lightgbm-without-leak
✓ Processed yunishi0716_k-folds-model
✓ Processed hmendonca_4-ashrae-blended
✓ Processed grapestone5321_ashrae-stacking-method
✓ Processed mimoudata_ashrae-lightgbm-without-leak-data
✓ Processed iwatatakuya_ashrae-kfold-lightgbm-without-buildin

# Create User Profile Collection

In [8]:
with DataManager() as dm:
    # Create a user
    user_id = dm.users.create(
        username='data_scientist',
        email='ds@example.com',
        experience_level='intermediate'
    )

# Create History Tracking Collection

In [9]:
user_id = 'cf1f30a8-e334-4674-a860-1386963a0b50'

In [10]:

import random
import os

def create_sample_submission_file(round_num: int, score: float) -> str:
    """Helper to create temporary submission files for demo purposes"""
    filename = f"temp_submission_round_{round_num}.csv"
    # Create a simple CSV file that would pass validation
    with open(filename, 'w') as f:
        f.write("PassengerId,Survived\n")
        for i in range(892, 1310):
            f.write(f"{i},{random.random()}\n")
    return filename

def simulate_user_workflow(user_id: str, competition_id: str = 'titanic'):
    """
    Simulates a complete user interaction workflow with the competition system
    using DataManager's prepare_submission method
    """
    with DataManager() as dm:
        # 1. Set up user and competition
        print(f"\n=== Setting up user {user_id} for competition {competition_id} ===")
        dm.users.set_active_competition(user_id, competition_id)
        dm.history.initialize_history(user_id, competition_id)
        
        # 2. Initial interactions with chatbot
        print("\n=== Phase 1: Initial Chatbot Interactions ===")
        dm.history.log_interaction(user_id, "Hi, I want to participate in the Titanic competition", is_user=True)
        dm.history.log_interaction(user_id, "Welcome! Let me help you get started with some basic concepts.", is_user=False)
        dm.history.log_interaction(user_id, "What's the first step I should take?", is_user=True)
        
        # 3. Chatbot sends first notebook (based on quality scores)
        print("\n=== Phase 2: First Notebook Recommendation ===")
        notebook = dm.notebooks.get_by_score(competition_id, 'quality', min_score=0.8, limit=1)[0]
        dm.history.log_interaction(
            user_id, 
            f"Here's a high-quality notebook to get you started: {notebook['notebook_id']} (Score: {notebook['scores']['quality']:.2f})", 
            is_user=False
        )
        
        # 4. User digests and submits first submission
        print("\n=== Phase 3: First Submission ===")
        dm.history.log_interaction(user_id, "I've reviewed the notebook and ready to submit!", is_user=True)
        
        # Create a sample submission file
        submission_file = create_sample_submission_file(1, 0.0)
        first_score = round(random.uniform(0.7, 0.85), 4)
        
        # Use prepare_submission which handles validation and proper submission logging
        result = dm.prepare_submission(
            user_id=user_id,
            submission_file=submission_file,
            message="First submission based on recommended notebook"
        )
        
        # Clean up temporary file
        os.remove(submission_file)
        
        if result['status'] != 'success':
            print(f"First submission failed: {result['message']}")
            return
        
        print(f"First submission result: {result}")
        
        # 5-6. Continue interactions and improvements through 6 rounds
        current_round = 2
        while current_round <= 6:
            print(f"\n=== Round {current_round}: Improvement Cycle ===")
            
            # Chatbot interaction
            dm.history.log_interaction(
                user_id, 
                f"What should I focus on to improve beyond my current score?", 
                is_user=True
            )
            
            # Chatbot recommends another notebook
            notebook = dm.notebooks.get_by_score(
                competition_id, 
                'quality', 
                min_score=0.8 + (current_round * 0.02),
                limit=1
            )[0]
            dm.history.log_interaction(
                user_id, 
                f"Check out this notebook for advanced techniques: {notebook['notebook_id']}", 
                is_user=False
            )
            
            # Create improved submission file
            submission_file = create_sample_submission_file(current_round, 0.0)
            
            # Use prepare_submission for proper handling
            result = dm.prepare_submission(
                user_id=user_id,
                submission_file=submission_file,
                message=f"Improved submission after round {current_round}"
            )
            
            # Clean up temporary file
            os.remove(submission_file)
            
            if result['status'] != 'success':
                print(f"Round {current_round} submission failed: {result['message']}")
                continue
            
            print(f"Round {current_round} submission result: {result}")
            current_round += 1
        
        # 7. Complete the competition
        print("\n=== Final Phase: Competition Completion ===")
        final_score = result.get('score', 0)
        dm.history.complete_competition(user_id, final_score=final_score, notes="Completed all 6 rounds")
        dm.users.clear_active_competition(user_id)
        
        # Get final history
        history = dm.history.get_history(user_id)
        print("\n=== Competition Summary ===")
        print(f"User: {user_id}")
        print(f"Competition: {history['competition_id']}")
        print(f"Final Score: {final_score}")
        print(f"Total Submissions: {len(history['submission_history'])}")
        print("Round Scores:")
        for i in range(1, 7):
            round_data = history['round_history'].get(f'round_{i}', {})
            print(f"  Round {i}: Best Score = {round_data.get('best_score')}")

# Example usage
if __name__ == "__main__":
    # Create a test user
    with DataManager() as dm:
        user_id = dm.users.create("test_user", "test@example.com")
    
    # Run the simulation
    simulate_user_workflow(user_id)


=== Setting up user cc8216f9-dd78-4e73-a707-7c8def40d666 for competition titanic ===

=== Phase 1: Initial Chatbot Interactions ===

=== Phase 2: First Notebook Recommendation ===


IndexError: list index out of range

In [None]:
with DataManager() as dm:
    # Set active competition
    dm.users.set_active_competition(user_id, 'titanic')
    dm.history.initialize_history(user_id, 'titanic')

    # Log some interactions
    dm.history.log_interaction(user_id, "Starting Titanic analysis", is_user=True)
    dm.history.log_interaction(user_id, "Here's some initial guidance", is_user=False)

    # Log a submission
    dm.history.log_submission(user_id, {
        'notebook_id': 'initial_analysis',
        'score': 0.85,
        'notes': 'First submission with basic model'
    })
    
    # advance the user
    dm.history.advance_user(user_id, 'titanic', 'advanced')
    # Log a new interaction
    dm.history.log_interaction(user_id, "Advanced analysis on Titanic dataset", is_user=True)
    # Log a new submission
    dm.history.log_submission(user_id, {
        'notebook_id': 'advanced_analysis',
        'score': 0.90,
        'notes': 'Improved model with feature engineering'
    })
    #complete the history
    dm.history.complete_competition(user_id, 'titanic')
    
    # exit the competition
    dm.users.clear_active_competition(user_id)