In [1]:
import datetime
import glob
import json
import os

from data_manager import DataManager

# Create Competition Collections

In [None]:
def _build_competition_record(competition_id: str, competition_info: dict) -> dict:
    """Map one raw JSON competition entry onto the record handed to the database layer."""
    record = {
        'competition_id': competition_id,
        # Fall back to a title-cased id when the entry carries no explicit title.
        'title': competition_info.get('Title', competition_id.title()),
        'description': competition_info.get('Description', ''),
        'evaluation': competition_info.get('Evaluation', ''),
        'competition_host': competition_info.get('Competition Host', []),
        'price_award': competition_info.get('Prizes & Awards', []),
        'entrants': competition_info.get('Entrants', 0),
        'participants': competition_info.get('Participants', 0),
        'teams': competition_info.get('Teams', 0),
        'submissions': competition_info.get('Submissions', 0),
        'tags': competition_info.get('Tags', []),
        'competition_url': competition_info.get('competition_url', ''),
        # Stamped at import time; datetime.now() yields a naive local timestamp.
        'last_updated': datetime.datetime.now(),
    }

    # The nested 'data' section is optional; its fields are only added when present.
    if 'data' in competition_info:
        data_info = competition_info['data']
        record.update({
            'data_description': data_info.get('Description', ''),
            'data_files_num': data_info.get('Files', ''),
            'data_size': data_info.get('Size', ''),
            'data_type': data_info.get('Type', ''),
        })

    return record


def import_competitions_from_json(file_path: str):
    """Import competitions from JSON file using the new class structure.

    The file must contain a JSON object that maps each competition id to a
    dict of its attributes. Every entry is converted to a record and passed to
    ``dm.competitions.create_or_update``; a truthy return value counts as a
    successful import. Progress and a final summary are printed.

    Args:
        file_path: Path to the competitions metadata JSON file.

    Returns:
        None.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Initialize data manager
    with DataManager() as dm:
        # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            competitions_data = json.load(file)

        success_count = 0
        for competition_id, competition_info in competitions_data.items():
            # Build and store inside one try so that a single malformed entry is
            # reported and skipped instead of aborting the remaining imports.
            try:
                competition_record = _build_competition_record(competition_id, competition_info)
                result = dm.competitions.create_or_update(competition_record)
                if result:
                    success_count += 1
                    print(f"Successfully imported competition: {competition_id}")
                else:
                    print(f"Failed to import competition: {competition_id}")
            except Exception as e:
                print(f"Error importing competition {competition_id}: {str(e)}")

        print(f"\nSummary: Successfully imported {success_count} of {len(competitions_data)} competitions")

# Example usage: run the competition import against the metadata JSON file.
# NOTE(review): the path below is an absolute, machine-specific location;
# point it at your own copy of competitions_metadata.json before running.
if __name__ == "__main__":
    import_competitions_from_json(
        '/Users/zhongming/Local Docs/Github/Kaggle_RAG_dataset/data/competitions_metadata.json'
    )

Successfully imported competition: titanic

Summary: Successfully imported 1 of 1 competitions


# Create dataset collections for each competition's train and test sets

In [None]:
def _infer_dataset_type(base_name: str) -> str:
    """Classify a CSV by its filename stem: 'train', 'test', 'sample_submission' or 'unknown'."""
    lowered = base_name.lower()
    # The first matching keyword wins, so e.g. a stem containing 'train' is
    # classified as 'train' even if it also contains 'submission'.
    if 'train' in lowered:
        return 'train'
    if 'test' in lowered:
        return 'test'
    if 'submission' in lowered:
        return 'sample_submission'
    return 'unknown'


def import_datasets_from_directory(base_dir: str):
    """
    Import datasets from CSV files in competition directories using the new class structure.

    Each immediate sub-directory of ``base_dir`` is treated as a competition id.
    Directories whose id is unknown to ``dm.competitions.exists`` are skipped.
    Every ``*.csv`` directly inside a known competition directory is passed to
    ``dm.datasets.import_csv`` together with a dataset type inferred from its
    filename. Progress and a final summary are printed.

    Args:
        base_dir: Base directory containing competition folders with CSV files

    Returns:
        None.
    """
    # Initialize data manager
    with DataManager() as dm:
        # os.listdir returns entries in arbitrary order; sort for reproducible output.
        competition_dirs = sorted(
            d for d in os.listdir(base_dir)
            if os.path.isdir(os.path.join(base_dir, d))
        )

        total_datasets = 0
        total_imported = 0

        # Process each competition directory
        for competition_id in competition_dirs:
            competition_dir = os.path.join(base_dir, competition_id)

            # Check if competition exists using CompetitionManager
            if not dm.competitions.exists(competition_id):
                print(f"Competition {competition_id} does not exist in the database, skipping...")
                continue

            # Find all CSV files in the competition directory (non-recursive)
            csv_files = sorted(glob.glob(os.path.join(competition_dir, "*.csv")))

            if not csv_files:
                print(f"No CSV files found for competition {competition_id}")
                continue

            print(f"\nProcessing competition: {competition_id}")
            print(f"Found {len(csv_files)} CSV files")

            # Import each CSV file
            imported_count = 0
            for csv_file in csv_files:
                file_name = os.path.basename(csv_file)
                dataset_type = _infer_dataset_type(os.path.splitext(file_name)[0])

                print(f"Importing {file_name} as {dataset_type} dataset...")

                # Use DatasetManager to import; one failing file must not stop the rest.
                try:
                    if dm.datasets.import_csv(csv_file, competition_id, dataset_type):
                        imported_count += 1
                        print(f"Successfully imported {file_name}")
                    else:
                        print(f"Failed to import {file_name}")
                except Exception as e:
                    print(f"Error importing {file_name}: {str(e)}")

            print(f"Imported {imported_count} of {len(csv_files)} datasets for competition {competition_id}")

            total_datasets += len(csv_files)
            total_imported += imported_count

        print("\nFinal Summary:")
        print(f"Total datasets found: {total_datasets}")
        print(f"Total datasets imported: {total_imported}")
        # Guard the ratio: with nothing to import the division would raise ZeroDivisionError.
        if total_datasets > 0:
            print(f"Success rate: {(total_imported/total_datasets)*100:.2f}%")
        else:
            print("No datasets processed")

# Example usage: import every competition's CSV files found under base_directory.
# NOTE(review): the path below is an absolute, machine-specific location;
# change it to your own dataset folder before running.
if __name__ == "__main__":
    base_directory = '/Users/zhongming/Local Docs/Github/Kaggle_RAG_dataset/data/dataset'
    import_datasets_from_directory(base_directory)


Processing competition: titanic
Found 3 CSV files
Importing test.csv as test dataset...
Successfully imported test.csv
Importing train.csv as train dataset...
Successfully imported train.csv
Importing gender_submission.csv as unknown dataset...
Successfully imported gender_submission.csv
Imported 3 of 3 datasets for competition titanic

Final Summary:
Total datasets found: 3
Total datasets imported: 3
Success rate: 100.00%


# Create notebook collections

In [None]:
def import_notebooks_from_directory(base_dir: str):
    """
    Import notebooks from competition directories using the new class structure.

    Each immediate sub-directory of ``base_dir`` is treated as a competition id.
    Inside it the function reads ``metadata/all_notebooks_metadata.json``, a
    JSON object keyed by notebook URL whose values are dicts carrying a
    ``notebook_name``, and looks for ``<notebook_name>.ipynb`` directly in the
    competition directory. Each notebook found is passed to
    ``dm.notebooks.import_from_file`` with its metadata (plus the URL under the
    ``'url'`` key). Progress and a final summary are printed.

    Args:
        base_dir: Base directory containing competition folders with notebook files

    Returns:
        None.
    """
    # Initialize data manager
    with DataManager() as dm:
        # os.listdir returns entries in arbitrary order; sort for reproducible output.
        # The isdir filter already guarantees each competition directory exists.
        competition_dirs = sorted(
            d for d in os.listdir(base_dir)
            if os.path.isdir(os.path.join(base_dir, d))
        )

        total_notebooks = 0
        total_imported = 0

        # Process each competition directory
        for competition_id in competition_dirs:
            notebooks_dir = os.path.join(base_dir, competition_id)

            # Check if competition exists using CompetitionManager
            if not dm.competitions.exists(competition_id):
                print(f"Competition {competition_id} does not exist in the database, skipping...")
                continue

            metadata_file = os.path.join(notebooks_dir, "metadata", "all_notebooks_metadata.json")
            if not os.path.exists(metadata_file):
                print(f"No metadata file found for competition {competition_id}")
                continue

            print(f"\nProcessing competition: {competition_id}")

            # Load metadata. JSON is UTF-8 by specification, so do not rely on
            # the platform's locale encoding.
            try:
                with open(metadata_file, 'r', encoding='utf-8') as f:
                    metadata_dict = json.load(f)
            except Exception as e:
                print(f"Error loading metadata for {competition_id}: {str(e)}")
                continue

            # A JSON array or scalar has no .items(); report it instead of crashing.
            if not isinstance(metadata_dict, dict):
                print(f"Error loading metadata for {competition_id}: expected a JSON object keyed by notebook URL")
                continue

            success_count = 0
            processed_count = 0  # counts only entries whose notebook file was found

            # Process each notebook in metadata
            for url, metadata in metadata_dict.items():
                if not isinstance(metadata, dict):
                    print(f"Skipping entry with invalid metadata: {url}")
                    continue

                notebook_name = metadata.get('notebook_name')
                if not notebook_name:
                    print(f"Skipping entry with missing notebook_name: {url}")
                    continue

                # Construct notebook file path
                notebook_file = os.path.join(notebooks_dir, f"{notebook_name}.ipynb")

                if not os.path.isfile(notebook_file):
                    print(f"Notebook file not found: {notebook_file}")
                    continue

                # The URL is only the dict key in the file; add it to the metadata payload.
                metadata['url'] = url

                # Import notebook using NotebookManager; one failure must not stop the rest.
                try:
                    if dm.notebooks.import_from_file(notebook_file, competition_id, metadata):
                        success_count += 1
                        print(f"✓ Imported notebook: {notebook_name}")
                    else:
                        print(f"✗ Failed to import notebook: {notebook_name}")
                except Exception as e:
                    print(f"Error importing notebook {notebook_name}: {str(e)}")

                processed_count += 1

            print(f"Imported {success_count} of {processed_count} notebooks for {competition_id}")

            total_notebooks += processed_count
            total_imported += success_count

        # Print final summary
        print("\nFinal Summary:")
        print(f"Total notebooks processed: {total_notebooks}")
        print(f"Total notebooks imported: {total_imported}")
        if total_notebooks > 0:
            print(f"Success rate: {(total_imported/total_notebooks)*100:.2f}%")
        else:
            print("No notebooks processed")

# Example usage: import every competition's notebooks found under notebooks_base_dir.
# NOTE(review): the path below is an absolute, machine-specific location;
# change it to your own notebooks folder before running.
if __name__ == "__main__":
    notebooks_base_dir = '/Users/zhongming/Local Docs/Github/Kaggle_RAG_dataset/data/notebooks'
    import_notebooks_from_directory(notebooks_base_dir)


Processing competition: ashrae-energy-prediction
✓ Imported notebook: vladimirsydor_add-leak
✓ Imported notebook: yunishi0716_best-weight-searching3
✓ Imported notebook: aleksthegreat_public-blend
✓ Imported notebook: yamsam_ashrae-leak-validation-and-more
✓ Imported notebook: wuliaokaola_ashrae-maybe-this-can-make-public-lb-some-useful
✓ Imported notebook: vladimirsydor_bland-lgbm-on-leaks
✓ Imported notebook: vladimirsydor_bland-by-leak
✓ Imported notebook: rohanrao_ashrae-divide-and-conquer
✓ Imported notebook: teeyee314_best-single-half-half-lgbm-1-07
✓ Imported notebook: vladimirsydor_bland-lgbm-folds
✓ Imported notebook: mimoudata_ashrae-2-lightgbm-without-leak-data
✓ Imported notebook: aitude_ashrae-kfold-lightgbm-without-leak-1-08
✓ Imported notebook: purist1024_ashrae-simple-data-cleanup-lb-1-08-no-leaks
✓ Imported notebook: ragnar123_another-1-08-lb-no-leak
✓ Imported notebook: mimoudata_ashrae-lightgbm-without-leak
✓ Imported notebook: yunishi0716_k-folds-model
✓ Imported n

# Create User Profile Collection

In [2]:
# Create a demo user through DataManager. The value returned by
# dm.users.create is kept in the notebook-level name `user_id`, which the
# history-tracking cell further down reads, so this cell must run first.
with DataManager() as dm:
    # Create a user
    user_id = dm.users.create(
        username='data_scientist',
        email='ds@example.com',
        experience_level='intermediate'
    )

# Create History Tracking Collection

In [None]:
# Walk one user through a full 'titanic' session to exercise the history API.
# Depends on `user_id` defined by the user-creation cell; running this cell on
# a fresh kernel without that cell raises NameError.
# NOTE(review): the calls below are order-dependent as written (activate ->
# log -> advance -> log -> complete -> clear); the exact requirements of each
# dm.history / dm.users method are not visible here, so confirm before reordering.
with DataManager() as dm:
    # Set active competition
    dm.users.set_active_competition(user_id, 'titanic')
    dm.history.initialize_history(user_id, 'titanic')

    # Log some interactions (is_user=True for the user's message, False for the reply)
    dm.history.log_interaction(user_id, "Starting Titanic analysis", is_user=True)
    dm.history.log_interaction(user_id, "Here's some initial guidance", is_user=False)

    # Log a submission
    dm.history.log_submission(user_id, {
        'notebook_id': 'initial_analysis',
        'score': 0.85,
        'notes': 'First submission with basic model'
    })
    
    # Advance the user to the 'advanced' level for this competition
    # (presumably an experience/stage level; verify against advance_user)
    dm.history.advance_user(user_id, 'titanic', 'advanced')
    # Log a new interaction
    dm.history.log_interaction(user_id, "Advanced analysis on Titanic dataset", is_user=True)
    # Log a new submission
    dm.history.log_submission(user_id, {
        'notebook_id': 'advanced_analysis',
        'score': 0.90,
        'notes': 'Improved model with feature engineering'
    })
    # Mark the competition history as complete
    dm.history.complete_competition(user_id, 'titanic')
    
    # Exit the competition by clearing the user's active competition
    dm.users.clear_active_competition(user_id)