In [2]:
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
db = client['Kaggle']
notebooks = db['notebook']

In [None]:
from pymongo import MongoClient
import json
import os
import glob

class NotebookDB:
    def __init__(self, connection_string='mongodb://localhost:27017/', db_name='notebook', collection_name='titanic'):
        """Initialize connection to MongoDB."""
        self.client = MongoClient(connection_string)
        self.db = self.client[db_name]
        self.notebooks = self.db[collection_name]
        
        # Create index on notebook_id for faster lookups
        self.notebooks.create_index('notebook_id', unique=True)
    
    def import_notebook(self, file_path):
        """Import a single Jupyter notebook (.ipynb) file into MongoDB."""
        try:
            # Check if file is a jupyter notebook
            if not file_path.endswith('.ipynb'):
                print(f"Skipping {file_path}: Not a Jupyter notebook file (.ipynb)")
                return False
            
            # Extract filename without extension as notebook_id
            notebook_id = os.path.splitext(os.path.basename(file_path))[0]
            
            # Read the notebook file
            with open(file_path, 'r', encoding='utf-8') as f:
                notebook_content = json.load(f)
            
            # Store the notebook in MongoDB with just ID and content
            self.notebooks.update_one(
                {'notebook_id': notebook_id},
                {
                    '$set': {
                        'notebook_id': notebook_id,
                        'content': notebook_content
                    }
                },
                upsert=True
            )
            
            return True
            
        except Exception as e:
            print(f"Error importing notebook {file_path}: {e}")
            return False
    
    def import_folder(self, folder_path):
        """Import all Jupyter notebooks from a folder into MongoDB.
        
        Args:
            folder_path (str): Path to the folder containing .ipynb files
            
        Returns:
            tuple: (number of successful imports, total number of .ipynb files found)
        """
        # Make sure folder path exists
        if not os.path.isdir(folder_path):
            print(f"Error: {folder_path} is not a valid directory")
            return 0, 0
        
        # Find all .ipynb files in the folder
        notebook_pattern = os.path.join(folder_path, "*.ipynb")
        notebook_files = glob.glob(notebook_pattern)
        
        if not notebook_files:
            print(f"No .ipynb files found in {folder_path}")
            return 0, 0
        
        # Import each notebook
        success_count = 0
        for file_path in notebook_files:
            notebook_id = os.path.splitext(os.path.basename(file_path))[0]
            if self.import_notebook(file_path):
                success_count += 1
                print(f"Imported: {notebook_id}")
            else:
                print(f"Failed to import: {notebook_id}")
        
        print(f"Successfully imported {success_count} of {len(notebook_files)} notebooks")
        return success_count, len(notebook_files)
    
    def get_notebook(self, notebook_id):
        """Retrieve a notebook by its ID."""
        return self.notebooks.find_one({'notebook_id': notebook_id}, {'_id': 0})
    
    def export_notebook(self, notebook_id, output_path):
        """Export a notebook to a .ipynb file."""
        notebook = self.get_notebook(notebook_id)
        if notebook and 'content' in notebook:
            try:
                with open(output_path, 'w', encoding='utf-8') as f:
                    json.dump(notebook['content'], f)
                print(f"Notebook exported to {output_path}")
                return True
            except Exception as e:
                print(f"Error exporting notebook: {e}")
                return False
        else:
            print(f"Notebook {notebook_id} not found or has no content")
            return False
    
    def list_notebooks(self):
        """List all stored notebooks.
        
        Returns:
            list: List of notebook IDs
        """
        return [doc['notebook_id'] for doc in self.notebooks.find({}, {'notebook_id': 1, '_id': 0})]
    
    def search_notebooks(self, keyword):
        """Search for notebooks by ID containing the keyword.
        
        Args:
            keyword (str): The keyword to search for in notebook IDs
            
        Returns:
            list: List of matching notebook IDs
        """
        regex_pattern = {'$regex': keyword, '$options': 'i'}  # Case-insensitive search
        return [doc['notebook_id'] for doc in self.notebooks.find(
            {'notebook_id': regex_pattern}, 
            {'_id': 0, 'notebook_id': 1}
        )]


Stored notebooks (0):


In [None]:
collection_name = 'titanic'
db = NotebookDB(collection_name=collection_name)

notebooks_path = './data/test/notebooks/titanic/'
count = db.import_folder(notebooks_path)
if count == 0:
    print("No notebooks imported")
print(f"Imported {count} notebooks")
db.list_notebooks()

Imported: titanic-cat
Imported: titanic-tutorial
Imported: a-statistical-analysis-ml-workflow-of-titanic
Imported: introduction-to-ensembling-stacking-in-python
Imported: titanic-competition-w-tensorflow-decision-forests
Imported: titanic-xgboost
Imported: a-data-science-framework-to-achieve-99-accuracy
Imported: preliminary-exploration
Imported: exercise-arithmetic-and-variables
Imported: a-guide-to-handling-missing-values-in-python
Imported: titanic-survival-predictions-beginner
Imported: data-analysis-and-feature-extraction-with-python
Imported: titanic-top-4-with-ensemble-modeling
Imported: knowledge-graph-nlp-tutorial-bert-spacy-nltk
Imported: titanic-logistic-regression-with-python
Imported: outlier-the-silent-killer
Imported: why-you-should-use-a-framework
Imported: upura-kaggle-tutorial-01-first-submission
Imported: titanic-linear-logistic-regression-implementation
Imported: titanic-competition-how-top-lb-got-their-score
Imported: titanic-advanced-feature-engineering-tutorial
I

In [42]:
## load a joson file
import json
file_path = './data/test/notebooks/titanic/metadata/all_notebooks_metadata.json'
with open('./data/test/notebooks/titanic/metadata/all_notebooks_metadata.json', 'r', encoding='utf-8') as f:
    meta_data = json.load(f)

In [None]:
import json
import pandas as pd
import numpy as np

def select_notebooks_by_quality(metadata_file, num_notebooks_per_test=5):
    """
    Select notebooks for 5 tests with progressively increasing quality.
    
    Parameters:
    metadata_file (str or dict): Path to the JSON file containing notebook metadata or metadata dict
    num_notebooks_per_test (int): Number of notebooks to select for each test
    
    Returns:
    list: List of lists, where each inner list contains notebook IDs for a test
    """
    # Load metadata if it's a file path
    if isinstance(metadata_file, str):
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)
    else:
        metadata = metadata_file
    
    # Process all entries to handle both URL and non-URL keys
    processed_metadata = {}
    
    # First, process non-URL entries
    for key, value in metadata.items():
        if not key.startswith('https://'):
            processed_metadata[key] = value
    
    # Then add URL entries that don't already exist
    for key, value in metadata.items():
        if key.startswith('https://') and 'notebook_name' in value:
            notebook_name = value['notebook_name']
            if notebook_name not in processed_metadata:
                # Copy the metadata from the URL entry
                processed_metadata[notebook_name] = value
    
    # Convert to DataFrame for easier manipulation
    df = pd.DataFrame.from_dict(processed_metadata, orient='index')
    
    # Replace NaN values with 0 for numeric columns
    df['score'] = pd.to_numeric(df['score'], errors='coerce').fillna(0)
    df['votes'] = pd.to_numeric(df['votes'], errors='coerce').fillna(0)
    df['comments'] = pd.to_numeric(df['comments'], errors='coerce').fillna(0)
    
    # Calculate max values for normalization
    max_score = df['score'].max()
    max_votes = df['votes'].max()
    max_comments = df['comments'].max()
    
    # Calculate the quality metric
    # Weights: score (0.2), votes (0.5), comments (0.3)
    df['normalized_score'] = df['score'] / max_score if max_score > 0 else 0
    df['normalized_votes'] = df['votes'] / max_votes if max_votes > 0 else 0
    df['normalized_comments'] = df['comments'] / max_comments if max_comments > 0 else 0
    
    df['quality'] = (0.2 * df['normalized_score'] + 
                    0.5 * df['normalized_votes'] + 
                    0.3 * df['normalized_comments'])
    
    # Sort by quality (low to high)
    df = df.sort_values('quality')
    
    # Create 5 test groups with increasing quality
    total_notebooks = len(df)
    notebooks_sorted = df.index.tolist()
    
    # Calculate how many notebooks we need in total
    total_needed = 5 * num_notebooks_per_test
    
    # Make sure we don't request more notebooks than available
    if total_needed > total_notebooks:
        raise ValueError(f"Not enough notebooks. Requested {total_needed} notebooks for 5 tests, but only {total_notebooks} are available.")
    
    # Create equally spaced indices to select notebooks from the sorted list
    # This ensures we get full range of quality from lowest to highest
    step_size = (total_notebooks - 1) / (total_needed - 1) if total_needed > 1 else 0
    indices = [int(i * step_size) for i in range(total_needed)]
    
    # Select notebooks using the calculated indices
    selected_notebooks = [notebooks_sorted[i] for i in indices]
    
    # Divide the selected notebooks into 5 groups
    test_groups = []
    for i in range(5):
        start_idx = i * num_notebooks_per_test
        end_idx = start_idx + num_notebooks_per_test
        test_groups.append(selected_notebooks[start_idx:end_idx])
    
    return test_groups

def get_notebooks_with_quality(metadata_file):
    """
    Get all notebooks with their calculated quality scores.
    Useful for verification and inspection of the quality distribution.
    
    Parameters:
    metadata_file (str or dict): Path to the JSON file containing notebook metadata or metadata dict
    
    Returns:
    DataFrame: DataFrame with notebook IDs and quality scores
    """
    # Load metadata if it's a file path
    if isinstance(metadata_file, str):
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)
    else:
        metadata = metadata_file
    
    # Process all entries to handle both URL and non-URL keys
    processed_metadata = {}
    
    # First, process non-URL entries
    for key, value in metadata.items():
        if not key.startswith('https://'):
            processed_metadata[key] = value
    
    # Then add URL entries that don't already exist
    for key, value in metadata.items():
        if key.startswith('https://') and 'notebook_name' in value:
            notebook_name = value['notebook_name']
            if notebook_name not in processed_metadata:
                # Copy the metadata from the URL entry
                processed_metadata[notebook_name] = value
    
    # Convert to DataFrame
    df = pd.DataFrame.from_dict(processed_metadata, orient='index')
    
    # Replace NaN values with 0 for numeric columns
    df['score'] = pd.to_numeric(df['score'], errors='coerce').fillna(0)
    df['votes'] = pd.to_numeric(df['votes'], errors='coerce').fillna(0)
    df['comments'] = pd.to_numeric(df['comments'], errors='coerce').fillna(0)
    
    # Calculate max values for normalization
    max_score = df['score'].max()
    max_votes = df['votes'].max()
    max_comments = df['comments'].max()
    
    # Calculate the quality metric
    df['normalized_score'] = df['score'] / max_score if max_score > 0 else 0
    df['normalized_votes'] = df['votes'] / max_votes if max_votes > 0 else 0
    df['normalized_comments'] = df['comments'] / max_comments if max_comments > 0 else 0
    
    df['quality'] = (0.2 * df['normalized_score'] + 
                    0.5 * df['normalized_votes'] + 
                    0.3 * df['normalized_comments'])
    
    # Show number of entries processed
    print(f"Total unique notebooks processed: {len(df)}")
    
    # Sort by quality score
    return df.sort_values('quality')

# Example usage
if __name__ == "__main__":
    # For file input
    # test_groups = select_notebooks_by_quality('all_notebooks_metadata.json', num_notebooks_per_test=5)
    
    # For direct dictionary input
    with open(file_path, 'r') as f:
        metadata_dict = json.load(f)
    
    # Get quality scores for all notebooks (for inspection)
    quality_df = get_notebooks_with_quality(metadata_dict)
    print("Notebooks sorted by quality:")
    for idx, (notebook, row) in enumerate(quality_df.iterrows()):
        print(f"{idx+1}. {notebook}: {row['quality']:.4f}")
    
    print("\n" + "="*50 + "\n")
    
    # Get the 5 test groups
    test_groups = select_notebooks_by_quality(metadata_dict, num_notebooks_per_test=4)
    
    # Print the results
    for i, group in enumerate(test_groups):
        print(f"Test {i+1} (Quality Level: {i+1}/4):")
        for notebook in group:
            quality = quality_df.loc[notebook, 'quality']
            print(f"  - {notebook} (quality: {quality:.4f})")
        print()

In [67]:
test_groups
## save to json
import json
with open('test_groups_for_varing_quality_fixed_size.json', 'w') as f:
    json.dump(test_groups, f, indent=4)

In [66]:
for notebook in test_groups[4]:
    print(notebook)
    # Export the notebook to a .ipynb file
    output_path = f"exported_{notebook}.ipynb"
    db.export_notebook(notebook, output_path)

titanic-survival-predictions-beginner
Notebook exported to exported_titanic-survival-predictions-beginner.ipynb
titanic-advanced-feature-engineering-tutorial
Notebook exported to exported_titanic-advanced-feature-engineering-tutorial.ipynb
titanic-data-science-solutions
Notebook exported to exported_titanic-data-science-solutions.ipynb
titanic-tutorial
Notebook exported to exported_titanic-tutorial.ipynb


In [29]:
import json

def load_and_sort_competitions(file_path):
    try:
        # Read the JSON file
        with open(file_path, 'r') as file:
            competitions = json.load(file)
        
        # Sort competitions by teams_count in descending order
        sorted_competitions = sorted(competitions, key=lambda x: x['teams_count'], reverse=True)
        
        # Extract just the competition names in order
        sorted_competition_names = [comp['competition_name'] for comp in sorted_competitions]
        
        return sorted_competition_names
    except FileNotFoundError:
        return "Error: File not found"
    except json.JSONDecodeError as e:
        return f"Error parsing JSON: {e}"
    except Exception as e:
        return f"Error: {e}"

# Example usage - replace 'competitions.json' with your actual file path
file_path = '/home/xunubuntu/research_projects/Kaggle_RAG_dataset/data/competition_list.json'
result = load_and_sort_competitions(file_path)
print(result)
# save to json
with open('sorted_competitions.json', 'w') as f:
    json.dump(result, f, indent=4)
list_of_selected_competiton = result[:40]
## remove an element from the list
list_of_selected_competiton.remove('playground-series-s4e1')
list_of_selected_competiton.remove('aptos2019-blindness-detection')
list_of_selected_competiton.remove('commonlitreadabilityprize')
len(list_of_selected_competiton)

['titanic', 'home-data-for-ml-course', 'catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2', 'house-prices-advanced-regression-techniques', 'talkingdata-adtracking-fraud-detection', 'cassava-leaf-disease-classification', 'playground-series-s4e10', 'home-credit-credit-risk-model-stability', 'optiver-realized-volatility-prediction', 'mercedes-benz-greener-manufacturing', 'zillow-prize-1', 'jane-street-real-time-market-data-forecasting', 'data-science-bowl-2018', 'commonlitreadabilityprize', 'playground-series-s4e1', 'ashrae-energy-prediction', 'ga-customer-revenue-prediction', 'playground-series-s4e2', 'child-mind-institute-problematic-internet-use', 'godaddy-microbusiness-density-forecasting', 'petfinder-pawpularity-score', 'otto-group-product-classification-challenge', 'data-science-bowl-2019', 'riiid-test-answer-prediction', 'playground-series-s5e2', 'statoil-iceberg-classifier-challenge', 'equity-post-HCT-survival-predictions', 'siim-isic-melanoma-classification

37

In [28]:
import os
for ticker in list_of_selected_competiton:
    path = os.path.join(os.getcwd(), 'data/notebooks', ticker, 'metadata/all_notebooks_metadata.json')
    print(ticker)
    if not os.path.exists(path):
        print(f"Path does not exist: {path}")
        continue
        # Get quality scores for all notebooks (for inspection)
    quality_df = get_notebooks_with_quality(path)
    print("Notebooks sorted by quality:")
    for idx, (notebook, row) in enumerate(quality_df.iterrows()):
        print(f"{idx+1}. {notebook}: {row['quality']:.4f}")
    
    print("\n" + "="*50 + "\n")

    # Get the 5 test groups
    test_groups = select_notebooks_by_quality(path, num_notebooks_per_test=5)
    
    # Print the results
    for i, group in enumerate(test_groups):
        print(f"Test {i+1} (Quality Level: {i+1}/4):")
        for notebook in group:
            quality = quality_df.loc[notebook, 'quality']
            print(f"  - {notebook} (quality: {quality:.4f})")
        print()

titanic
Total unique notebooks processed: 52
Notebooks sorted by quality:
1. huangzhen1997_gender-and-age-based-model: 0.0000
2. lostking91_titanic: 0.0000
3. vodaza36_titanic-competition-my-first-steps: 0.0000
4. am00634_final-anas: 0.0000
5. sedacavdaroglu_predicting-survival-in-titanic: 0.0000
6. window98_my-first-script: 0.0000
7. bolids_test1: 0.0000
8. srikanths31_predict-survival: 0.0000
9. krishnaparvatha_ranregone: 0.0000
10. atmaks_titanic-predictions: 0.0000
11. qvajou_titanic-disaster-first-test: 0.0000
12. lv0123456_titanic-prediction: 0.0000
13. tgandhi_notebook252e72c6e8: 0.0000
14. kylemede_sklearn-randomforest-enhanced: 0.0000
15. kylemede_sklearn-logisticregression-enhanced: 0.0000
16. am00634_final-2: 0.0000
17. gauss102_titanic-test: 0.0000
18. geow812_titanic-with-logistic-regression: 0.0000
19. qianbi_python-logistic: 0.0000
20. pyobro_titanic-survival-classification: 0.0000
21. thibaultbezpalko_titanic-comparison: 0.0000
22. kpisi14_kernel80d261ee0e: 0.0000
23. g