In [1]:
# Step 1: Install the library (Only needed if running in Colab)
!pip install datasets scikit-learn pandas -q

import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split, KFold

# ==========================================
# 1. DOWNLOAD & LOAD DATASET
# ==========================================
# 'banking77' is a dataset of banking customer queries, each paired with an
# integer label. Only the "train" split is requested here, so the DataFrame
# below holds the training portion, not the full 13,000+ row dataset.
dataset = load_dataset("banking77", split="train")

# Convert to a Pandas DataFrame (like an Excel sheet) for easy handling.
# to_pandas() lets the library do the conversion itself instead of having
# pandas iterate over the dataset one row at a time.
df = dataset.to_pandas()

print(f"✅ Data Loaded! Total rows: {len(df)}")
# .iloc[0] is positional, so this shows the first row regardless of the index labels.
print(f"Sample Query: {df['text'].iloc[0]} (Label: {df['label'].iloc[0]})")

# ==========================================
# 2. 80-20 SPLIT (Holdout Method)
# ==========================================
# We hide 20% of the data (test_df) to grade the AI later.
# We use 80% (train_df) to teach the AI.
# stratify keeps the proportion of each label the same in both parts, so no
# label ends up over- or under-represented in the held-out test set.
# random_state pins the shuffle so the split is reproducible across runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

print("\n--- 📊 80/20 Split Results ---")
print(f"Training Data (80%): {len(train_df)} rows")
print(f"Test Data     (20%): {len(test_df)} rows")

# ==========================================
# 3. 3-FOLD CROSS-VALIDATION
# ==========================================
# We take the TRAINING data only (the held-out test set is never touched here)
# and split it 3 different ways to double-check quality.
# shuffle + random_state give a shuffled but reproducible fold assignment.
kf = KFold(n_splits=3, shuffle=True, random_state=42)

print("\n--- 🔄 3-Fold Cross-Validation ---")
# kf.split yields positional row indices, hence .iloc below.
# enumerate(start=1) numbers the folds 1..3 without a hand-maintained counter.
for fold, (train_index, val_index) in enumerate(kf.split(train_df), start=1):
    # Select the rows for this specific fold
    fold_train = train_df.iloc[train_index]
    fold_val = train_df.iloc[val_index]

    print(f"Fold {fold}: AI learns from {len(fold_train)} rows, checks itself on {len(fold_val)} rows")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/298k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/93.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10003 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3080 [00:00<?, ? examples/s]

✅ Data Loaded! Total rows: 10003
Sample Query: I am still waiting on my card? (Label: 11)

--- 📊 80/20 Split Results ---
Training Data (80%): 8002 rows
Test Data     (20%): 2001 rows

--- 🔄 3-Fold Cross-Validation ---
Fold 1: AI learns from 5334 rows, checks itself on 2668 rows
Fold 2: AI learns from 5335 rows, checks itself on 2667 rows
Fold 3: AI learns from 5335 rows, checks itself on 2667 rows
