# Alibaba Dataset Quick Peek

This notebook samples small slices from the Alibaba 2018 and 2020 archives without fully extracting them.


In [1]:
# Put the repository root (the directory that holds the `src` package) at the
# front of sys.path so that `from src.data import ...` in later cells resolves
# to this checkout.
import sys
from pathlib import Path

# parents[1] is the grandparent of the working directory, i.e. the notebook is
# assumed to run from a folder two levels below the repository root.
repo_root = Path.cwd().parents[1]
repo_root_str = str(repo_root)

# Remove earlier copies before inserting so that re-running this cell leaves
# exactly one entry, and leaves it in first position.
sys.path[:] = [entry for entry in sys.path if entry != repo_root_str]
sys.path.insert(0, repo_root_str)

if not (repo_root / 'src').is_dir():
    # A wrong working directory is easier to diagnose here than from a bare
    # ImportError raised by a later cell.
    print('WARNING: no src/ directory under', repo_root,
          '- is the notebook running from its own folder?')
print('Using repo root:', repo_root)



Using repo root: /Users/andrewespira/Downloads/st_peters/research-fall2025


In [None]:
from pathlib import Path
import pandas as pd

from src.data import (
    find_alibaba_archives,
    sample_alibaba_2018_archive,
    sample_alibaba_2020_archive,
)

# Discover the raw archives. The path is relative to the notebook's working
# directory. Judging by how the result is used below, find_alibaba_archives
# returns a mapping of version -> {table name -> archive path}.
raw_base = Path('../../data/raw')
archives = find_alibaba_archives(raw_base)

print('Found archives:')
for version in archives:
    members = archives[version]
    print(f'  {version}: {len(members)} files')
    # Preview only the first three archive names of each version.
    for position, key in enumerate(members):
        if position == 3:
            break
        print(f'    - {key}: {members[key].name}')

# Echo the full mapping as the cell's result.
archives


Found archives:
  2018: 6 files
    - machine_meta: machine_meta.tar.gz
    - machine_usage: machine_usage.tar.gz
    - container_meta: container_meta.tar.gz
  2020: 7 files
    - pai_job_table: pai_job_table.tar.gz
    - pai_task_table: pai_task_table.tar.gz
    - pai_instance_table: pai_instance_table.tar.gz


{'2018': {'machine_meta': PosixPath('../../data/raw/alibaba_2018/machine_meta.tar.gz'),
  'machine_usage': PosixPath('../../data/raw/alibaba_2018/machine_usage.tar.gz'),
  'container_meta': PosixPath('../../data/raw/alibaba_2018/container_meta.tar.gz'),
  'container_usage': PosixPath('../../data/raw/alibaba_2018/container_usage.tar.gz'),
  'batch_instance': PosixPath('../../data/raw/alibaba_2018/batch_instance.tar.gz'),
  'batch_task': PosixPath('../../data/raw/alibaba_2018/batch_task.tar.gz')},
 '2020': {'pai_job_table': PosixPath('../../data/raw/alibaba_2020/pai_job_table.tar.gz'),
  'pai_task_table': PosixPath('../../data/raw/alibaba_2020/pai_task_table.tar.gz'),
  'pai_instance_table': PosixPath('../../data/raw/alibaba_2020/pai_instance_table.tar.gz'),
  'pai_sensor_table': PosixPath('../../data/raw/alibaba_2020/pai_sensor_table.tar.gz'),
  'pai_group_tag_table': PosixPath('../../data/raw/alibaba_2020/pai_group_tag_table.tar.gz'),
  'pai_machine_spec': PosixPath('../../data/raw/ali

In [None]:
# Sample a few rows from 2018 archives (if available)
samples_2018 = {}

# Peek limits: how many archives to open and how many rows to request from each.
max_tables = 3
sample_rows = 500

# NOTE(review): in the saved output the column labels look like data values
# (e.g. 'm_1', '0', '219') and pandas' 'Unnamed: N' placeholders appear, which
# suggests the sampler parses the first record as a header row - confirm in
# src.data before relying on these column names or row counts.
if archives.get('2018'):
    for name, path in list(archives['2018'].items())[:max_tables]:
        print(f"\nReading 2018 sample: {name} -> {path.name}")
        try:
            df = sample_alibaba_2018_archive(path, nrows=sample_rows)
            print(df.shape, 'columns:', len(df.columns))
            display(df.head(3))
            samples_2018[name] = df
        except Exception as e:
            # Best-effort peek: carry on with the next archive, but report the
            # exception type too, since str(e) alone can be uninformative
            # (a KeyError, for instance, prints only the missing key).
            print(f'Failed ({type(e).__name__}):', e)
else:
    print('No 2018 archives found.')



Reading 2018 sample: machine_meta -> machine_meta.tar.gz
(500, 7) columns: 7


Unnamed: 0,m_1,0,219,17,96,100,USING
0,m_1,148984,219,17,96,100,USING
1,m_1,535156,219,17,96,100,USING
2,m_1,552384,219,17,96,100,USING



Reading 2018 sample: machine_usage -> machine_usage.tar.gz
(500, 9) columns: 9


Unnamed: 0,m_1932,386640,41,92,Unnamed: 4,Unnamed: 5,43.04,33.08,5
0,m_1932,386670,43,92,,,43.04,33.08,5
1,m_1932,386690,44,92,,,43.05,33.08,5
2,m_1932,386800,46,92,,,43.05,33.08,3



Reading 2018 sample: container_meta -> container_meta.tar.gz
(500, 8) columns: 8


Unnamed: 0,c_1,m_2556,0,app_5052,started,400,400.1,1.56
0,c_1,m_2556,287942,app_5052,started,400,400,1.56
1,c_1,m_2556,338909,app_5052,started,400,400,1.56
2,c_2,m_962,0,app_8125,started,800,800,3.13


In [None]:
# Sample a few rows from 2020 archives (if available)
samples_2020 = {}

# Peek limits: how many archives to open and how many rows to request from each.
max_tables = 3
sample_rows = 500

# NOTE(review): in the saved output the column labels look like data values
# (e.g. 'tensorflow', 'Running', '1053513.0') and pandas' 'Unnamed: N'
# placeholders appear, which suggests the sampler parses the first record as a
# header row - confirm in src.data before relying on these column names or
# row counts.
if archives.get('2020'):
    for name, path in list(archives['2020'].items())[:max_tables]:
        print(f"\nReading 2020 sample: {name} -> {path.name}")
        try:
            df = sample_alibaba_2020_archive(path, nrows=sample_rows)
            print(df.shape, 'columns:', len(df.columns))
            display(df.head(3))
            samples_2020[name] = df
        except Exception as e:
            # Best-effort peek: carry on with the next archive, but report the
            # exception type too, since str(e) alone can be uninformative
            # (a KeyError, for instance, prints only the missing key).
            print(f'Failed ({type(e).__name__}):', e)
else:
    print('No 2020 archives found.')



Reading 2020 sample: pai_job_table -> pai_job_table.tar.gz
(500, 6) columns: 6


Unnamed: 0,87a79c6373f7439d82f32a1c,38e2d7187762a9241cc1cd5732dbe52a5d8b37ed7dafe1c1e1bb9f50bfa2,74238accb90b,Running,1053513.0,Unnamed: 5
0,9605ac7cc4c55a193fbe956b,e1cbdf28400847d65d00da4f0522ce7a43275fe9cb5d2a...,61d6b6dd5b15,Running,1097614.0,
1,a2d8872d080eb634a42ea9a6,27ddabc2f7490279c6d5bd95c8a75e5c96d841c6030659...,61d6b6dd5b15,Running,1103820.0,
2,e8baa72bfcd1b723cafab26a,70e4f598c8cdf40f24e0e7d3aba9a9dacde5342e1f2ffd...,61d6b6dd5b15,Terminated,1104396.0,1140852.0



Reading 2020 sample: pai_task_table -> pai_task_table.tar.gz
(500, 10) columns: 10


Unnamed: 0,c936346f45eccd34bf748541,tensorflow,1.0,Terminated,2693235.0,2695847.0,600.0,29.296875,50.0,MISC
0,455c3dec270f4777ad67721c,tensorflow,1.0,Terminated,3399583.0,3399732.0,600.0,29.296875,100.0,MISC
1,ba64aa2f0feff18428923e92,tensorflow,1.0,Terminated,2152271.0,2158213.0,600.0,29.296875,50.0,MISC
2,704783be2a4b7f88b8d2e4ee,worker,100.0,Failed,2172980.0,,600.0,9.765625,10.0,MISC



Reading 2020 sample: pai_instance_table -> pai_instance_table.tar.gz
(500, 9) columns: 9


Unnamed: 0,54670e2998350a9f0e4868d5,worker,c47ee192deea8c5aa87d8d7c2e02120cd03f75304a69dde378ac4abad45c,05b1590648df3f4c6deea2496bbbefd4d3ae93347669f764d466128a150f,68889727c2257f702108d185e5eed342a39807a2893d4bac00987159d00c,Terminated,3272995.0,3273065.0,165261853e188ca69c4fbcdf
0,54670e2998350a9f0e4868d5,worker,92759a73039692b2d3e0929cadf55f5b07b40d6582c36f...,cbc1bc591c53301bceafa44dc626288fd92d2fc5dca438...,68889727c2257f702108d185e5eed342a39807a2893d4b...,Terminated,3272995.0,3273056.0,31dbf829549b10917e7193a6
1,54670e2998350a9f0e4868d5,worker,b3e42abde4c35da06ff4746802b82678b1bf56b28efaa1...,09d41bf627188adc004b2db7c0a47578325952cc3340af...,68889727c2257f702108d185e5eed342a39807a2893d4b...,Terminated,3272995.0,3273067.0,31dbf829549b10917e7193a6
2,54670e2998350a9f0e4868d5,worker,d0f2e1f76ff463bcd08302370cc2f9bf0ec1333a3b25e3...,1a631e8be3fd787c2891c15b9e9bac6df4f177efc9ec7b...,68889727c2257f702108d185e5eed342a39807a2893d4b...,Terminated,3272995.0,3273058.0,3765d45e2eca88d1d1da7cef


In [None]:
# Basic sanity stats if any samples loaded
def quick_stats(df):
    """Summarise one sampled frame.

    Returns a dict with the row count, the column count, and the mean over
    columns of each column's fraction of missing values.
    """
    return {
        'rows': len(df),
        'columns': len(df.columns),
        # float() turns the NumPy scalar into a plain Python float so the
        # displayed dict stays readable.
        'null_frac_mean': float(df.isna().mean().mean()),
    }


if samples_2018 or samples_2020:
    summary = {
        '2018': {k: quick_stats(v) for k, v in samples_2018.items()},
        '2020': {k: quick_stats(v) for k, v in samples_2020.items()},
    }
    # IPython only echoes an expression that is the last top-level statement
    # of a cell; a bare `summary` nested in this if-block would show nothing,
    # so display it explicitly.
    display(summary)
else:
    print('No samples loaded. Verify archives exist under data/raw/alibaba_* and rerun.')
