A comprehensive human-annotated dataset for SQL AI tasks across diverse domains and complexity levels.
yum install python-pip gcc gcc-c++ python-virtualenv cyrus-sasl-devel
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
sh Miniconda3-latest-Linux-x86_64.sh
conda create --name spiderman-env python=3.12.2
conda activate spiderman-env
pip install -r requirements.txt
brew install mysql pkg-config
pip install mysqlclient
python scripts/insert_dataset.py 'mysql://<username>:<password>@<host>:3306'
python scripts/scan_dataset.py
# Download Spider 1.0 zip into ./source
python ./scripts/download_source.py
# Rebuild the dataset from source, overwriting the current SCHEMA and DATA.
python ./scripts/rebuild_dataset.py
# QUERIES will not be rebuilt from source, as they have been heavily modified to work with non-SQLite databases.
If you find this to be useful, please consider citing:
@inproceedings{SpiderMan,
title = {SpiderMan: A Comprehensive Human-Annotated Dataset for SQL AI Tasks Across Diverse Domains and Complexity Levels},
author = {Sreenath Somarajapuram},
year = 2024
}
SpiderMan is a refined version of the Spider dataset.
@inproceedings{Yu&al.18c,
title = {Spider: A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task},
author = {Tao Yu and Rui Zhang and Kai Yang and Michihiro Yasunaga and Dongxu Wang and Zifan Li and James Ma and Irene Li and Qingning Yao and Shanelle Roman and Zilin Zhang and Dragomir Radev},
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
year = 2018
}