# Extracting Titanic Disaster Data from Kaggle

### Install Kaggle API

In [6]:
!pip install --user kaggle



### Download Competition Datasets from Kaggle

In [18]:
import os

# Raw competition files are stored in the data/raw folder that sits one level
# above the directory this notebook is run from.
raw_data_path = os.path.join(os.pardir, 'data', 'raw')
print(raw_data_path)

..\data\raw


Make sure the **kaggle.json** API token file is placed in the `%userprofile%\.kaggle` directory (`~/.kaggle` on Linux/macOS) before running the download commands below.

In [19]:
!kaggle competitions download titanic -f train.csv -p $raw_data_path --force
!kaggle competitions download titanic -f test.csv -p $raw_data_path --force

Downloading train.csv to ..\data\raw




  0%|          | 0.00/59.8k [00:00<?, ?B/s]
100%|██████████| 59.8k/59.8k [00:00<00:00, 2.19MB/s]


Downloading test.csv to ..\data\raw




  0%|          | 0.00/28.0k [00:00<?, ?B/s]
100%|██████████| 28.0k/28.0k [00:00<00:00, 1.91MB/s]


### Building Rerunnable Data Extraction Script

In [8]:
# Location of the generated extraction script: ../src/data/get_raw_data.py,
# relative to the directory this notebook is run from.
get_raw_data_script_file = os.path.join(
    os.pardir,
    'src',
    'data',
    'get_raw_data.py',
)

In [26]:
%%writefile $get_raw_data_script_file
from subprocess import call
import os
import logging

def main():
    """Download the Titanic competition train and test files via the Kaggle CLI.

    Each file is fetched by running ``kaggle competitions download`` (with
    ``--force``) as a subprocess, targeting ``../data/raw``.  That path is
    relative to the current working directory, so the script has to be
    started from a directory that is a sibling of ``data``.

    Both downloads are always attempted, even if the first one fails.

    Returns:
        int: 0 when both downloads succeed, otherwise the exit code of the
        first download that failed.
    """
    # get logger
    logger = logging.getLogger(__name__)
    logger.info('getting raw data')

    # set path of the raw data
    raw_data_path = os.path.join(os.path.pardir, 'data', 'raw')

    exit_code = 0
    # use Kaggle API to download the raw data, one file at a time
    for label, file_name in (('train', 'train.csv'), ('test', 'test.csv')):
        logger.info('downloading %s data', label)
        download_args = ['kaggle', 'competitions', 'download', 'titanic',
                         '-f', file_name, '-p', raw_data_path, '--force']
        # call() does not raise when the command fails; it only hands back
        # the exit status, so that status must be inspected explicitly.
        return_code = call(download_args)
        if return_code != 0:
            logger.error('downloading %s data failed (exit code %s)', label, return_code)
            if exit_code == 0:
                exit_code = return_code
            continue
        logger.info('downloading %s data completed', label)

    return exit_code


if __name__ == '__main__':
    # set up logger
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # run the extraction and surface any download failure as the exit status
    raise SystemExit(main())

Overwriting ..\src\data\get_raw_data.py


In [27]:
!python $get_raw_data_script_file

Downloading train.csv to ..\data\raw

Downloading test.csv to ..\data\raw



2018-11-02 21:36:56,226 - __main__ - INFO - getting raw data
2018-11-02 21:36:56,226 - __main__ - INFO - downloading train data

  0%|          | 0.00/59.8k [00:00<?, ?B/s]
100%|██████████| 59.8k/59.8k [00:00<00:00, 2.36MB/s]
2018-11-02 21:36:57,805 - __main__ - INFO - downloading train data completed
2018-11-02 21:36:57,805 - __main__ - INFO - downloading test data

  0%|          | 0.00/28.0k [00:00<?, ?B/s]
100%|██████████| 28.0k/28.0k [00:00<00:00, 1.94MB/s]
2018-11-02 21:36:58,690 - __main__ - INFO - downloading test data completed
