# Extract Titanic Disaster Data from Kaggle


In [1]:
# to use credentials in local .env we need to use this package
!pip install python-dotenv

[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
from dotenv import load_dotenv, find_dotenv

In [3]:
# locate our .env file by walking the directories
dotenv_path = find_dotenv()
# expose every entry of that file as an environment variable
load_dotenv(dotenv_path=dotenv_path)

True

In [4]:
# read the Kaggle credentials from the environment
import os
KAGGLE_USERNAME = os.environ.get("KAGGLE_USERNAME")
KAGGLE_PASSWORD = os.environ.get("KAGGLE_PASSWORD")
print(KAGGLE_USERNAME)
# never echo the secret itself: cell output is saved inside the notebook file and
# travels with it into version control - only show whether a password was found
print('********' if KAGGLE_PASSWORD else None)

graemer1975
********


In [5]:
# That's how we read credentials from a local .env file during development, without accidentally sharing them on GitHub


In [6]:
import requests
from requests import session
import os
from dotenv import load_dotenv, find_dotenv


### Downloading the test and training data from Kaggle

In [7]:
# Log in to Kaggle, then download the training file.
# form fields of the login request; the credentials come from the environment
payload = {
    'action':'login',
    'username': os.environ.get("KAGGLE_USERNAME"),
    'password': os.environ.get("KAGGLE_PASSWORD")
}


# url for the training file - we could scrape this, but let's keep it simple right now
url = 'https://www.kaggle.com/c/titanic/download/train.csv'
loginUrl = "https://www.kaggle.com/account/login"

## We have to log in and accept all the checkboxes manually on the website first

# set up the http session to connect and get the data
with session() as c:
    # NB the login only works if we send the anti-forgery token along with it,
    # so fetch the login page and cut the token out of its text
    login_page = c.get(loginUrl)
    login_page.raise_for_status()
    response = login_page.text
    AFToken = response[response.index('antiForgeryToken')+19:response.index('isAnonymous: ')-12]
    payload['__RequestVerificationToken']=AFToken
    # post the credentials; an HTTP error status raises instead of letting us
    # carry on without being logged in
    c.post(loginUrl + '?IsModal=true&returnUrl=/', data=payload).raise_for_status()
    # get request for the data; again fail loudly on an HTTP error status so an
    # error page is never mistaken for the csv content
    response = c.get(url)
    response.raise_for_status()

In [8]:
# sweet! we've got the train data - this works

### download and store the data locally

In [9]:
from requests import session
# form fields sent with the Kaggle login request; credentials come from the environment
payload = dict(
    action='login',
    username=os.environ.get("KAGGLE_USERNAME"),
    password=os.environ.get("KAGGLE_PASSWORD"),
)

# address of the Kaggle login page
loginUrl = "https://www.kaggle.com/account/login"

def extract_data(url, file_path, chunk_size=1024):
    '''
    Log in to Kaggle and download one file to disk.

    url        : address of the file to download
    file_path  : local path the downloaded bytes are written to
    chunk_size : number of bytes requested per streamed block (default 1024)

    Raises requests.HTTPError when the login page, the login request or the
    download answers with an HTTP error status, and ValueError when the
    anti-forgery token markers cannot be found in the login page.
    '''
    # work on a copy so the module-level payload is not modified by a call
    login_form = dict(payload)
    # prefer the credentials currently in the environment, falling back to
    # whatever payload already holds
    for field, env_name in (('username', "KAGGLE_USERNAME"), ('password', "KAGGLE_PASSWORD")):
        login_form[field] = os.environ.get(env_name, login_form.get(field))

    # one http session for the login and the download
    with session() as c:
        # as before, the login needs the anti-forgery token from the login page
        login_page = c.get(loginUrl)
        login_page.raise_for_status()
        page = login_page.text
        login_form['__RequestVerificationToken'] = page[page.index('antiForgeryToken')+19:page.index('isAnonymous: ')-12]
        c.post(loginUrl + '?IsModal=true&returnUrl=/', data=login_form).raise_for_status()

        response = c.get(url, stream=True)
        try:
            # check the status BEFORE opening the file, so a failed download
            # neither truncates an existing file nor stores an error page
            response.raise_for_status()
            with open(file_path, 'wb') as handle:  #python3 needs wb, python2 only w
                for block in response.iter_content(chunk_size):
                    handle.write(block)
        finally:
            # a streamed response holds its connection until it is closed
            response.close()

In [10]:
# where the files are stored locally: <parent dir>/data/raw
raw_data_path = os.path.join(os.path.pardir, 'data', 'raw')
train_data_path = os.path.join(raw_data_path, 'train.csv')
test_data_path = os.path.join(raw_data_path, 'test.csv')

# where the files come from
train_url = 'https://www.kaggle.com/c/titanic/download/train.csv'
test_url = 'https://www.kaggle.com/c/titanic/download/test.csv'

# fetch the training set first, then the test set
extract_data(train_url, train_data_path)
extract_data(test_url, test_data_path)




In [11]:
# is the data there? list the raw data folder to check that both csv files were written
!ls -l ../data/raw

total 176
-rw-r--r--  1 graemerenfrew  staff  28629 25 Jul 12:30 test.csv
-rw-r--r--  1 graemerenfrew  staff  61194 25 Jul 12:30 train.csv


In [None]:
# Excellent - this pulls the data down from Kaggle and stores it locally

# Build a script to do this
### We want to be able to fetch this data from the command line, without having to open a Jupyter notebook

In [14]:
# location of the script to generate: <parent dir>/src/data/get_raw_data.py
get_raw_data_script_file = os.path.join(os.pardir, 'src', 'data', 'get_raw_data.py')

In [15]:
%%writefile $get_raw_data_script_file
# -*- coding: utf-8 -*-
import requests
from requests import session
import os
from dotenv import load_dotenv, find_dotenv
import logging #so we can show users what's happening

# Form fields sent with the Kaggle login request.
# NOTE: os.environ is read here, when the module is first executed - i.e. before
# the __main__ block further down calls load_dotenv().
payload = dict(
    action='login',
    username=os.environ.get("KAGGLE_USERNAME"),
    password=os.environ.get("KAGGLE_PASSWORD"),
)

# address of the Kaggle login page
loginUrl = "https://www.kaggle.com/account/login"

def extract_data(url, file_path, chunk_size=1024):
    '''
    Log in to Kaggle and download one file to disk.

    url        : address of the file to download
    file_path  : local path the downloaded bytes are written to
    chunk_size : number of bytes requested per streamed block (default 1024)

    Raises requests.HTTPError when the login page, the login request or the
    download answers with an HTTP error status, and ValueError when the
    anti-forgery token markers cannot be found in the login page.
    '''
    # work on a copy so the module-level payload is not modified by a call
    login_form = dict(payload)
    # payload is filled in when the module is first executed, which happens
    # before load_dotenv() runs under __main__; read the credentials again at
    # call time and only fall back to the values payload already holds
    for field, env_name in (('username', "KAGGLE_USERNAME"), ('password', "KAGGLE_PASSWORD")):
        login_form[field] = os.environ.get(env_name, login_form.get(field))

    # one http session for the login and the download
    with session() as c:
        # the login needs the anti-forgery token embedded in the login page
        login_page = c.get(loginUrl)
        login_page.raise_for_status()
        page = login_page.text
        login_form['__RequestVerificationToken'] = page[page.index('antiForgeryToken')+19:page.index('isAnonymous: ')-12]
        c.post(loginUrl + '?IsModal=true&returnUrl=/', data=login_form).raise_for_status()

        response = c.get(url, stream=True)
        try:
            # check the status BEFORE opening the file, so a failed download
            # neither truncates an existing file nor stores an error page
            response.raise_for_status()
            with open(file_path, 'wb') as handle:  #python3 needs wb, python2 only w
                for block in response.iter_content(chunk_size):
                    handle.write(block)
        finally:
            # a streamed response holds its connection until it is closed
            response.close()

def main(project_dir):
    '''
    Download the raw Titanic training and test sets from Kaggle.

    project_dir : root folder of the project; the files are written to
                  <project_dir>/data/raw/train.csv and .../test.csv
    '''
    #get a logger
    logger = logging.getLogger(__name__)
    logger.info('getting the raw data')

    # data source URLS
    train_url = 'https://www.kaggle.com/c/titanic/download/train.csv'
    test_url = 'https://www.kaggle.com/c/titanic/download/test.csv'

    # file paths for storing the data - anchored on project_dir rather than on
    # the current working directory, so the result does not depend on where
    # the script is started from
    raw_data_path = os.path.join(project_dir,'data','raw')
    train_data_path = os.path.join(raw_data_path,'train.csv')
    test_data_path = os.path.join(raw_data_path,'test.csv')

    #extract the data
    extract_data(train_url, train_data_path)
    extract_data(test_url, test_data_path)
    logger.info('downloaded raw training and test data')
    
if __name__ == '__main__':
    # configure logging up front so the messages emitted by main() are shown
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    )

    # find the .env automatically by walking the directories, then load its variables
    load_dotenv(find_dotenv())

    # project root: two directory levels above the folder that holds this script
    project_dir = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)

    # hand over to the main method
    main(project_dir)


Writing ../src/data/get_raw_data.py


In [17]:
# now call the generated script via a shell command - remember to use python3, not just python
!python3 $get_raw_data_script_file

2018-07-25 12:50:32,157 - __main__ - INFO - getting the raw data
2018-07-25 12:50:38,054 - __main__ - INFO - downloaded raw training and test data


In [18]:
# WHOOOOP