In [1]:
import os
os.chdir("/home/data/")

In [2]:
import subprocess as sbp
import sqlite3

import pandas as pd
import numpy as np

In [3]:
run_on_bash = lambda i: sbp.check_output("{}".format(i), shell=True).decode('utf-8')

## Download and Unzip Data

In [7]:
os.chdir('2018-02-26-archive-complete/')

## Check files, initialize DB

In [10]:
FILES = filter(lambda i: 'txt' in i, os.listdir(os.getcwd()))
dict_files = {}

for file in FILES:
    rows_ = int(run_on_bash("wc -l {}".format(file)).split(" ")[0])
    size_ = os.path.getsize(file)/10**6
    dict_files[file] = {
        'rows': rows_,
        'size': size_
    }
    print("{:25} is {:10.2f} MB and has {:10.0f} rows".format(file, size_, rows_))

description.txt           is      61.14 MB and has    1911368 rows
distribution.txt          is      25.64 MB and has     758526 rows
reference.txt             is     285.28 MB and has    2000302 rows
speciesprofile.txt        is       9.32 MB and has     489478 rows
taxa.txt                  is    1558.43 MB and has    3756418 rows
vernacular.txt            is      18.37 MB and has     425294 rows


In [13]:
conn = sqlite3.connect("life.db")

In [14]:
curs = conn.cursor()

## Import files into DB with pandas

In [41]:
for file in dict_files:
    """
    For each file, check its size in MB
    If smaller than 250, read directly
    If larger, read in chunks
    Load the file into the database (.db file)
    """
    if dict_files.get(file).get('size') < 250:
        print("{} is a small file. Importing directly.".format(file))
        df_ = pd.read_csv(
            file, 
            sep="\t",
            low_memory=False,
            error_bad_lines=False
        )
        df_.to_sql(
            name=file.replace(".txt", ''), 
            con=conn, 
            index=False,
            if_exists='append'
        )
        print("Done.")
    else:
        print("{} is large. Importing in chunks.".format(file))
        size = int(np.ceil(dict_files.get(file).get('rows')/10))
        chunks = pd.read_csv(
            file, 
            sep="\t", 
            chunksize=size, 
            error_bad_lines=False,
            low_memory=False
        )
        for c in chunks:
            c.to_sql(
                name=file.replace(".txt", ''), 
                con=conn, 
                index=False,
                if_exists='append'
            )
        print("Done")    

description.txt is a small file. Importing directly.
Done.
distribution.txt is a small file. Importing directly.
Done.
reference.txt is large. Importing in chunks.
Done
speciesprofile.txt is a small file. Importing directly.
Done.
taxa.txt is large. Importing in chunks.
Done
vernacular.txt is a small file. Importing directly.
Done.


## Check DB

In [51]:
print("The database is {:.2f} MB in size".format(os.path.getsize('life.db')/10**6))

The database is 2236.95 MB in size


In [43]:
curs.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()

[('description',),
 ('distribution',),
 ('reference',),
 ('speciesprofile',),
 ('taxa',),
 ('vernacular',)]

## Run Queries

In [17]:
pd.read_sql_query(
    sql="SELECT * FROM taxa limit 10",
    con=conn
).isnull().mean()

index                       0.0
taxonID                     0.0
identifier                  0.5
datasetID                   0.0
datasetName                 0.0
acceptedNameUsageID         0.5
parentNameUsageID           0.5
taxonomicStatus             0.0
taxonRank                   0.0
verbatimTaxonRank           1.0
scientificName              0.0
kingdom                     0.0
phylum                      0.5
class                       0.5
order                       0.5
superfamily                 0.5
family                      0.5
genericName                 0.0
genus                       0.0
subgenus                    1.0
specificEpithet             0.0
infraspecificEpithet        1.0
scientificNameAuthorship    0.0
source                      1.0
namePublishedIn             1.0
nameAccordingTo             0.5
modified                    1.0
description                 1.0
taxonConceptID              1.0
scientificNameID            0.0
references                  0.0
isExtinc