In [1]:
import os
# Make /home/data/ the working directory: the relative paths used later in
# this notebook ("sqlite.db", "nyc-taxi/") are resolved against it.
# NOTE(review): this is a hard-coded absolute path, so the notebook only runs
# as-is on a machine that has this directory.
os.chdir('/home/data/')

from time import time

In [2]:
import numpy as np

import sqlite3
import pandas as pd

from sqlalchemy import create_engine

## Create (or open) a SQLite database file

In [3]:
# Open a connection to the SQLite file "sqlite.db" (relative to the current
# working directory). sqlite3.connect creates the file if it does not exist
# and otherwise opens the existing database.
con = sqlite3.connect("sqlite.db")

## Create a _cursor_ object to interact with it

In [5]:
# Cursor bound to `con`, used for executing raw SQL statements directly.
cur = con.cursor()

---
# Loading Data into the Database

- This step might take a while if your CSV files are larger than a few GB.

- But the benefits outweigh the wait time:
    - you can use `pd.read_sql` to pull data from the database without worrying about memory constraints.
    - you can use tools like `Metabase` or any SQL editor to write aggregations and reductions on big data locally.
    
    
- [Note] Avoid an unfiltered `SELECT *`, as it will load the entire table into memory.

- Use `WHERE` clauses and a `LIMIT` clause in every query to bound the size of the result.

### Load NYC Taxi Data

In [6]:
# Names of the data files to load from "nyc-taxi/".
# os.listdir returns entries in arbitrary order and also lists hidden entries
# and sub-directories (for example a ".ipynb_checkpoints" folder), which
# cannot be read as CSV. Keep only regular, non-hidden files and sort them so
# the load order is the same on every run.
files = sorted(
    name
    for name in os.listdir("nyc-taxi/")
    if not name.startswith(".")
    and os.path.isfile(os.path.join("nyc-taxi/", name))
)

In [7]:
def convert_types(COL):
    """
    Downcast a numeric column to a smaller dtype where its values allow it.

    Integer columns (of any width) are passed to ``pd.to_numeric`` with
    ``downcast='integer'`` and floating-point columns with
    ``downcast='float'``. Columns of any other dtype (object, datetime,
    bool, ...) are returned as-is.

    Parameters
    -----------
    COL: pandas.Series
        The Series to shrink

    Returns
    -------
    pandas.Series
        The downcast Series if COL is integer or float, otherwise COL itself.
    """
    # No `errors=` argument is passed: the column is already numeric, so
    # nothing can fail to parse, and `errors='ignore'` is deprecated in
    # recent pandas releases.
    if pd.api.types.is_integer_dtype(COL.dtype):
        return pd.to_numeric(COL, downcast='integer')
    if pd.api.types.is_float_dtype(COL.dtype):
        return pd.to_numeric(COL, downcast='float')
    return COL

In [None]:
# Load every file listed in `files` into the `nyc_taxi` table, one chunk at a
# time so that a whole CSV never has to fit in memory.
# NOTE: rows are appended (if_exists='append'), so re-running this cell adds
# the same rows to the table again.
t0 = time()

for f in files:
    # Read each CSV in chunks; for each chunk, compress the numeric columns
    # and load the rows into the database.
    print("Reading {}".format(f))

    # The `with` block closes the underlying file handle even if a chunk
    # fails part-way through the file.
    with pd.read_csv(
        os.path.join("nyc-taxi", f),
        chunksize=10**6,  # rows per chunk
        # Replaces the removed `error_bad_lines=False`: malformed lines are
        # skipped and a warning is emitted for each of them.
        on_bad_lines='warn',
        parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime']
    ) as f_chunks:

        for chunk in f_chunks:
            # Each chunk is just a pandas DataFrame;
            # filter/transform the data as needed here.
            (chunk
             .apply(convert_types)
             .to_sql(
                name='nyc_taxi',
                con=con,
                if_exists='append',
                index=False)
            )

print("Loading into db finished in {} seconds.".format(time()-t0))

Reading yellow_tripdata_2017-01.csv


---
## Check if loading went well

In [None]:
# List the names of all tables recorded in sqlite_master; `nyc_taxi` should
# appear here once the load step above has written at least one chunk.
cur.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()

---
## Run SQL queries

In [None]:
# Count the rows in `nyc_taxi`. The aggregation runs inside SQLite, so only
# the single-row result is pulled into pandas.
pd.read_sql("SELECT count(*) FROM nyc_taxi", con=con)