In [1]:
import os
# Work from the directory that holds the raw data files so the relative paths
# used by the rest of the notebook resolve. The default is the original
# location; set TUTORIAL_DATA_DIR to run the notebook on another machine.
os.chdir(os.environ.get('TUTORIAL_DATA_DIR', '/home/data/'))

from time import time

In [2]:
import sqlite3
import pandas as pd

## Create an empty SQLite database file

In [3]:
# Start from a clean slate: drop any database left over from a previous run.
# On a first run there is nothing to delete, so a missing file is ignored
# rather than reported as an error; pure Python also keeps the cell portable.
try:
    os.remove("tutorial.db")
except FileNotFoundError:
    pass

In [4]:
# Open a connection to the on-disk database file; this handle is the one
# passed to pandas for both the bulk loads and the queries.
con = sqlite3.connect(database="tutorial.db")

## Create a _cursor_ object to interact with it

In [5]:
# Cursor bound to `con`, used for running raw SQL statements directly.
cur = con.cursor()

### Load KDD Data

In [6]:
# Build the list of KDD column names from the companion names file.
# The first line of that file is skipped, and each remaining entry keeps only
# the text before its first ":".
# NOTE(review): if kddcup.data has one more column than there are names here
# (e.g. a trailing label column), pandas will silently use the first data
# column as the index and shift every name — confirm the counts match.
kdd_names = [
    entry.split(":")[0]
    for entry in pd.read_csv("kddcup.names", skiprows=1, header=None).loc[:, 0]
]

In [7]:
# Lazily read the raw KDD file in 1M-row chunks so it never has to fit in
# memory at once; `kdd_chunks` is an iterator that yields DataFrames.
# `on_bad_lines="warn"` keeps the behaviour of the old
# `error_bad_lines=False` flag (malformed rows are reported and skipped).
# That flag was deprecated in pandas 1.3 and removed in 2.0, where passing it
# raises a TypeError. Requires pandas >= 1.3.
kdd_chunks = pd.read_csv(
    "kddcup.data",
    names=kdd_names,
    chunksize=10**6,
    on_bad_lines="warn",
)

In [8]:
# Stream the large text file into the `kdd` table chunk by chunk.
# Each chunk is a plain pandas DataFrame, so any filtering/transforming
# can be applied inside the loop before the write.
t0 = time()
for chunk in kdd_chunks:
    chunk.to_sql(name='kdd', con=con, if_exists='append', index=False)

print("Loading into db finished in {} seconds.".format(time() - t0))

Loading into db finished in 74.79250454902649 seconds.


### Load Flights Data

In [9]:
# Lazily read the flights CSV in 1M-row chunks; `flights_chunks` is an
# iterator that yields DataFrames, with column names taken from the file.
# `on_bad_lines="warn"` keeps the behaviour of the old
# `error_bad_lines=False` flag (malformed rows are reported and skipped).
# That flag was deprecated in pandas 1.3 and removed in 2.0, where passing it
# raises a TypeError. Requires pandas >= 1.3.
flights_chunks = pd.read_csv(
    'flights.csv',
    chunksize=10**6,
    on_bad_lines="warn",
)

In [10]:
# Stream the large text file into the `flightDelays` table chunk by chunk.
# Each chunk is a plain pandas DataFrame, so any filtering/transforming
# can be applied inside the loop before the write.
t0 = time()
for chunk in flights_chunks:
    chunk.to_sql(name='flightDelays', con=con, if_exists='append', index=False)

print("Loading into db finished in {} seconds.".format(time() - t0))

Loading into db finished in 93.90438771247864 seconds.


## Check if loading went well

In [13]:
# Ask the SQLite catalogue which tables exist; both loads should show up.
cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
cur.fetchall()

[('kdd',), ('flightDelays',)]

## Run SQL queries

In [15]:
# Pull the origin/destination columns for ten flight rows into a DataFrame.
pd.read_sql(sql="SELECT Origin,Dest FROM flightDelays LIMIT 10", con=con)

Unnamed: 0,Origin,Dest
0,SMF,ONT
1,SMF,PDX
2,SMF,PDX
3,SMF,PDX
4,SMF,PDX
5,SMF,PDX
6,SMF,PHX
7,SMF,PHX
8,SMF,PHX
9,SMF,PHX


In [16]:
# Count the rows that ended up in the `kdd` table.
pd.read_sql(sql="SELECT count(*) FROM kdd", con=con)

Unnamed: 0,count(*)
0,4898431


In [39]:
# Write a CSV that contains only the header row (the KDD column names, no
# data rows). Presumably a template that cleaned rows are appended to
# elsewhere — confirm against the cells not shown here.
pd.DataFrame(columns=kdd_names).to_csv(path_or_buf='kdd_clean.csv', index=False)