In [1]:
import pandas as pd
import numpy as np
import os
import pickle_funcs as pk
import helpers
from matplotlib import pyplot
import sqlite3
# folder the SQLite database file is written to (used by `write`)
# NOTE(review): absolute, machine-specific path; keep the trailing '/' because
# file names are appended with plain string concatenation
out_path = "C:/Users/dasen/Google Drive/SYNC TO PC/"
# folder that is scanned for the downloaded .csv / .xlsx files (same trailing-'/' rule)
downloads = "C:/Users/dasen/Downloads/"

# Description

Exploration and setup of data for the [Kaggle Zillow Prize project](https://www.kaggle.com/c/zillow-prize-1/data).

I want to create a database for this project so that all data can be in a centralized location.

# Load Data

In [31]:
# create list of data filenames sitting directly in my downloads folder
# - next(os.walk(...)) reads the top level only instead of walking every sub-folder
#   (a missing/unreadable folder simply yields an empty list)
# - endswith() avoids matching names that merely contain '.csv' (e.g. 'x.csv.bak')
# - sorted() pins the order, because later cells refer to the files by position
files = sorted(
    name
    for name in next(os.walk(downloads), ('', [], []))[2]
    if name.endswith(('.csv', '.xlsx'))
)
files

['data_dictionary.xlsx',
 'properties.csv',
 'sample_submission.csv',
 'train.csv']

In [27]:
def write(table_name, df, db_path=None):
    """Write a dataframe to a table in the project's SQLite database.

    An existing table with the same name is replaced. The dataframe's index
    is stored as a column too, since that is the ``to_sql`` default.

    Parameters
    ----------
    table_name : str
        Name of the destination table.
    df : pandas.DataFrame
        Data to store.
    db_path : str, optional
        Path of the SQLite file to write to. When omitted, the file
        ``data998_database.sqlite3`` inside ``out_path`` is used.
    """
    if db_path is None:
        db_path = out_path + 'data998_database.sqlite3'
    conn = sqlite3.connect(db_path)
    try:
        df.to_sql(table_name, conn, if_exists='replace')
    finally:
        # always release the database file, even if the write fails part-way
        conn.close()

## Review and Write Data Dictionary

In [4]:
files[0]

'data_dictionary.xlsx'

In [5]:
%%time
# load and view
df = pd.read_excel(downloads + files[0])
df.columns = ['feature', 'description']
df.head(10)

Wall time: 39 ms


In [6]:
%%time
# write data dictionary 
write('data_dictionary', df)

Wall time: 187 ms


## Review and Write Properties Data

In [7]:
files[1]

'properties.csv'

After first loading this data, I received this warning:

```
<string>:2: DtypeWarning: Columns (22,32,34,49,55) have mixed types. Specify dtype option on import or set low_memory=False
```

Here are the descriptions of each of these columns: 

```
count     69014
unique        1
top        True
freq      69014
Name: hashottuborspa, dtype: object

count     2972940
unique        240
top          0100
freq      1153896
Name: propertycountylandusecode, dtype: object

count     1978629
unique       5638
top          LAR1
freq       275029
Name: propertyzoningdesc, dtype: object

count     5163
unique       1
top       True
freq      5163
Name: fireplaceflag, dtype: object

count     56462
unique        1
top           Y
freq      56462
Name: taxdelinquencyflag, dtype: object
```

For now, I'm just going to load these columns as string columns by using the `dtype` argument in the `read_csv` method. 

In [2]:
# dict to set types of columns
# keyed by column name instead of position so it is self-describing and keeps
# working if the column order changes; these are the five mixed-type columns
# (positions 22, 32, 34, 49, 55) reported by the DtypeWarning
types = {
    'hashottuborspa': str,
    'propertycountylandusecode': str,
    'propertyzoningdesc': str,
    'fireplaceflag': str,
    'taxdelinquencyflag': str,
}

In [9]:
%%time
# load and view 
df = pd.read_csv(downloads + files[1], dtype=types)

Wall time: 19.9 s


In [10]:
df.shape

(2985217, 58)

In [11]:
df.sample(10)

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
469357,12671327,,,,1.0,3.0,,7.0,1.0,,...,,,89742.0,448746.0,2015.0,359004.0,5255.4,,,60376510000000.0
1803154,11276311,1.0,,,2.0,3.0,,4.0,2.0,,...,,,161701.0,200046.0,2015.0,38345.0,3255.37,,,60379010000000.0
133329,14387105,,,,4.0,4.0,,,4.0,,...,,,661188.0,1078783.0,2015.0,417595.0,24718.4,,,60590630000000.0
1033746,12511403,1.0,,,2.0,2.0,,4.0,2.0,,...,,,178000.0,465000.0,2015.0,287000.0,5935.71,,,60375750000000.0
2581864,12961882,,,,1.0,3.0,,7.0,1.0,,...,,,24658.0,54010.0,2015.0,29352.0,919.11,,,60374310000000.0
778806,11964208,,,,3.0,5.0,,7.0,3.0,,...,,,35539.0,78757.0,2015.0,43218.0,1340.7,,,60371850000000.0
2169650,14738088,,,,2.0,2.0,,,2.0,,...,,,95320.0,414048.0,2015.0,318728.0,4716.18,,,60590630000000.0
372089,17095529,,,,2.0,5.0,,,2.0,,...,1.0,,178188.0,356376.0,2015.0,178188.0,3765.28,,,61110010000000.0
55562,14025844,,,,2.0,5.0,,,2.0,,...,1.0,,105942.0,622187.0,2015.0,516245.0,7282.08,,,60590640000000.0
2822535,14337927,,,,3.0,4.0,,,3.0,,...,,,207893.0,343810.0,2015.0,135917.0,4507.88,,,60590760000000.0


In [12]:
%%time
# write properties data 
write('properties', df)

Wall time: 2min 29s


## Review and Write Sample Submissions

In [45]:
files[2]

'sample_submission.csv'

In [46]:
%%time
# load and view
df = pd.read_csv(downloads + files[2])

Wall time: 1.55 s


In [47]:
df.head(10)

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0,0,0,0,0,0
1,10759547,0,0,0,0,0,0
2,10843547,0,0,0,0,0,0
3,10859147,0,0,0,0,0,0
4,10879947,0,0,0,0,0,0
5,10898347,0,0,0,0,0,0
6,10933547,0,0,0,0,0,0
7,10940747,0,0,0,0,0,0
8,10954547,0,0,0,0,0,0
9,10976347,0,0,0,0,0,0


In [48]:
%%time
# write sample submission data 
write('sample', df)

Wall time: 9.3 s


## Review and Write Training Data

In [38]:
files[3]

'train.csv'

In [39]:
%%time
# load and view
df = pd.read_csv(downloads + files[3])

Wall time: 76.6 ms


In [40]:
df.head(10)

Unnamed: 0,parcelid,logerror,transactiondate
0,11016594,0.0276,2016-01-01
1,14366692,-0.1684,2016-01-01
2,12098116,-0.004,2016-01-01
3,12643413,0.0218,2016-01-02
4,14432541,-0.005,2016-01-02
5,11509835,-0.2705,2016-01-02
6,12286022,0.044,2016-01-02
7,17177301,0.1638,2016-01-02
8,14739064,-0.003,2016-01-02
9,14677559,0.0843,2016-01-03


In [41]:
df.shape

(90275, 3)

In [42]:
%%time
# write training data 
write('train', df)

Wall time: 325 ms


# Review Database

In [49]:
helpers.inspect_database()

[('data_dictionary',), ('properties',), ('train',), ('sample',)]

I want to compare how long it takes to load data from a `csv` versus loading from the database. 

The properties file is very large. From the `csv`, it takes about 20 seconds to load:

In [6]:
%%time
# load and view 
df = pd.read_csv(downloads + 'properties.csv', dtype=types)

Wall time: 20.5 s


And the `sqlite3` database takes about three times as long:

In [3]:
%%time
# load and view from database 
query = """select * from properties"""
df = helpers.query_database(query)

Wall time: 1min 4s
