# Intro to SQL

In [1]:
PATH = './assets/datasets/'

## Using sqlite connection

In [2]:
import sqlite3
sqlite_db = 'test_db.sqlite'
conn = sqlite3.connect(sqlite_db)
c = conn.cursor()

In [3]:
c.execute('CREATE TABLE houses (field1 INTEGER PRIMARY KEY, sqft INTEGER, bdrms INTEGER, age INTEGER, price INTEGER);')

# Save (commit) the changes
conn.commit()

In [4]:
last_sale = (None, 4000, 5, 22, 619000)
c.execute('INSERT INTO houses VALUES (?,?,?,?,?)',last_sale)

# Remember to commit the changes
conn.commit()

In [5]:
recent_sales = [
  (None, 2390, 4, 34, 319000),
  (None, 1870, 3, 14, 289000),
  (None, 1505, 3, 90, 269000),
]

c.executemany('INSERT INTO houses VALUES (?, ?, ?, ?, ?)', recent_sales)

conn.commit()

In [6]:
from numpy import genfromtxt

# import into nparray of ints, then convert to list of lists
data = (genfromtxt(PATH + 'housing-data.csv', dtype='i8',
                    delimiter=',', skip_header=1)).tolist()

In [7]:
data[0:3]

[[2104, 3, 70, 399900], [1600, 3, 28, 329900], [2400, 3, 44, 369000]]

In [8]:
# append a None value to beginning of each sub-list
for d in data:
    d.insert(0, None)

In [9]:
data[0:3]

[[None, 2104, 3, 70, 399900],
 [None, 1600, 3, 28, 329900],
 [None, 2400, 3, 44, 369000]]

In [10]:
# loop through data, running an INSERT on each record (i.e. sublist)
for d in data:
    c.execute('INSERT INTO houses VALUES (?, ?, ?, ?, ?)', d)

conn.commit()

In [11]:
# Similar syntax as before
results = c.execute("SELECT * FROM houses WHERE bdrms = 4")

# Here results is a cursor object - use fetchall() to extract a list
results.fetchall()

[(2, 2390, 4, 34, 319000),
 (9, 3000, 4, 75, 539900),
 (10, 1985, 4, 61, 299900),
 (15, 1940, 4, 7, 239999),
 (20, 2300, 4, 77, 449900),
 (23, 2609, 4, 5, 499998),
 (24, 3031, 4, 21, 599000),
 (28, 1962, 4, 53, 259900),
 (37, 2040, 4, 75, 314900),
 (39, 1811, 4, 24, 285900),
 (42, 2132, 4, 28, 345000),
 (43, 4215, 4, 66, 549000),
 (44, 2162, 4, 43, 287000),
 (47, 2567, 4, 57, 314000),
 (50, 1852, 4, 64, 299900)]

## Using pandas

In [12]:
import pandas as pd
from pandas.io import sql

In [13]:
import pandas as pd

data = pd.read_csv(PATH + 'housing-data.csv', low_memory=False)
data.head()

Unnamed: 0,sqft,bdrms,age,price
0,2104,3,70,399900
1,1600,3,28,329900
2,2400,3,44,369000
3,1416,2,49,232000
4,3000,4,75,539900


In [14]:
data.to_sql('houses_pandas',
            con=conn,
            if_exists='replace',
            index=False)

In [15]:
sql.read_sql('select * from houses_pandas limit 5', con=conn)

Unnamed: 0,sqft,bdrms,age,price
0,2104,3,70,399900
1,1600,3,28,329900
2,2400,3,44,369000
3,1416,2,49,232000
4,3000,4,75,539900


## SQL Operators

## SELECT

```SQL
SELECT
<columns>
FROM
<table>
```

In [16]:
sql.read_sql('select * from houses_pandas limit 10', con=conn)

Unnamed: 0,sqft,bdrms,age,price
0,2104,3,70,399900
1,1600,3,28,329900
2,2400,3,44,369000
3,1416,2,49,232000
4,3000,4,75,539900
5,1985,4,61,299900
6,1534,3,12,314900
7,1427,3,57,198999
8,1380,3,14,212000
9,1494,3,15,242500


```SQL
SELECT *
```
denotes returns all of the columns.

### We can also select individual columns

```SQL
SELECT
<col1>, <col2>, <coln>
FROM
<table>
```

In [17]:
sql.read_sql('select age, price from houses_pandas limit 10', con=conn)

Unnamed: 0,age,price
0,70,399900
1,28,329900
2,44,369000
3,49,232000
4,75,539900
5,61,299900
6,12,314900
7,57,198999
8,14,212000
9,15,242500


## Check: Write a query that returns only bedrooms and sq. footage, and price

In [22]:
sql.read_sql('select bdrms, price, sqft from houses_pandas limit 8', con=conn)

Unnamed: 0,bdrms,price,sqft
0,3,399900,2104
1,3,329900,1600
2,3,369000,2400
3,2,232000,1416
4,4,539900,3000
5,4,299900,1985
6,3,314900,1534
7,3,198999,1427


## WHERE

### Where is used to filter the data 

```SQL
SELECT
<columns>
FROM
<table>
WHERE
<condition>
```

### Example

```SQL
SELECT
sqft, bdrms, age, price
FROM houses_pandas
WHERE bdrms = 2 and price < 500000;
```

In [26]:
sql.read_sql('select sqft, bdrms, age, price from houses_pandas where bdrms = 2 and price < 500000', con=conn)

Unnamed: 0,sqft,bdrms,age,price
0,1416,2,49,232000
1,1320,2,62,299900
2,1888,2,79,255000
3,1839,2,40,349900
4,1664,2,40,368500
5,852,2,70,179900


## Check: Write a query that returns the sqft, bdrms, age for houses older than 60 years.

In [29]:
sql.read_sql('select sqft, bdrms, age from houses_pandas where age > 60', con = conn)

Unnamed: 0,sqft,bdrms,age
0,2104,3,70
1,3000,4,75
2,1985,4,61
3,2300,4,77
4,1320,2,62
5,1236,3,78
6,1888,2,79
7,2040,4,75
8,3137,3,67
9,4215,4,66


## AGGREGATIONS

- Average (i.e., arithmetic mean)
- Count
- Maximum
- Minimum
- Median
- Mode
- Sum

```SQL
SELECT COUNT(price)
FROM houses_pandas;
```

In [30]:
sql.read_sql('SELECT COUNT(price) FROM houses_pandas', con=conn)

Unnamed: 0,COUNT(price)
0,47


```SQL
SELECT AVG(sqft), MIN(price), MAX(price)
FROM houses_pandas
WHERE bdrms = 2;
```

In [31]:
sql.read_sql('SELECT AVG(sqft), MIN(price), MAX(price) FROM houses_pandas WHERE bdrms = 2', con=conn)

Unnamed: 0,AVG(sqft),MIN(price),MAX(price)
0,1496.5,179900,368500


## Check: <br><br> Write a query to find the average price per sq ft for one bedroom houses<br> <br> Write another to find the average price per sq ft for those great than 3 bedrooms

In [34]:
sql.read_sql('select bdrms, avg(price/sqft) from houses_pandas where bdrms = 1', con=conn)

Unnamed: 0,bdrms,avg(price/sqft)
0,1,169.0


In [37]:
sql.read_sql('select avg(price/sqft) from houses_pandas where bdrms > 3', con = conn)

Unnamed: 0,avg(price/sqft)
0,156.066667


## Independent Practice

Practice querying the **SQLite database** we've created in class using any of the methods you've learnt so far:

- console connection
- python sqlite3 package
- pandas
- BD Browser for SQLite

Practice querying the **PostgreSQL database** you can find at:

```
url: dsi.c20gkj5cvu3l.us-east-1.rds.amazonaws.com
port: 5432
database: dsi
user: dsi_student
password: gastudents
```

using:

- console connection
- python sqlalchemy package
- pandas
- Postico

Questions:

- What's the average price per room for 1 bedroom apartments?
- What's the average price per room for 2 bedrooms apartments?
- What's the most frequent apartment size (in terms of bedrooms)?
- How many are there of that apartment kind?
- What fraction of the total number are of that kind?
- How old is the oldest 3 bedrooms apartment?
- How old is the youngest apartment?
- What's the average age for the whole dataset?
- What's the average age for each bedroom size?

Try to answer all these in SQL.

If you finish, try completing the first sections of [SQL zoo](http://www.sqlzoo.net/).