In [None]:
## Local Data ETL

### Instructions

* Create a `customer_db` database in pgAdmin 4, then create the following two tables in it:

  * A `premise` table that contains the columns `id`, `premise_name` and `county_id`.

  * A `county` table that contains the columns `id`, `county_name`, `license_count` and `county_id`.

  * Be sure to assign a primary key, as Pandas will not be able to do so.

* Perform all ETL steps in a Jupyter Notebook.

* **Extraction**

  * Put each CSV into a pandas DataFrame.

* **Transform**

  * Copy only the columns needed into a new DataFrame.

  * Rename columns to fit the tables created in the database.

  * Handle any duplicates. **HINT:** some locations have the same name but each license number is unique.

  * Set index to the previously created primary key.

* **Load**

  * Create a connection to the database.

  * Check for a successful connection to the database and confirm that the tables have been created.

  * Append DataFrames to tables. Be sure to use the index set earlier.

* Confirm a successful **Load** by querying the database.

* Join the two tables and select the `id` and `premise_name` from the `premise` table and `county_name` from the `county` table.


In [73]:
import datetime as dt
import glob

import pandas as pd
from sqlalchemy import create_engine


In [74]:
#### Extract CSVs into DataFrames

In [75]:
# Read every CSV under ./iowa and stack them into a single DataFrame.
# glob returns paths in arbitrary, OS-dependent order, so sort them to make
# the row order of the combined frame reproducible across runs and machines.
csv_paths = sorted(glob.glob("./iowa/*.csv"))
if not csv_paths:
    # pd.concat on an empty sequence fails with an opaque
    # "No objects to concatenate"; report the real problem instead.
    raise FileNotFoundError("No CSV files found matching ./iowa/*.csv")

# ignore_index=True gives the combined frame one unique 0..n-1 row index
# instead of repeating each file's own 0-based index.
iowa_df = pd.concat(map(pd.read_csv, csv_paths), ignore_index=True)

# NOTE(review): filling with 0 turns a missing PRCP/TOBS reading into a real
# zero, which drags any later average toward 0 - confirm this is intended.
iowa_df = iowa_df.fillna(0)



In [76]:
# Preview the first rows only; the concatenated frame is too large to dump.
iowa_df.head(10)

Unnamed: 0,STATION,NAME,DATE,PRCP,TOBS
0,USC00135650,"MONTEZUMA 1 W, IA US",1/2/2014,0.08,0.0
1,USC00135650,"MONTEZUMA 1 W, IA US",1/14/2014,0.05,0.0
2,USC00135650,"MONTEZUMA 1 W, IA US",1/21/2014,0.06,0.0
3,USC00135650,"MONTEZUMA 1 W, IA US",2/5/2014,0.22,0.0
4,USC00135650,"MONTEZUMA 1 W, IA US",2/13/2014,0.02,0.0
5,USC00135650,"MONTEZUMA 1 W, IA US",2/14/2014,0.06,0.0
6,USC00135650,"MONTEZUMA 1 W, IA US",2/15/2014,0.04,0.0
7,USC00135650,"MONTEZUMA 1 W, IA US",2/16/2014,0.10,0.0
8,USC00135650,"MONTEZUMA 1 W, IA US",2/18/2014,0.20,0.0
9,USC00135650,"MONTEZUMA 1 W, IA US",2/20/2014,0.23,0.0


In [77]:
# Inspect the dtype of each column. Note that DATE comes back as `object`
# rather than a datetime dtype, and only PRCP and TOBS are numeric.
iowa_df.dtypes

STATION     object
NAME        object
DATE        object
PRCP       float64
TOBS       float64
dtype: object

In [84]:
# Average precipitation and observation temperature per station.
# The numeric columns are selected explicitly: NAME and DATE are object
# columns, and pandas >= 2.0 raises a TypeError when asked to average them
# (older versions silently dropped them with a warning).
iowa_transformed = iowa_df.groupby("STATION")[["PRCP", "TOBS"]].mean()

In [85]:
# Preview the per-station averages without dumping every station.
iowa_transformed.head(10)

Unnamed: 0_level_0,PRCP,TOBS
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1
US1IAAL0004,0.167129,0.000000
US1IAAL0005,0.145805,0.000000
US1IAAL0006,0.143371,0.000000
US1IAAP0004,0.297528,0.000000
US1IAAP0007,0.162941,0.000000
US1IAAP0008,0.182266,0.000000
US1IAAP0009,0.159650,0.000000
US1IAAP0010,0.252742,0.000000
US1IAAP0011,0.154903,0.000000
US1IABC0002,0.108231,0.000000


In [87]:
# Rename the column headers
iowa_transformed = iowa_transformed.rename(columns={"STATION": "Station",
                                                          "PRCP": "Precipitation",
                                                          "TOBS": "Temp_of_observation "})
iowa_transformed.head()


Unnamed: 0_level_0,Precipitation,Temp_of_observation
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1
US1IAAL0004,0.167129,0.0
US1IAAL0005,0.145805,0.0
US1IAAL0006,0.143371,0.0
US1IAAP0004,0.297528,0.0
US1IAAP0007,0.162941,0.0


In [None]:
### Create database connection