## Homework #13: Extract-Transform-Load 
Purpose of our ETL is to prepare data sets to answer the question:
     *Is the price of soybeans dependent on global fuel prices or weather in top-producing states of Iowa and Illinois?*

In [1]:
# Import libraries for use in Python programs
import pandas as pd
from sqlalchemy import create_engine
import datetime as dt

###  Extraction step of our ETL project:
* Submit request to NOAA Climate Data Center for weather station data in Illinois for the years 2014 to 2019
* Receive CSV files in email in either six month or one year increments due to max size allowance of 1GB for a single pull
* Concatenate all of the files into one large DataFrame using "glob" method

>**What is import glob?**
> An asterisk (*) matches zero or more characters in a segment of a name. For example, dir/*. import glob for name in glob. > > > glob('dir/*'): print name. The pattern matches every pathname (file or directory) in the directory dir, without recursing > > > further into subdirectories.

In [2]:
import glob

In [10]:
illinois_df = pd.concat(map(pd.read_csv, glob.glob("../input/illinois/*.csv")))
illinois_df = illinois_df.fillna(0)

In [11]:
print(len(illinois_df))

1259052


In [12]:
illinois_df.head()

Unnamed: 0,STATION,NAME,DATE,PRCP,TOBS
0,US1ILCR0012,"MOUNT CARROLL 6.8 NNW, IL US",1/10/2014,0.0,0.0
1,US1ILCR0012,"MOUNT CARROLL 6.8 NNW, IL US",1/11/2014,0.37,0.0
2,US1ILCR0012,"MOUNT CARROLL 6.8 NNW, IL US",1/12/2014,0.0,0.0
3,US1ILCR0012,"MOUNT CARROLL 6.8 NNW, IL US",1/13/2014,0.0,0.0
4,US1ILCR0012,"MOUNT CARROLL 6.8 NNW, IL US",1/14/2014,0.03,0.0


In [5]:
illinois_df.tail()

Unnamed: 0,STATION,NAME,DATE,PRCP,TOBS
57265,USC00117487,"ROSICLARE 5 NW, IL US",9/26/2019,0.05,65.0
57266,USC00117487,"ROSICLARE 5 NW, IL US",9/27/2019,0.03,55.0
57267,USC00117487,"ROSICLARE 5 NW, IL US",9/28/2019,0.0,65.0
57268,USC00117487,"ROSICLARE 5 NW, IL US",9/29/2019,0.0,66.0
57269,USC00117487,"ROSICLARE 5 NW, IL US",9/30/2019,0.0,65.0


In [6]:
# What are the data types in the DataFrame
illinois_df.dtypes

STATION     object
NAME        object
DATE        object
PRCP       float64
TOBS       float64
dtype: object

In [7]:
## Convert "Date" from a string to a date-time-group variable
illinois_df["DATE"] =  pd.to_datetime(illinois_df["DATE"])
illinois_df.dtypes

STATION            object
NAME               object
DATE       datetime64[ns]
PRCP              float64
TOBS              float64
dtype: object

In [8]:
# Rename the column headers
illinois_df = illinois_df.rename(columns={"DATE": "Date",
                                                          "PRCP": "Precipitation",
                                                          "TOBS": "Temp_of_observation "})
illinois_df.head()

Unnamed: 0,STATION,NAME,Date,Precipitation,Temp_of_observation
0,US1ILCR0012,"MOUNT CARROLL 6.8 NNW, IL US",2014-01-10,0.0,0.0
1,US1ILCR0012,"MOUNT CARROLL 6.8 NNW, IL US",2014-01-11,0.37,0.0
2,US1ILCR0012,"MOUNT CARROLL 6.8 NNW, IL US",2014-01-12,0.0,0.0
3,US1ILCR0012,"MOUNT CARROLL 6.8 NNW, IL US",2014-01-13,0.0,0.0
4,US1ILCR0012,"MOUNT CARROLL 6.8 NNW, IL US",2014-01-14,0.03,0.0


In [9]:
illinois_transformed = illinois_df.groupby(['Date']).mean()
illinois_transformed

Unnamed: 0_level_0,Precipitation,Temp_of_observation
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-01-01,0.067158,3.965392
2014-01-02,0.183380,2.313485
2014-01-03,0.017668,0.276867
2014-01-04,0.007070,5.591682
2014-01-05,0.271125,3.369004
2014-01-06,0.246179,-2.473684
2014-01-07,0.000000,-1.098077
2014-01-08,0.000095,1.657143
2014-01-09,0.007721,3.235405
2014-01-10,0.069322,6.879473


In [13]:
print(len(illinois_transformed))

2099
