# ETL Project - Team 

## Objective
•	Obtain resources from real estate site and school accountability website to extract, transform, and load csv files to conduct an informative analysis in the future.
    
    - Help our clients to find the perfect house with the perfect school


## Type DataBase
•	SQL Postgres

In [1]:
import pandas as pd
from sqlalchemy import create_engine
import numpy as np

In [2]:
# store CSV into DF
csv_file= "./Resources/HAR.csv"
customer_data_df= pd.read_csv(csv_file)
customer_data_df.head()

Unnamed: 0,MLS,Home_Type,Street_Number,Street_Name,City,Zip,County,List_Price,Market_Area,Subdivision,...,Total_Baths,Room_Count,Fireplaces,Stories,Pool_Private,Garages,Style,DOM,CDOM,List_Date
0,15140876,Single-Family,1707,Hopper,Houston,77093,Harris,139900,34,Westfield Estates,...,1.0,5,0.0,1.0,False,0.0,Traditional,333,333,3/10/2020
1,76230997,Single-Family,1705,Hopper,Houston,77093,Harris,289900,34,Westfield Estates,...,1.0,4,0.0,1.0,False,2.0,Traditional,335,335,3/8/2020
2,91994173,Single-Family,3025,Cedar Hill,Houston,77093,Harris,159900,34,Greenwood Village,...,2.0,6,,1.0,False,,Traditional,92,92,11/6/2020
3,26012702,Single-Family,7538,Meadowyork,Houston,77037,Harris,235000,34,York Meadows,...,2.0,8,,1.0,False,,Traditional,103,103,10/26/2020
4,73504725,Single-Family,612,John Alber,Houston,77076,Harris,179000,34,Roosevelt,...,3.0,8,,1.0,False,2.0,Other Style,4,4,1/8/2021


In [3]:
# Total info
customer_data_df.shape

(3241, 28)

In [4]:
# Columns list
customer_data_df.columns

Index(['MLS', 'Home_Type', 'Street_Number', 'Street_Name', 'City', 'Zip',
       'County', 'List_Price', 'Market_Area', 'Subdivision', 'Area',
       'School_District', 'Elementary', 'High_School', 'Year_Built',
       'Bedrooms', 'Full_Baths', 'Half_Baths', 'Total_Baths', 'Room_Count',
       'Fireplaces', 'Stories', 'Pool_Private', 'Garages', 'Style', 'DOM',
       'CDOM', 'List_Date'],
      dtype='object')

In [5]:
# # Rename column headers
# customer_data_df.rename(columns= {})

In [5]:
# Navigate a single column
customer_data_df['Fireplaces']

0       0.0
1       0.0
2       NaN
3       NaN
4       NaN
       ... 
3236    NaN
3237    NaN
3238    NaN
3239    NaN
3240    NaN
Name: Fireplaces, Length: 3241, dtype: float64

In [6]:
# Replace null values with "0"
customer_data_df['Fireplaces']= customer_data_df['Fireplaces'].fillna(0)

In [7]:
# Check replacement
customer_data_df['Fireplaces']

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
3236    0.0
3237    0.0
3238    0.0
3239    0.0
3240    0.0
Name: Fireplaces, Length: 3241, dtype: float64

In [10]:
# Replace null values with "0"
customer_data_df['Garages']= customer_data_df['Fireplaces'].fillna(0)

, Schools (2019 from Troy's file), Features, Sales_Data, School_Rating

##### Location
table 1
Index(['MLS', 'Street_Number', 'Street_Name', 'Unit', 'City',
       'Zip', 'County', 'Subdivision'])

##### Schools
table 2
Index(['MLS', School_District', 'Elementary', 'High_School'])

##### Home_Features
table 3
Index(['MLS','Year_Built', 'Bedrooms', 'Full_Baths', 'Half_Baths', 'Total_Baths', 'Room_Count',
       'Fireplaces', 'Stories', 'Pool_Private', 'Garages', 'Style', 'Home_Type'])

##### Sales_Data
Table 4
Index(['MLS','List_Price', 'Market_Area', 'Area', 'DOM', 'CDOM', 'List_Date'])

##### School_Rating
Table 5
Load from Troy's file

In [11]:
# Checking unique values
customer_data_df.nunique()

MLS                3241
Home_Type             1
Street_Number      2168
Street_Name        1832
City                 10
Zip                  68
County                3
List_Price         1228
Market_Area          18
Subdivision        1699
Area                 58
School_District       1
Elementary          165
High_School          36
Year_Built          111
Bedrooms             11
Full_Baths           10
Half_Baths            7
Total_Baths          38
Room_Count           26
Fireplaces            8
Stories               9
Pool_Private          2
Garages               8
Style                79
DOM                 355
CDOM                572
List_Date           359
dtype: int64

In [12]:
# isna/isnull
customer_data_df.isna().sum()

MLS                 0
Home_Type           0
Street_Number       0
Street_Name         0
City                0
Zip                 0
County              0
List_Price          0
Market_Area         0
Subdivision         1
Area                0
School_District     0
Elementary         20
High_School        20
Year_Built         25
Bedrooms            0
Full_Baths          0
Half_Baths          0
Total_Baths         0
Room_Count          0
Fireplaces          0
Stories             0
Pool_Private        0
Garages             0
Style               0
DOM                 0
CDOM                0
List_Date           0
dtype: int64

In [13]:
# is not na sum
customer_data_df.notna().sum()

MLS                3241
Home_Type          3241
Street_Number      3241
Street_Name        3241
City               3241
Zip                3241
County             3241
List_Price         3241
Market_Area        3241
Subdivision        3240
Area               3241
School_District    3241
Elementary         3221
High_School        3221
Year_Built         3216
Bedrooms           3241
Full_Baths         3241
Half_Baths         3241
Total_Baths        3241
Room_Count         3241
Fireplaces         3241
Stories            3241
Pool_Private       3241
Garages            3241
Style              3241
DOM                3241
CDOM               3241
List_Date          3241
dtype: int64

In [14]:
# Summary of data
customer_data_df.describe()

Unnamed: 0,MLS,Zip,List_Price,Market_Area,Year_Built,Bedrooms,Full_Baths,Half_Baths,Total_Baths,Room_Count,Fireplaces,Stories,Garages,DOM,CDOM
count,3241.0,3241.0,3241.0,3241.0,3216.0,3241.0,3241.0,3241.0,3241.0,3241.0,3241.0,3241.0,3241.0,3241.0,3241.0
mean,50447330.0,77023.42456,689004.4,13.054921,1987.474192,3.338784,2.553533,0.685591,2.622092,7.582536,0.542117,1.916569,0.542117,78.177723,148.143783
std,28046280.0,1354.01626,810209.5,7.453523,32.809881,0.887501,1.050714,0.620786,1.079703,3.55565,0.764268,0.890571,0.764268,96.719579,194.616732
min,242718.0,77.0,63900.0,2.0,1900.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,26275090.0,77008.0,305000.0,9.0,1956.0,3.0,2.0,0.0,2.0,5.0,0.0,1.0,0.0,17.0,26.0
50%,50561570.0,77023.0,439000.0,16.0,2003.0,3.0,2.0,1.0,2.1,7.0,0.0,2.0,0.0,48.0,89.0
75%,74571080.0,77057.0,745000.0,17.0,2020.0,4.0,3.0,1.0,3.1,10.0,1.0,2.1,1.0,106.0,190.0
max,98990940.0,77521.0,14500000.0,34.0,2021.0,11.0,10.0,6.0,10.2,29.0,7.0,4.0,7.0,1244.0,2307.0


In [15]:
# Data types
customer_data_df.dtypes

MLS                  int64
Home_Type           object
Street_Number       object
Street_Name         object
City                object
Zip                  int64
County              object
List_Price           int64
Market_Area          int64
Subdivision         object
Area                object
School_District     object
Elementary          object
High_School         object
Year_Built         float64
Bedrooms             int64
Full_Baths           int64
Half_Baths           int64
Total_Baths        float64
Room_Count           int64
Fireplaces         float64
Stories            float64
Pool_Private          bool
Garages            float64
Style               object
DOM                  int64
CDOM                 int64
List_Date           object
dtype: object

In [16]:
# Create new table
HAR_df= customer_data_df
HAR_df.head()

Unnamed: 0,MLS,Home_Type,Street_Number,Street_Name,City,Zip,County,List_Price,Market_Area,Subdivision,...,Total_Baths,Room_Count,Fireplaces,Stories,Pool_Private,Garages,Style,DOM,CDOM,List_Date
0,15140876,Single-Family,1707,Hopper,Houston,77093,Harris,139900,34,Westfield Estates,...,1.0,5,0.0,1.0,False,0.0,Traditional,333,333,3/10/2020
1,76230997,Single-Family,1705,Hopper,Houston,77093,Harris,289900,34,Westfield Estates,...,1.0,4,0.0,1.0,False,0.0,Traditional,335,335,3/8/2020
2,91994173,Single-Family,3025,Cedar Hill,Houston,77093,Harris,159900,34,Greenwood Village,...,2.0,6,0.0,1.0,False,0.0,Traditional,92,92,11/6/2020
3,26012702,Single-Family,7538,Meadowyork,Houston,77037,Harris,235000,34,York Meadows,...,2.0,8,0.0,1.0,False,0.0,Traditional,103,103,10/26/2020
4,73504725,Single-Family,612,John Alber,Houston,77076,Harris,179000,34,Roosevelt,...,3.0,8,0.0,1.0,False,0.0,Other Style,4,4,1/8/2021
