# ETL Project - Team 

## Objective
- Obtain data from a real estate site and a school accountability website, then extract, transform, and load the CSV files so they can support informative analysis in the future.
    - Help our clients find the perfect house with the perfect school.


## Database Type
- PostgreSQL (SQL)

## HAR Data

In [1]:
from urllib.parse import quote

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

# NOTE(review): the stdlib `secrets` module has no `username`/`password`, so this
# resolves to a project-local secrets.py that shadows the stdlib module —
# consider renaming that file (e.g. config.py) to avoid import clashes.
from secrets import username, password

In [2]:
# Read the HAR CSV export into a DataFrame and preview its first five rows
csv_file = "./Resources/HAR.csv"
customer_data_df = pd.read_csv(filepath_or_buffer=csv_file)
customer_data_df.head(5)

Unnamed: 0,MLS,Street_Number,Street_Name,City,Zip,County,Subdivision,Home_Type,Year_Built,Bedrooms,...,Style,List_Price,Market_Area,Area,DOM,CDOM,List_Date,School_District,Elementary,High_School
0,70161219,1747,Forest Hill,Houston,77023,Harris,Forest Hill,Single-Family,1952,3,...,Traditional,255000,4,East End Revitalized,5,5,2/1/2021,27 - Houston,BRISCOE ELEMENTARY,AUSTIN HIGH SCHOOL
1,17580127,1911,Pasadena,Houston,77023,Harris,Forest Hill,Single-Family,1949,2,...,Traditional,324900,4,East End Revitalized,74,74,11/24/2020,27 - Houston,BRISCOE ELEMENTARY,AUSTIN HIGH SCHOOL
2,35404452,1931,Santa Rosa,Houston,77023,Harris,Forest Hill,Single-Family,1949,3,...,"Contemporary/Modern, Traditional",319000,4,East End Revitalized,53,53,12/11/2020,27 - Houston,BRISCOE ELEMENTARY,AUSTIN HIGH SCHOOL
3,12503971,2022,Forest Hill,Houston,77023,Harris,Forest Hill,Single-Family,1938,4,...,Traditional,399000,4,East End Revitalized,3,3,2/3/2021,27 - Houston,BRISCOE ELEMENTARY,AUSTIN HIGH SCHOOL
4,32263872,6707,Avenue I,Houston,77011,Harris,Central Park,Single-Family,2020,3,...,Traditional,280990,4,University Area,134,251,9/25/2020,27 - Houston,BRISCOE ELEMENTARY,AUSTIN HIGH SCHOOL


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple
customer_data_df.shape

(3241, 28)

In [4]:
# List the column labels of the loaded frame
customer_data_df.columns

Index(['MLS', 'Street_Number', 'Street_Name', 'City', 'Zip', 'County',
       'Subdivision', 'Home_Type', 'Year_Built', 'Bedrooms', 'Full_Baths',
       'Half_Baths', 'Total_Baths', 'Room_Count', 'Fireplaces', 'Stories',
       'Pool_Private', 'Garages', 'Style', 'List_Price', 'Market_Area', 'Area',
       'DOM', 'CDOM', 'List_Date', 'School_District', 'Elementary',
       'High_School'],
      dtype='object')

In [5]:
# Inspect the Fireplaces column on its own before any cleaning
customer_data_df['Fireplaces']

0       0
1       1
2       0
3       1
4       0
       ..
3236    0
3237    0
3238    0
3239    0
3240    1
Name: Fireplaces, Length: 3241, dtype: int64

In [6]:
# Treat a missing fireplace count as integer 0 and write the column back
fireplaces_filled = customer_data_df['Fireplaces'].fillna(value=0)
customer_data_df['Fireplaces'] = fireplaces_filled

In [7]:
# Display the Fireplaces column again after the fill
customer_data_df['Fireplaces']

0       0
1       1
2       0
3       1
4       0
       ..
3236    0
3237    0
3238    0
3239    0
3240    1
Name: Fireplaces, Length: 3241, dtype: int64

In [8]:
# Treat a missing garage count as integer 0 and write the column back
garages_filled = customer_data_df['Garages'].fillna(value=0)
customer_data_df['Garages'] = garages_filled

In [9]:
# Number of distinct values in each column
customer_data_df.nunique()

MLS                3241
Street_Number      2151
Street_Name        1841
City                 10
Zip                  67
County                3
Subdivision        1699
Home_Type             1
Year_Built          112
Bedrooms             11
Full_Baths           10
Half_Baths            7
Total_Baths          38
Room_Count           26
Fireplaces            8
Stories               5
Pool_Private          2
Garages              16
Style                79
List_Price         1228
Market_Area          18
Area                 58
DOM                 355
CDOM                572
List_Date           359
School_District       2
Elementary          167
High_School          36
dtype: int64

In [10]:
# Number of columns: nunique() yields one entry per column and count()
# tallies those entries (this is not a count of unique values)
customer_data_df.nunique().count()

28

In [11]:
# Count of missing values in each column
customer_data_df.isna().sum()

MLS                0
Street_Number      0
Street_Name        0
City               0
Zip                0
County             0
Subdivision        1
Home_Type          0
Year_Built         0
Bedrooms           0
Full_Baths         0
Half_Baths         0
Total_Baths        0
Room_Count         0
Fireplaces         0
Stories            0
Pool_Private       0
Garages            0
Style              0
List_Price         0
Market_Area        0
Area               0
DOM                0
CDOM               0
List_Date          0
School_District    0
Elementary         0
High_School        0
dtype: int64

In [12]:
# dtype pandas holds for each column
customer_data_df.dtypes

MLS                  int64
Street_Number        int64
Street_Name         object
City                object
Zip                  int64
County              object
Subdivision         object
Home_Type           object
Year_Built           int64
Bedrooms             int64
Full_Baths           int64
Half_Baths           int64
Total_Baths        float64
Room_Count           int64
Fireplaces           int64
Stories              int64
Pool_Private          bool
Garages              int64
Style               object
List_Price           int64
Market_Area          int64
Area                object
DOM                  int64
CDOM                 int64
List_Date           object
School_District     object
Elementary          object
High_School         object
dtype: object

In [13]:
# Create the cleaned HAR table as an independent copy.
# Plain assignment (HAR_df = customer_data_df) only binds a second name to the
# same object, so any later change to HAR_df would also alter customer_data_df.
HAR_df = customer_data_df.copy()
HAR_df.head()

Unnamed: 0,MLS,Street_Number,Street_Name,City,Zip,County,Subdivision,Home_Type,Year_Built,Bedrooms,...,Style,List_Price,Market_Area,Area,DOM,CDOM,List_Date,School_District,Elementary,High_School
0,70161219,1747,Forest Hill,Houston,77023,Harris,Forest Hill,Single-Family,1952,3,...,Traditional,255000,4,East End Revitalized,5,5,2/1/2021,27 - Houston,BRISCOE ELEMENTARY,AUSTIN HIGH SCHOOL
1,17580127,1911,Pasadena,Houston,77023,Harris,Forest Hill,Single-Family,1949,2,...,Traditional,324900,4,East End Revitalized,74,74,11/24/2020,27 - Houston,BRISCOE ELEMENTARY,AUSTIN HIGH SCHOOL
2,35404452,1931,Santa Rosa,Houston,77023,Harris,Forest Hill,Single-Family,1949,3,...,"Contemporary/Modern, Traditional",319000,4,East End Revitalized,53,53,12/11/2020,27 - Houston,BRISCOE ELEMENTARY,AUSTIN HIGH SCHOOL
3,12503971,2022,Forest Hill,Houston,77023,Harris,Forest Hill,Single-Family,1938,4,...,Traditional,399000,4,East End Revitalized,3,3,2/3/2021,27 - Houston,BRISCOE ELEMENTARY,AUSTIN HIGH SCHOOL
4,32263872,6707,Avenue I,Houston,77011,Harris,Central Park,Single-Family,2020,3,...,Traditional,280990,4,University Area,134,251,9/25/2020,27 - Houston,BRISCOE ELEMENTARY,AUSTIN HIGH SCHOOL


In [14]:
# Write the HAR table to CSV, leaving out the DataFrame index column
HAR_df.to_csv(path_or_buf="./Resources/clean_har.csv", index=False)

## TEA Data

In [15]:
# Read the school rating CSV into a DataFrame and preview its first five rows
csv_file = "./Resources/School_rating.csv"
school_rating_df = pd.read_csv(filepath_or_buffer=csv_file)
school_rating_df.head(5)

Unnamed: 0,Campus_Number,School,District,Yrs_Unacceptable,Rating_2019
0,101902001,ALDINE HIGH SCHOOL,ALDINE ISD,0,C
1,101902099,ALDINE J J A E P,ALDINE ISD,0,Not Rated
2,101902041,ALDINE MIDDLE,ALDINE ISD,0,D
3,101902101,ANDERSON ACADEMY,ALDINE ISD,1,F
4,101902015,AVALOS P-TECH SCHOOL,ALDINE ISD,0,Not Rated


In [16]:
# Write the school ratings to CSV, leaving out the DataFrame index column
school_rating_df.to_csv(path_or_buf='./Resources/clean_ratings.csv', index=False)

In [17]:
# Count the rows recorded under each school name; a count above 1 would
# mean the same school name appears more than once
dupl_school_rating = school_rating_df.groupby('School').size()
print(dupl_school_rating)

School
ACADEMY OF CHOICE                     1
ADAM ELEMENTARY                       1
ADAMS J H                             1
ALAMO ELEMENTARY                      1
ALCOTT ELEMENTARY                     1
                                     ..
YOUNG ELEMENTARY                      1
YOUNG LEARNERS                        1
YOUNG SCHOLARS ACADEMY FOR EXCELLE    1
YOUNG WOMEN'S COLLEGE PREP ACADEMY    1
ZELMA HUTSELL ELEMENTARY              1
Length: 655, dtype: int64


In [18]:
# Sort the per-school row counts from highest to lowest, so any school name
# occurring more than once would appear at the top
dupl_school_rating.sort_values(ascending=False)

School
ACADEMY OF CHOICE                1
PATTERSON ELEMENTARY             1
NORTHSIDE HIGH SCHOOL            1
NOTTINGHAM COUNTRY ELEMENTARY    1
NOTTINGHAM ELEMENTARY            1
                                ..
GRAY ELEMENTARY                  1
GREEN VALLEY ELEMENTARY          1
GREENSPOINT ELEMENTARY           1
GREGG ELEMENTARY                 1
ZELMA HUTSELL ELEMENTARY         1
Length: 655, dtype: int64

# Load to SQL

In [19]:
# Connect to the local Postgres database.
# The credentials are percent-encoded before being placed in the URL so that
# characters such as "@", ":" or "/" in the username or password are not
# misread as URL delimiters. Plain alphanumeric credentials are unchanged.
rds_connection_string = (
    f"{quote(str(username), safe='')}:{quote(str(password), safe='')}"
    "@localhost:5432/etlproject_db"
)
engine = create_engine(f'postgresql://{rds_connection_string}')

In [20]:
# List the tables that currently exist in the database.
# Engine.table_names() is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API returns the same list and is available in both.
inspect(engine).get_table_names()

[]

In [21]:
# Load school_rating_df into the "school_rating" table.
# if_exists='replace' keeps the load idempotent: with 'append', each re-run of
# this notebook would insert the same rows again and duplicate the table contents.
school_rating_df.to_sql(name='school_rating', con=engine, if_exists='replace', index=False)

In [22]:
# Read the school_rating table back from the database and show its first rows
school_rating_check = pd.read_sql_query(sql='select * from school_rating', con=engine)
school_rating_check.head()

Unnamed: 0,Campus_Number,School,District,Yrs_Unacceptable,Rating_2019
0,101902001,ALDINE HIGH SCHOOL,ALDINE ISD,0,C
1,101902099,ALDINE J J A E P,ALDINE ISD,0,Not Rated
2,101902041,ALDINE MIDDLE,ALDINE ISD,0,D
3,101902101,ANDERSON ACADEMY,ALDINE ISD,1,F
4,101902015,AVALOS P-TECH SCHOOL,ALDINE ISD,0,Not Rated


In [23]:
# Load HAR_df into the "mls" table.
# if_exists='replace' keeps the load idempotent: with 'append', each re-run of
# this notebook would insert the same listings again and duplicate the table contents.
HAR_df.to_sql(name='mls', con=engine, if_exists='replace', index=False)

In [24]:
# Read the mls table back from the database and show its first rows
mls_check = pd.read_sql_query(sql='select * from mls', con=engine)
mls_check.head()

Unnamed: 0,MLS,Street_Number,Street_Name,City,Zip,County,Subdivision,Home_Type,Year_Built,Bedrooms,...,Style,List_Price,Market_Area,Area,DOM,CDOM,List_Date,School_District,Elementary,High_School
0,70161219,1747,Forest Hill,Houston,77023,Harris,Forest Hill,Single-Family,1952,3,...,Traditional,255000,4,East End Revitalized,5,5,2/1/2021,27 - Houston,BRISCOE ELEMENTARY,AUSTIN HIGH SCHOOL
1,17580127,1911,Pasadena,Houston,77023,Harris,Forest Hill,Single-Family,1949,2,...,Traditional,324900,4,East End Revitalized,74,74,11/24/2020,27 - Houston,BRISCOE ELEMENTARY,AUSTIN HIGH SCHOOL
2,35404452,1931,Santa Rosa,Houston,77023,Harris,Forest Hill,Single-Family,1949,3,...,"Contemporary/Modern, Traditional",319000,4,East End Revitalized,53,53,12/11/2020,27 - Houston,BRISCOE ELEMENTARY,AUSTIN HIGH SCHOOL
3,12503971,2022,Forest Hill,Houston,77023,Harris,Forest Hill,Single-Family,1938,4,...,Traditional,399000,4,East End Revitalized,3,3,2/3/2021,27 - Houston,BRISCOE ELEMENTARY,AUSTIN HIGH SCHOOL
4,32263872,6707,Avenue I,Houston,77011,Harris,Central Park,Single-Family,2020,3,...,Traditional,280990,4,University Area,134,251,9/25/2020,27 - Houston,BRISCOE ELEMENTARY,AUSTIN HIGH SCHOOL
