# Brooklyn Home Sales - ETL Process
**by Group 4**


This plan illustrates how we will transform the original data and ingest it into the new PostgreSQL database. In order to do this, we will go through the following steps:
1. Connect to the PostgreSQL server and database
2. Load and inspect the original data
3. Transform the original data into necessary tables
4. Ingest the transformed data into the PostgreSQL database and the respective tables

First, we will import the required packages.

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

We will now establish the connection with our PostgreSQL server and the database, as well as create the tables based on the normalized schema we designed.

In [None]:
# Connect to the PostgreSQL server and create every table of the normalized schema.
import os

from sqlalchemy import text

# Connection string for the target database. It can be overridden through the
# BROOKLYN_DB_URL environment variable so that credentials do not have to be
# edited into the notebook.
# NOTE(review): the fallback below still embeds a plaintext password; rotate it
# and remove the default once the environment variable is set everywhere.
conn_url = os.environ.get(
    'BROOKLYN_DB_URL',
    'postgresql://postgres:5ho7gvcd@f19server.apan5310.com:50204/brooklyn_home',
)

# Create an engine that connects to the PostgreSQL server
engine = create_engine(conn_url)

############################################################################################################
# SQL statements that create all tables. Referenced tables are created before
# the tables whose FOREIGN KEY constraints point at them.
stmt = """
CREATE TABLE owners (
             owner_id int,
             owner_name	varchar(100) NOT NULL,
             owner_type	char(1),
             PRIMARY KEY (owner_id)
             );

CREATE TABLE addresses (
             address_id int,
             address varchar(100) NOT NULL,
             zipcode int,
             lon numeric(10,6),
             lat numeric(10,6),
             PRIMARY KEY (address_id)
             );
             

CREATE TABLE neighborhoods (
             neighborhood_id int,
             neighborhood varchar(100) NOT NULL,
             borough varchar(60) NOT NULL,
             PRIMARY KEY (neighborhood_id)
             );

CREATE TABLE building_class_categories (
             building_class_category_id int,
             building_class_category varchar(100) NOT NULL,
             PRIMARY KEY (building_class_category_id)
             );

CREATE TABLE building_classes (
             building_class_id int,
             building_class char(2) NOT NULL,
             PRIMARY KEY (building_class_id)
);

CREATE TABLE building_class_categories_combinations (
             building_class_id int,
             building_class_category_id int,
             PRIMARY KEY (building_class_id, building_class_category_id),
             FOREIGN KEY (building_class_id) REFERENCES building_classes (building_class_id),
             FOREIGN KEY (building_class_category_id) REFERENCES building_class_categories (building_class_category_id) 
             );

CREATE TABLE tax_classes (
             tax_class_id int,
             tax_class varchar(10) NOT NULL,
             PRIMARY KEY (tax_class_id)
             );

CREATE TABLE community_districts (
             community_district_id int,
             community_district int NOT NULL,
             population int,
             area_sq_mi numeric(3,1),
             male_perc numeric(3,1),
             female_perc numeric(3,1),
             foreign_born_pop numeric(3,1),
             majority_race_pop varchar(10),
             access_to_parks int,
             commute_to_work_in_min numeric(3,1),
             education_bachelor_level numeric(3,1),
             CHECK (majority_race_pop IN ('White','Black','Hispanic','Other')),
             PRIMARY KEY (community_district_id)
             );

CREATE TABLE council_districts (
             city_council_district_id int,
             city_council_district int NOT NULL,
             PRIMARY KEY (city_council_district_id)
             );

CREATE TABLE school_districts (
             school_district_id int,
             school_district int NOT NULL,
             students_enrolled int,
             nr_schools int,
             school_dist_address_id int,
             PRIMARY KEY (school_district_id),
             FOREIGN KEY (school_dist_address_id) REFERENCES addresses (address_id)
             );

CREATE TABLE fire_companies (
             fire_company_id int,
             fire_company char(4) NOT NULL,
             fire_comp_address_id int,
             PRIMARY KEY (fire_company_id),
             FOREIGN KEY (fire_comp_address_id) REFERENCES addresses (address_id)
            );

CREATE TABLE police_precincts (
             police_precinct_id int,
             police_precinct int NOT NULL,
             police_prct_address_id int,
             phone_number varchar(12),
             PRIMARY KEY (police_precinct_id),
             FOREIGN KEY (police_prct_address_id) REFERENCES addresses (address_id)
             );

CREATE TABLE properties (
             property_id int,
             address_id int,
             neighborhood_id int NOT NULL,
             building_class_category_id int,
             building_class_id int,
             tax_class_id int,
             community_district_id int,
             city_council_district_id int,
             school_district_id int,
             fire_company_id int,
             police_precinct_id int,
             land_sqft numeric(10,1) NOT NULL,
             total_units int NOT NULL,
             res_units int NOT NULL,
             comm_units int NOT NULL,
             block int NOT NULL,
             lot int NOT NULL,
             PRIMARY KEY (property_id),
             FOREIGN KEY (neighborhood_id) REFERENCES neighborhoods (neighborhood_id),
             FOREIGN KEY (address_id) REFERENCES addresses (address_id),
             FOREIGN KEY (building_class_category_id) REFERENCES building_class_categories (building_class_category_id),
             FOREIGN KEY (building_class_id) REFERENCES building_classes (building_class_id),
             FOREIGN KEY (community_district_id) REFERENCES community_districts (community_district_id),
             FOREIGN KEY (city_council_district_id) REFERENCES council_districts (city_council_district_id),
             FOREIGN KEY (school_district_id) REFERENCES school_districts (school_district_id),
             FOREIGN KEY (fire_company_id) REFERENCES fire_companies (fire_company_id),
             FOREIGN KEY (police_precinct_id) REFERENCES police_precincts (police_precinct_id),
             FOREIGN KEY (tax_class_id) REFERENCES tax_classes (tax_class_id)
             );

CREATE TABLE property_sales (
             sale_id int,
             property_id int NOT NULL,
             sale_price numeric (20,1) NOT NULL,
             sale_date date NOT NULL,
             year_of_sale numeric(4,0) NOT NULL,
             tax_class_id int,
             apartment_number varchar(20),
             gross_sqft numeric(20,1) NOT NULL,
             PRIMARY KEY (sale_id),
             FOREIGN KEY (property_id) REFERENCES properties (property_id),
             FOREIGN KEY (tax_class_id) REFERENCES tax_classes (tax_class_id)
             );

CREATE TABLE owns (
             owner_id int,
             property_id int,
             PRIMARY KEY (owner_id, property_id),
             FOREIGN KEY (owner_id) REFERENCES owners (owner_id),
             FOREIGN KEY (property_id) REFERENCES properties (property_id)
             );

"""

# Run the DDL inside a single transaction. engine.begin() commits when the
# block succeeds, rolls back if any CREATE TABLE fails (so no half-built schema
# is left behind), and always releases the connection afterwards.
# The string is wrapped in text() because SQLAlchemy 2.x no longer accepts a
# plain str in Connection.execute(); the DDL contains no ":name" tokens, so
# text() does not treat any part of it as a bind parameter.
with engine.begin() as connection:
    connection.execute(text(stmt))


**Loading and Inspecting the Dataset**<br>
In this step, we will load the csv with the original dataset and look into its structure in order for us to accurately transform it into the necessary tables for the new relational database.

In [2]:
# Load the raw Brooklyn home-sales export into the working DataFrame `df`
df = pd.read_csv("brooklyn.csv")

We also load additional csv files that contain information on community districts, school districts, fire stations, police precincts.
This information was scraped from NY government official websites. We then merge these tables with the original dataset. 

In [3]:
# Supplementary lookup files: community districts, school districts,
# fire companies and police precincts
cd_df = pd.read_csv('community_district_info.csv')
sd_df = pd.read_csv('school_districts_info.csv')
fc_df = pd.read_csv('Brooklyn_Fire_Stations.csv')
pc_df = pd.read_csv('police_precincts_info.csv')

# Left-join each lookup onto df, one after the other and in this order.
# Each entry is (lookup frame, key column in df, key column in the lookup).
for lookup, df_key, lookup_key in (
    (cd_df, 'CD', 'community_district'),
    (sd_df, 'SchoolDist', 'school_district'),
    (fc_df, 'FireComp', 'fire_company'),
    (pc_df, 'PolicePrct', 'police_precinct'),
):
    df = df.merge(lookup, how='left', left_on=df_key, right_on=lookup_key)

In [4]:
# Preview the first five rows of the merged DataFrame
df.head(n=5)

Unnamed: 0,address,total_units,land_sqft,residential_units,commercial_units,block,lot,sale_price,sale_date,year_of_sale,...,nr_schools,school_district_address,school_district_zipcode,fire_company,fire_company_address,fire_company_zipcode,police_precinct,police_precinct_address,police_precinct_zipcode,phone_number
0,21 CLARK STREET,1,20267,0,1,230,1,202500000,10/31/2017,2017,...,41.0,355 Park Ave,11238.0,E205,74 Middagh Street,11201.0,84.0,301 Gold Street,11201.0,718-875-6811
1,16 COURT STREET,102,12500,0,102,250,44,171000000,10/10/2017,2017,...,41.0,355 Park Ave,11238.0,E205,74 Middagh Street,11201.0,84.0,301 Gold Street,11201.0,718-875-6811
2,20 NORTH 12 STREET,0,60400,0,0,2287,16,160000000,4/19/2017,2017,...,38.0,215 Heyward St,11206.0,,,,94.0,100 Messerole Avenue,11222.0,718-383-3879
3,55 PROSPECT STREET,2,20704,0,2,63,1,138106368,3/31/2017,2017,...,41.0,355 Park Ave,11238.0,,,,84.0,301 Gold Street,11201.0,718-875-6811
4,90 SANDS STREET,1,21175,0,1,87,9,135000000,8/29/2017,2017,...,41.0,355 Park Ave,11238.0,,,,84.0,301 Gold Street,11201.0,718-875-6811


In [5]:
# Print the column names, non-null counts and dtypes of the merged DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23955 entries, 0 to 23954
Data columns (total 49 columns):
address                     23955 non-null object
total_units                 23955 non-null int64
land_sqft                   23955 non-null int64
residential_units           23955 non-null int64
commercial_units            23955 non-null int64
block                       23955 non-null int64
lot                         23955 non-null int64
sale_price                  23955 non-null int64
sale_date                   23955 non-null object
year_of_sale                23955 non-null int64
apartment_number            6419 non-null object
gross_sqft                  23955 non-null int64
zip_code                    23955 non-null int64
building_class_category     23955 non-null object
building_class              23942 non-null object
OwnerType                   2829 non-null object
OwnerName                   17886 non-null object
tax_class                   23942 non-null object
XCo

**Transforming Dataset into Tables**<br>
Based on our normalization plan and final schema design, we will now create the separate tables from the original data that we loaded in the previous step. Once the data has been transformed to fit each table, we will load it into the PostgreSQL database and the respective tables.

### Owners Table

The owners table includes information on the names of the owners of the properties in our dataset, as well as their types.
To create this table, the following steps were taken:

In [7]:
# Build the owners table from the owner columns of the merged dataset.
#
# Only rows without an owner name are discarded: owner_name is NOT NULL in the
# schema, whereas owner_type is nullable. Calling dropna() across both columns
# would also throw away every owner whose type is simply unknown, and with it
# the matching rows of the owns table.
owners_df = (
    df[['OwnerName', 'OwnerType']]
    .dropna(subset=['OwnerName'])
    # one row per distinct owner name; the first occurrence supplies the type
    .drop_duplicates(subset='OwnerName', keep='first')
    # column names as defined in the database schema
    .rename(columns={'OwnerName': 'owner_name', 'OwnerType': 'owner_type'})
)

# surrogate key: sequential ids starting at 1
owners_df.insert(0, 'owner_id', range(1, 1 + len(owners_df)))

# append the rows to the existing owners table
owners_df.to_sql(name='owners', con=engine, if_exists='append', index=False)

print("Owners Table Loaded")

Owners Table Loaded


### Addresses Table

The address table is an aggregated table of all types of addresses in our original dataset. The original dataset includes
addresses on properties, school districts, police precincts and fire companies. <br>
Additionally, these addresses were geocoded using the Census Geocoder to get their latitude and longitude coordinates.<br>
The geocoded addresses were merged with the original addresses to form the Addresses table.
The following steps are taken to create this table:

In [9]:
# Every (address column, zipcode column) pair present in the merged dataset:
# properties, school districts, police precincts, fire companies - in that order.
address_sources = [
    ['address', 'zip_code'],
    ['school_district_address', 'school_district_zipcode'],
    ['police_precinct_address', 'police_precinct_zipcode'],
    ['fire_company_address', 'fire_company_zipcode'],
]

# For each source: de-duplicate, drop incomplete rows, then give the two
# columns the common names used by the addresses table.
frames = []
for source_cols in address_sources:
    part = df[source_cols].drop_duplicates().dropna()
    part.columns = ['address', 'zipcode']
    frames.append(part)

# Stack the four sources and number the rows 1..N. The ids are positional, and
# the geocoded file below is joined on them, so the stacking order matters.
address_df = pd.concat(frames)
address_df.insert(0, 'address_id', range(1, len(address_df) + 1))

# Attach the geocoder output (lat/lon coordinates) by address_id
add_geo = pd.read_csv('addresses_geocoded.csv')
address_df = address_df.merge(add_geo, how='left', on='address_id')

# Keep the schema columns; the geocoded address text becomes `address`
address_df = address_df[['address_id', 'geocoded_address', 'zipcode', 'lon', 'lat']]
address_df = address_df.rename(columns={'geocoded_address': 'address'})

# Append to the addresses table in the database
address_df.to_sql(name='addresses', con=engine, if_exists='append', index=False)

print('Address Table Loaded')

Address Table Loaded


### Neighborhood Table

The neighborhoods table includes all the different neighborhoods the properties in our dataset are situated in. It also includes borough information to consider any future potential scale-ups of this database.

In [10]:
# Distinct, complete (neighborhood, borough) pairs from the merged dataset
neighborhood_df = (
    df[['neighborhood', 'borough']]
    .drop_duplicates(keep='first')
    .dropna()
)

# Sequential surrogate key starting at 1
neighborhood_df.insert(0, 'neighborhood_id', range(1, len(neighborhood_df) + 1))

# Append to the neighborhoods table in the database
neighborhood_df.to_sql(name='neighborhoods', con=engine, if_exists='append', index=False)

print('Neighborhood Table Loaded')

Neighborhood Table Loaded


### Building Class Categories Table

The Building Classification is used to describe a property’s constructive use. The building class categories are used so that the user can easily identify similar properties by broad usage. The building class category combination table captures this relationship. The following steps were taken to create these tables: 


In [11]:
# Distinct, non-missing building class categories
building_class_category_df = (
    df[['building_class_category']]
    .drop_duplicates(keep='first')
    .dropna()
)

# Sequential surrogate key starting at 1
building_class_category_df.insert(
    0, 'building_class_category_id', range(1, len(building_class_category_df) + 1)
)

# Append to the building_class_categories table in the database
building_class_category_df.to_sql(
    name='building_class_categories', con=engine, if_exists='append', index=False
)

print('Building Class Category Table Loaded')

Building Class Category Table Loaded


### Building Classes Table

In [12]:
# Distinct, non-missing building class codes
building_class_df = (
    df[['building_class']]
    .drop_duplicates(keep='first')
    .dropna()
)

# Sequential surrogate key starting at 1
building_class_df.insert(0, 'building_class_id', range(1, len(building_class_df) + 1))

# Append to the building_classes table in the database
building_class_df.to_sql(name='building_classes', con=engine, if_exists='append', index=False)

print('Building Class Table Loaded')

Building Class Table Loaded


### Building Class Category Combinations Table

In [13]:
# Distinct, complete (building_class, building_class_category) pairs, with the
# surrogate id of each side looked up from the two tables built earlier
building_class_categories_combinations_df = (
    df[['building_class', 'building_class_category']]
    .drop_duplicates(keep='first')
    .dropna()
    .merge(building_class_df, how='left', on='building_class')
    .merge(building_class_category_df, how='left', on='building_class_category')
)

# Only the two id columns belong to the junction table
building_class_categories_combinations_df = building_class_categories_combinations_df[
    ['building_class_id', 'building_class_category_id']
]

# Append to the building_class_categories_combinations table in the database
building_class_categories_combinations_df.to_sql(
    name='building_class_categories_combinations',
    con=engine,
    if_exists='append',
    index=False,
)

print('Building_Class_Categories Combinations Table Loaded')

Building_Class_Categories Combinations Table Loaded


### Tax Class Table

Every property in the city is assigned to a tax class,
based on the use of the property. The steps to create the tax class table include:

In [14]:
# Distinct, non-missing tax classes
tax_class_df = (
    df[['tax_class']]
    .drop_duplicates(keep='first')
    .dropna()
)

# Sequential surrogate key starting at 1
tax_class_df.insert(0, 'tax_class_id', range(1, len(tax_class_df) + 1))

# Append to the tax_classes table in the database
tax_class_df.to_sql(name='tax_classes', con=engine, if_exists='append', index=False)

print('Tax Class Table Loaded')

Tax Class Table Loaded


### Community Districts Table

The original dataset only included the names of the community districts for each property. Additional information was added to the dataset to capture more qualitative and quantitative insights on community districts for future analysis. The steps to create this table include:

In [15]:
# Community-district attributes carried over from the merged dataset
cd_columns = [
    'community_district', 'population', 'area_sq_mi', 'male_perc', 'female_perc',
    'foreign_born_pop', 'majority_race_pop', 'access_to_parks',
    'commute_to_work_in_min', 'education_bachelor_level',
]

# One row per distinct district; rows with any missing attribute are discarded
community_district_df = df[cd_columns].drop_duplicates(keep='first').dropna()

# Sequential surrogate key starting at 1
community_district_df.insert(
    0, 'community_district_id', range(1, len(community_district_df) + 1)
)

# The district number and population are stored as integers in the schema
community_district_df = community_district_df.astype(
    {'community_district': np.int64, 'population': np.int64}
)

# Append to the community_districts table in the database
community_district_df.to_sql(
    name='community_districts', con=engine, if_exists='append', index=False
)

print('Community District Table Loaded')

Community District Table Loaded


### Council Districts Table

This table describes the council district in which the property is located. The steps to create this table include:

In [16]:
# Distinct, non-missing council districts, renamed to the schema column name
council_districts_df = (
    df[['Council']]
    .drop_duplicates(keep='first')
    .dropna()
    .rename(columns={'Council': 'city_council_district'})
)

# Sequential surrogate key starting at 1
council_districts_df.insert(
    0, 'city_council_district_id', range(1, len(council_districts_df) + 1)
)

# The district number is stored as an integer in the schema
council_districts_df = council_districts_df.astype({'city_council_district': np.int64})

# Append to the council_districts table in the database
council_districts_df.to_sql(
    name='council_districts', con=engine, if_exists='append', index=False
)

print('Council Districts Table Loaded')

Council Districts Table Loaded


### School Districts Table

This table describes the school district to which the property belongs. The steps to create this table include:

In [17]:
# Distinct, complete school-district rows; the district number is cast to an
# integer, then the address text is matched against address_df to pick up its id
school_districts_df = (
    df[['school_district', 'students_enrolled', 'nr_schools',
        'school_district_address', 'school_district_zipcode']]
    .drop_duplicates(keep='first')
    .dropna()
    .astype({'school_district': np.int64})
    .merge(address_df, how='left',
           left_on='school_district_address', right_on='address')
)

# Sequential surrogate key starting at 1 (assigned after the join)
school_districts_df.insert(
    0, 'school_district_id', range(1, len(school_districts_df) + 1)
)

# Keep the schema columns and give the address id its schema name
school_districts_df = school_districts_df[
    ['school_district_id', 'school_district', 'students_enrolled',
     'nr_schools', 'address_id']
].rename(columns={'address_id': 'school_dist_address_id'})

# Append to the school_districts table in the database
school_districts_df.to_sql(
    name='school_districts', con=engine, if_exists='append', index=False
)

print('School Districts Table Loaded')

School Districts Table Loaded


### Fire Companies Table

This table describes the fire companies near the property. The original dataset only included the name of the fire company, but additional information was added to make this table useful for further analysis. The steps to create this table include:

In [18]:
# Distinct, complete (fire company, address) rows, with the address text
# matched against address_df to pick up its id
fire_companies_df = (
    df[['fire_company', 'fire_company_address']]
    .drop_duplicates(keep='first')
    .dropna()
    .merge(address_df, how='left',
           left_on='fire_company_address', right_on='address')
)

# Sequential surrogate key starting at 1 (assigned after the join)
fire_companies_df.insert(0, 'fire_company_id', range(1, len(fire_companies_df) + 1))

# Keep the schema columns and give the address id its schema name
fire_companies_df = fire_companies_df[
    ['fire_company_id', 'fire_company', 'address_id']
].rename(columns={'address_id': 'fire_comp_address_id'})

# Append to the fire_companies table in the database
fire_companies_df.to_sql(name='fire_companies', con=engine, if_exists='append', index=False)

print('Fire Companies Table Loaded')

Fire Companies Table Loaded


### Police Precincts Table

This table describes the police stations/precincts near the property. The original dataset only included the name of the police precinct, but additional information was added to make this table useful for further analysis.
The steps to create this table include:

In [20]:
# Distinct, complete precinct rows, with the address text matched against
# address_df to pick up its id
police_precincts_df = (
    df[['police_precinct', 'police_precinct_address', 'phone_number']]
    .drop_duplicates(keep='first')
    .dropna()
    .merge(address_df, how='left',
           left_on='police_precinct_address', right_on='address')
)

# Sequential surrogate key starting at 1 (assigned after the join)
police_precincts_df.insert(
    0, 'police_precinct_id', range(1, len(police_precincts_df) + 1)
)

# Keep the schema columns, give the address id its schema name, and store the
# precinct number as an integer
police_precincts_df = (
    police_precincts_df[
        ['police_precinct_id', 'police_precinct', 'address_id', 'phone_number']
    ]
    .rename(columns={'address_id': 'police_prct_address_id'})
    .astype({'police_precinct': np.int64})
)

# Append to the police_precincts table in the database
police_precincts_df.to_sql(
    name='police_precincts', con=engine, if_exists='append', index=False
)

print('Police Precincts Table Loaded')

Police Precincts Table Loaded


### Properties Table

This table describes all the features of each property. The following steps were taken to create this table:

In [22]:
# Remove rows that are empty in every column, then exact duplicate rows
df = df.dropna(how='all').drop_duplicates(keep='first')

# Columns of the merged dataset needed for the properties, sales and owns tables
df2 = df[[
    'address', 'neighborhood', 'building_class_category',
    'building_class', 'tax_class', 'CD', 'Council',
    'SchoolDist', 'FireComp', 'PolicePrct', 'land_sqft', 'total_units',
    'residential_units', 'commercial_units', 'block', 'lot',
    'sale_price', 'sale_date', 'year_of_sale',
    'apartment_number', 'gross_sqft', 'OwnerName',
]]

# Lookup tables that supply the foreign keys, as
# (lookup frame, key column in the working frame, key column in the lookup).
# They are left-joined one at a time, in this order.
fk_lookups = [
    (neighborhood_df, 'neighborhood', 'neighborhood'),                              # neighborhood_id
    (address_df, 'address', 'address'),                                             # address_id
    (building_class_category_df, 'building_class_category', 'building_class_category'),  # building_class_category_id
    (building_class_df, 'building_class', 'building_class'),                        # building_class_id
    (tax_class_df, 'tax_class', 'tax_class'),                                       # tax_class_id
    (community_district_df, 'CD', 'community_district'),                            # community_district_id
    (council_districts_df, 'Council', 'city_council_district'),                     # city_council_district_id
    (school_districts_df, 'SchoolDist', 'school_district'),                         # school_district_id
    (fire_companies_df, 'FireComp', 'fire_company'),                                # fire_company_id
    (police_precincts_df, 'PolicePrct', 'police_precinct'),                         # police_precinct_id
]

# NOTE(review): a lookup whose key value repeats would multiply rows here -
# confirm the keys are unique in each lookup table.
df12 = df2
for lookup_df, left_key, right_key in fk_lookups:
    df12 = df12.merge(lookup_df, how='left', left_on=left_key, right_on=right_key)

# One sequential property_id per row of the joined frame, starting at 1
df12.insert(0, 'property_id', range(1, len(df12) + 1))

# Columns that make up the properties table
property_df = df12[[
    'property_id', 'address_id', 'neighborhood_id', 'building_class_category_id',
    'building_class_id', 'tax_class_id', 'community_district_id', 'city_council_district_id',
    'school_district_id', 'fire_company_id', 'police_precinct_id', 'land_sqft', 'total_units',
    'residential_units', 'commercial_units', 'block', 'lot',
]]

In [23]:
# Preview the first five rows of the properties frame
property_df.head(n=5)

Unnamed: 0,property_id,address_id,neighborhood_id,building_class_category_id,building_class_id,tax_class_id,community_district_id,city_council_district_id,school_district_id,fire_company_id,police_precinct_id,land_sqft,total_units,residential_units,commercial_units,block,lot
0,1,1.0,1,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,20267,1,0,1,230,1
1,2,2.0,1,2,2.0,1.0,1.0,1.0,1.0,1.0,1.0,12500,102,0,102,250,44
2,3,3.0,2,3,3.0,1.0,2.0,1.0,2.0,,2.0,60400,0,0,0,2287,16
3,4,4.0,3,2,4.0,1.0,1.0,1.0,1.0,,1.0,20704,2,0,2,63,1
4,5,5.0,3,1,1.0,1.0,1.0,1.0,1.0,,1.0,21175,1,0,1,87,9


In [25]:
# Use the unit-count column names defined in the schema
property_df = property_df.rename(
    columns={'residential_units': 'res_units', 'commercial_units': 'comm_units'}
)

# Append to the properties table in the database
property_df.to_sql(name='properties', con=engine, if_exists='append', index=False)

print('Properties Table Loaded')

Properties Table Loaded


### Property Sales Table

This table describes every property trade/sale. The following steps were taken to create this table:

In [26]:
# Sale attributes taken from the frame that produced the properties table,
# joined to tax_class_df to obtain the tax_class_id of every sale
sales_df = (
    df12[['property_id', 'sale_price', 'sale_date', 'year_of_sale', 'tax_class',
          'apartment_number', 'gross_sqft']]
    .merge(tax_class_df, how='left', on='tax_class')
)

# Keep the schema columns (the tax class text is replaced by its id)
sales_df = sales_df[['property_id', 'sale_price', 'sale_date', 'year_of_sale',
                     'tax_class_id', 'apartment_number', 'gross_sqft']]

# Sequential surrogate key starting at 1
sales_df.insert(0, 'sale_id', range(1, len(sales_df) + 1))

# Append to the property_sales table in the database
sales_df.to_sql(name='property_sales', con=engine, if_exists='append', index=False)

print('Property Sales Table Loaded')

Property Sales Table Loaded


### Owns Table

This table describes the ownership relation between owners and properties. The steps to create this table include:

In [27]:
# Pair every property_id with the owner_id whose name matches OwnerName
owner_matches = df12[['OwnerName', 'property_id']].merge(
    owners_df, how='left', left_on='OwnerName', right_on='owner_name'
)

# Keep only the two key columns and drop properties without a matching owner
owns_df = owner_matches[['owner_id', 'property_id']].dropna()

# Append to the owns table in the database
owns_df.to_sql(name='owns', con=engine, if_exists='append', index=False)

print('Owns Table Loaded')

Owns Table Loaded
