In [None]:
import getpass  # To get the password without showing the input
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import pymysql
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load key/value pairs from a local .env file into the process environment.
load_dotenv()

# Project-local helper module; f.prepare_for_seeding is used by the seeding cells below.
import functions as f

# Access the environment variables from the .env file
# (os.environ.get returns None when DB_PASS is not defined).
password = os.environ.get('DB_PASS')


# Note that when you use _SQLAlchemy_ to establish the connection, you do not even need to be logged in to Sequel Pro or MySQL Workbench.

## Connect to DB

In [81]:
# password = getpass.getpass()  # interactive alternative to the DB_PASS variable

# os.environ.get returns None when DB_PASS is unset; fail with a clear message
# instead of an opaque TypeError from the string concatenation below.
if password is None:
    raise RuntimeError(
        "No database password available: define DB_PASS in the .env file "
        "or enable the getpass.getpass() line above."
    )

bd = "building_permits"
# Percent-encode the password so characters such as '@', '/' or ':' are not
# misread as URL delimiters when SQLAlchemy parses the connection string.
connection_string = 'mysql+pymysql://root:' + quote_plus(password) + '@localhost/' + bd
engine = create_engine(connection_string)
engine

Engine(mysql+pymysql://root:***@localhost/building_permits)

## Load Dataframe

In [82]:
# Read the cleaned addition/alteration permits export and preview its first row.
permits_csv_path = "../data/clean/building_permits_addition_alteration_clean.csv"
df = pd.read_csv(permits_csv_path)
display(df.head(n=1))

Unnamed: 0,record_number,address,latitude,longitude,status,issue_date,number_of_units,current_property_use,building_cost,electrical_cost,...,planning_board_special_permit,bicycle_parking_change,issue_year,issue_month,season,calc_total_cost,total_cost_bins,original_firm_name,standardized_firm_name,keywords
0,0,"304 Vassar St, Cambridge, MA 02139",42.354803,-71.104827,Active,2020-01-07,0.0,Commercial/Mixed,310000.0,85000.0,...,False,False,2020,1,Winter,424600.0,high,Other,Other,"['repair build finishes out', 'complete interi..."


## Seed Data

### building_construction_types

In [83]:
building_construction_types_unique = df["building_construction_type"].unique()
print(building_construction_types_unique)

# Encode each construction type as an integer id. pd.factorize numbers values
# in order of first appearance, i.e. the same ids as each value's position in
# .unique(), but in one vectorised pass instead of rebuilding a list and
# scanning it for every row. Missing values, if any, are coded as -1.
df["building_construction_type_id"] = pd.factorize(df["building_construction_type"])[0]

['Steel/Concrete, Non-Combustible (Type I/II)' 'Wood (Type (IV/V)'
 'Masonry / Wood (Type III)']


In [84]:
# Lookup table for construction types: one row per distinct (id, type) pair,
# with the columns named as they should appear in the database.
building_construction_types_df = (
    pd.DataFrame(df[["building_construction_type_id", "building_construction_type"]])
    .drop_duplicates()
    .rename(columns={"building_construction_type_id": "id", "building_construction_type": "type"})
)
building_construction_types_df

Unnamed: 0,id,type
0,0,"Steel/Concrete, Non-Combustible (Type I/II)"
3,1,Wood (Type (IV/V)
4,2,Masonry / Wood (Type III)


In [85]:
# Seed the construction-type lookup table; if_exists="replace" drops any
# existing table of that name before writing.
with engine.connect() as db_conn:
    building_construction_types_df.to_sql(
        name="building_construction_type",
        con=db_conn,
        if_exists="replace",
        index=False,
    )

In [86]:
# Replace the text label with its integer id; pop() removes the helper column
# from df while handing its values over.
df["building_construction_type"] = df.pop("building_construction_type_id")

### Function to streamline preparing df for seeding

In [87]:
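# NOTE: kept for reference only. The cells below call f.prepare_for_seeding from
# functions.py; presumably this is an earlier draft of that helper (verify against functions.py).
# def prepare_for_seeding(df, column, db_column):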
#     uniques = df[column].unique()
#     print(f"Unique values for {column}:", uniques)

#     def assign_index(row):
#         return list(uniques).index(row[column])

#     df[f"{column}_id"] = df.apply(assign_index, axis=1)
#     display(f"{column}_id", df[[f"{column}_id", column]])
#     prepared_df = pd.DataFrame(df[[f"{column}_id", f"{column}"]])
#     prepared_df.drop_duplicates(inplace=True)
#     prepared_df.rename(columns={f"{column}_id": 'id', column: db_column}, inplace=True)
        
#     return df, prepared_df

### building_use

In [88]:
# Encode building_use through the shared helper. It returns the updated df
# (now carrying a building_use_id column, consumed further down) and the lookup
# frame that is seeded into the database. The third argument is presumably the
# lookup table's label column name — confirm against functions.py.
df, building_use_df = f.prepare_for_seeding(df, "building_use", "use")

Unique values for building_use: ['Commercial/Mixed' 'Multi-Family' 'One/Two-Family' 'Townhouse']
building_use_id        building_use_id      building_use
0                    0  Commercial/Mixed
1                    0  Commercial/Mixed
2                    0  Commercial/Mixed
3                    1      Multi-Family
4                    1      Multi-Family
...                ...               ...
11455                0  Commercial/Mixed
11456                0  Commercial/Mixed
11457                0  Commercial/Mixed
11458                2    One/Two-Family
11459                1      Multi-Family

[11460 rows x 2 columns]


In [89]:
# Seed the building_use lookup table, replacing any existing table.
with engine.connect() as db_conn:
    building_use_df.to_sql(
        name="building_use",
        con=db_conn,
        if_exists="replace",
        index=False,
    )

#### also for current_property_use

In [90]:
# current_property_use is encoded against the unique building_use labels so
# both columns share one id space (presumably matching the ids that
# f.prepare_for_seeding assigned to building_use — verify against functions.py).
building_use_unique = df["building_use"].unique()
print(building_use_unique)

# Build the label -> id lookup once instead of converting the array to a list
# and scanning it for every row.
building_use_ids = {label: idx for idx, label in enumerate(building_use_unique)}
current_property_use_ids = df["current_property_use"].map(building_use_ids)

# Still fail loudly (ValueError, as list.index did) when a property use has no
# counterpart in building_use, but report every offending label at once.
unmatched_uses = df.loc[current_property_use_ids.isna(), "current_property_use"].unique()
if len(unmatched_uses) > 0:
    raise ValueError(
        f"current_property_use values missing from building_use: {list(unmatched_uses)}"
    )

df["current_property_use_id"] = current_property_use_ids.astype("int64")


['Commercial/Mixed' 'Multi-Family' 'One/Two-Family' 'Townhouse']


In [91]:
# Swap the building_use label for its integer id and discard the helper column.
df["building_use"] = df.pop("building_use_id")

In [92]:
# Swap the current_property_use label for its integer id and discard the helper column.
df["current_property_use"] = df.pop("current_property_use_id")

### season

In [93]:
# Encode season through the shared helper: df gains a season_id column
# (consumed further down) and season_df is the lookup frame seeded into the database.
df, season_df = f.prepare_for_seeding(df, "season", "name")

Unique values for season: ['Winter' 'Summer' 'Autumn' 'Spring']
season_id        season_id  season
0              0  Winter
1              0  Winter
2              0  Winter
3              1  Summer
4              1  Summer
...          ...     ...
11455          2  Autumn
11456          0  Winter
11457          0  Winter
11458          0  Winter
11459          2  Autumn

[11460 rows x 2 columns]


In [94]:
# Seed the season lookup table, replacing any existing table.
with engine.connect() as db_conn:
    season_df.to_sql(
        name="season",
        con=db_conn,
        if_exists="replace",
        index=False,
    )

In [95]:
# Swap the season label for its integer id and discard the helper column.
df["season"] = df.pop("season_id")

### firm

In [96]:
# Encode the standardized firm names, then seed the resulting lookup frame
# as the "firm" table (replacing any existing table).
df, firm_df = f.prepare_for_seeding(df, "standardized_firm_name", "standardized_firm_name")

with engine.connect() as db_conn:
    firm_df.to_sql(
        name="firm",
        con=db_conn,
        if_exists="replace",
        index=False,
    )

Unique values for standardized_firm_name: ['Other' 'TC Building Incorporated' 'albert m stuart' ...
 'Colonetti Exterior Construction' 'Longden Company Inc'
 'McCourt Construction Company Inc.']
standardized_firm_name_id        standardized_firm_name_id                    standardized_firm_name
0                              0                                     Other
1                              1                  TC Building Incorporated
2                              2                           albert m stuart
3                              3            Steve E. Valenti Builders Inc.
4                              4                 Jason Du Construction Co.
...                          ...                                       ...
11455                        222  Architectural Building & Restoration LLC
11456                       2153         McCourt Construction Company Inc.
11457                        789                      BNA CONTRACTING INC.
11458                        

In [97]:
# Inspect the current column set before replacing/dropping the firm-name columns.
df.columns

Index(['record_number', 'address', 'latitude', 'longitude', 'status',
       'issue_date', 'number_of_units', 'current_property_use',
       'building_cost', 'electrical_cost', 'plumbing_cost', 'gas_cost',
       'hvac_cost', 'fire_prevention_cost', 'description', 'isd_description',
       'size_of_new_addition', 'change_in_floor_area_or_dimensions',
       'change_in_exterior', 'discharge_to_sewer_or_storm_water_system',
       'new_or_replaced_storm_sewer', 'construction_dewatering',
       'public_right-of-way', 'basement_plumbing_fixture',
       'change_in_at_least_half_of_total_area', 'firm_name', 'debris_disposal',
       'description_of_demolition', 'method_of_removal', 'type_of_demolition',
       'building_use', 'condo_association', 'building_construction_type',
       'bza_case', 'planning_board_special_permit', 'bicycle_parking_change',
       'issue_year', 'issue_month', 'season', 'calc_total_cost',
       'total_cost_bins', 'original_firm_name', 'standardized_firm_name',


In [98]:
# Keep only the firm id: move it into standardized_firm_name, then remove the
# remaining free-text firm columns.
df["standardized_firm_name"] = df.pop("standardized_firm_name_id")
df.drop(columns=["firm_name", "original_firm_name"], inplace=True)

### total_cost_bins

In [99]:
# Encode the cost-bin labels, then seed the lookup frame as "total_cost_bins"
# (replacing any existing table).
df, total_cost_bins_df = f.prepare_for_seeding(df, "total_cost_bins", "bin_name")

with engine.connect() as db_conn:
    total_cost_bins_df.to_sql(
        name="total_cost_bins",
        con=db_conn,
        if_exists="replace",
        index=False,
    )

Unique values for total_cost_bins: ['high' 'low' 'medium' 'very high']
total_cost_bins_id        total_cost_bins_id total_cost_bins
0                       0            high
1                       1             low
2                       2          medium
3                       1             low
4                       2          medium
...                   ...             ...
11455                   0            high
11456                   3       very high
11457                   1             low
11458                   2          medium
11459                   2          medium

[11460 rows x 2 columns]


In [100]:
# Swap the cost-bin label for its integer id and discard the helper column.
df["total_cost_bins"] = df.pop("total_cost_bins_id")

### costs

In [101]:
# Per-record cost breakdown, keyed by record_number, stored in its own table.
cost_columns = [
    "building_cost",
    "electrical_cost",
    "plumbing_cost",
    "gas_cost",
    "hvac_cost",
    "fire_prevention_cost",
    "calc_total_cost",
]
df_costs = df[["record_number", *cost_columns]]

with engine.connect() as db_conn:
    df_costs.to_sql(
        name="costs",
        con=db_conn,
        if_exists="replace",
        index=False,
    )

df_costs

Unnamed: 0,record_number,building_cost,electrical_cost,plumbing_cost,gas_cost,hvac_cost,fire_prevention_cost,calc_total_cost
0,0,310000.0,85000.0,0.0,0.0,29600.0,0.0,424600.0
1,1,10000.0,2000.0,1000.0,0.0,0.0,3500.0,16500.0
2,2,34800.0,0.0,0.0,0.0,0.0,0.0,34800.0
3,3,4000.0,0.0,0.0,0.0,0.0,0.0,4000.0
4,4,55000.0,0.0,0.0,0.0,0.0,0.0,55000.0
...,...,...,...,...,...,...,...,...
11455,11873,101000.0,300000.0,160000.0,0.0,25000.0,0.0,586000.0
11456,11874,3186000.0,214000.0,0.0,0.0,0.0,0.0,3400000.0
11457,11875,9350.0,0.0,0.0,0.0,0.0,850.0,10200.0
11458,11876,24551.0,1800.0,6550.0,0.0,0.0,0.0,32901.0


In [102]:
# The cost figures were written to the "costs" table above, so remove them
# from the main frame (in place).
seeded_cost_columns = [
    "building_cost",
    "electrical_cost",
    "plumbing_cost",
    "gas_cost",
    "hvac_cost",
    "fire_prevention_cost",
    "calc_total_cost",
]
df.drop(columns=seeded_cost_columns, inplace=True)

df

Unnamed: 0,record_number,address,latitude,longitude,status,issue_date,number_of_units,current_property_use,description,isd_description,...,building_construction_type,bza_case,planning_board_special_permit,bicycle_parking_change,issue_year,issue_month,season,total_cost_bins,standardized_firm_name,keywords
0,0,"304 Vassar St, Cambridge, MA 02139",42.354803,-71.104827,Active,2020-01-07,0.0,0,Complete interior and limited exterior renovat...,Interior Renovation - partial first and second...,...,0,False,False,False,2020,1,0,0,0,"['repair build finishes out', 'complete interi..."
1,1,"174 Alewife Brook Pkwy, Cambridge, MA 02138",42.390588,-71.140620,Active,2019-12-23,0.0,0,demo of interior walls / carpet / tile and ACT,Demolition of interior walls and finishes,...,0,False,False,False,2019,12,0,1,1,['interior wall']
2,2,"1493 Cambridge St, Cambridge, MA 02139",42.374425,-71.104437,Active,2020-01-10,0.0,0,Replacement or 7-fire door due to damaged door...,Replace (7) fire rated doors with like-kind ra...,...,0,False,False,False,2020,1,0,2,2,"['fire door', 'damaged door', 'fire door']"
3,3,"151 Magazine St, Cambridge, MA 02139",42.358092,-71.112429,Complete,2019-08-12,3.0,1,Remove and replace front porch stair and stair...,Reconstruct front steps and rail,...,1,False,False,False,2019,8,1,1,3,"['replace front porch rail stair', 'porch stai..."
4,4,"292 Columbia St, Unit 2, Cambridge, MA 02141",42.370272,-71.095866,Active,2019-08-29,3.0,1,"Kitchen renovation, wall finishing and paint...",Kitchen renovation,...,2,False,False,False,2019,8,1,2,4,"['kitchen renovation', 'wall finishing', 'kitc..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11455,11873,"105 Broadway, Cambridge, MA 02142",42.364383,-71.087858,Active,2019-11-20,0.0,0,Replacement of existing penthouse emergency ge...,Replacement of existing emergency generator on...,...,0,False,False,False,2019,11,2,0,222,"['penthouse emergency', 'emergency generator']"
11456,11874,"377 Putnam Ave, Cambridge, Ma 02139",42.368296,-71.101828,Active,2019-12-26,0.0,0,Installation of concrete foundations for new t...,Foundations for new transformers; fencing grea...,...,0,True,True,False,2019,12,0,3,2153,"['concrete foundation', 'new transformer', 'or..."
11457,11875,"9 Brookline St, Cambridge, MA 02139",42.363788,-71.101715,Active,2020-01-13,0.0,0,install a metal stud wall wityh 5/8 fire code ...,Install a metal stud wall with 5/8 fire code f...,...,2,False,False,False,2020,1,0,1,789,"['install metal stud wall', 'replace kitchen f..."
11458,11876,"99 Magazine St, Unit 1, Cambridge, MA 02139",42.360474,-71.109815,Complete,2019-12-10,2.0,2,Renovate existing bathroom. New plumbing fixtu...,Not Specified,...,1,False,False,False,2019,12,0,2,228,"['exist bathroom', 'new plumbing fixture', 'lo..."


### property

In [103]:
# df_property = df[["record_number", "address", "latitude", "longitude"]]

# df_property.duplicated(subset="address").value_counts()

In [104]:
property_unique = df["address"].unique()
print("Unique properties:", df["address"].nunique())

# Give every distinct address an integer id. pd.factorize numbers addresses in
# order of first appearance (the same ids as their position in .unique()) in a
# single vectorised pass, instead of rebuilding and scanning a list of all
# unique addresses for every row. Missing addresses, if any, are coded as -1.
df["property"] = pd.factorize(df["address"])[0]

property_df = (
    df[["property", "address", "latitude", "longitude"]]
    .rename(columns={"property": "id"})
    # Deduplicate on the id alone (keeping the first occurrence) so the lookup
    # table has exactly one row per property even if the same address was
    # recorded with slightly different coordinates.
    .drop_duplicates(subset="id")
)
property_df

Unique properties: 7619


Unnamed: 0,id,address,latitude,longitude
0,0,"304 Vassar St, Cambridge, MA 02139",42.354803,-71.104827
1,1,"174 Alewife Brook Pkwy, Cambridge, MA 02138",42.390588,-71.140620
2,2,"1493 Cambridge St, Cambridge, MA 02139",42.374425,-71.104437
3,3,"151 Magazine St, Cambridge, MA 02139",42.358092,-71.112429
4,4,"292 Columbia St, Unit 2, Cambridge, MA 02141",42.370272,-71.095866
...,...,...,...,...
11451,7614,"60 Webster Ave, Unit 3, Cambridge, MA 02141",42.368196,-71.092925
11456,7615,"377 Putnam Ave, Cambridge, Ma 02139",42.368296,-71.101828
11457,7616,"9 Brookline St, Cambridge, MA 02139",42.363788,-71.101715
11458,7617,"99 Magazine St, Unit 1, Cambridge, MA 02139",42.360474,-71.109815


In [105]:
# Seed the property lookup table, replacing any existing table.
with engine.connect() as db_conn:
    property_df.to_sql(
        name="property",
        con=db_conn,
        if_exists="replace",
        index=False,
    )

In [106]:
# Keep a record_number -> location snapshot, then remove the location columns
# from the main frame (they are represented by the "property" id now).
location_columns = ["address", "latitude", "longitude"]
df_property = df[["record_number", *location_columns]]

df.drop(columns=location_columns, inplace=True)

df

Unnamed: 0,record_number,status,issue_date,number_of_units,current_property_use,description,isd_description,size_of_new_addition,change_in_floor_area_or_dimensions,change_in_exterior,...,bza_case,planning_board_special_permit,bicycle_parking_change,issue_year,issue_month,season,total_cost_bins,standardized_firm_name,keywords,property
0,0,Active,2020-01-07,0.0,0,Complete interior and limited exterior renovat...,Interior Renovation - partial first and second...,0.0,False,False,...,False,False,False,2020,1,0,0,0,"['repair build finishes out', 'complete interi...",0
1,1,Active,2019-12-23,0.0,0,demo of interior walls / carpet / tile and ACT,Demolition of interior walls and finishes,0.0,False,False,...,False,False,False,2019,12,0,1,1,['interior wall'],1
2,2,Active,2020-01-10,0.0,0,Replacement or 7-fire door due to damaged door...,Replace (7) fire rated doors with like-kind ra...,0.0,False,False,...,False,False,False,2020,1,0,2,2,"['fire door', 'damaged door', 'fire door']",2
3,3,Complete,2019-08-12,3.0,1,Remove and replace front porch stair and stair...,Reconstruct front steps and rail,0.0,False,False,...,False,False,False,2019,8,1,1,3,"['replace front porch rail stair', 'porch stai...",3
4,4,Active,2019-08-29,3.0,1,"Kitchen renovation, wall finishing and paint...",Kitchen renovation,0.0,False,False,...,False,False,False,2019,8,1,2,4,"['kitchen renovation', 'wall finishing', 'kitc...",4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11455,11873,Active,2019-11-20,0.0,0,Replacement of existing penthouse emergency ge...,Replacement of existing emergency generator on...,0.0,False,False,...,False,False,False,2019,11,2,0,222,"['penthouse emergency', 'emergency generator']",1861
11456,11874,Active,2019-12-26,0.0,0,Installation of concrete foundations for new t...,Foundations for new transformers; fencing grea...,2770.0,False,False,...,True,True,False,2019,12,0,3,2153,"['concrete foundation', 'new transformer', 'or...",7615
11457,11875,Active,2020-01-13,0.0,0,install a metal stud wall wityh 5/8 fire code ...,Install a metal stud wall with 5/8 fire code f...,0.0,False,False,...,False,False,False,2020,1,0,1,789,"['install metal stud wall', 'replace kitchen f...",7616
11458,11876,Complete,2019-12-10,2.0,2,Renovate existing bathroom. New plumbing fixtu...,Not Specified,0.0,False,False,...,False,False,False,2019,12,0,2,228,"['exist bathroom', 'new plumbing fixture', 'lo...",7617


### status

In [107]:
# Encode the permit status, then seed the lookup frame as the "status" table
# (replacing any existing table).
df, df_status = f.prepare_for_seeding(df, "status", "value")

with engine.connect() as db_conn:
    df_status.to_sql(
        name="status",
        con=db_conn,
        if_exists="replace",
        index=False,
    )

Unique values for status: ['Active' 'Complete']
status_id        status_id    status
0              0    Active
1              0    Active
2              0    Active
3              1  Complete
4              0    Active
...          ...       ...
11455          0    Active
11456          0    Active
11457          0    Active
11458          1  Complete
11459          1  Complete

[11460 rows x 2 columns]


In [108]:
# Swap the status label for its integer id and discard the helper column.
df["status"] = df.pop("status_id")

### keywords

In [109]:
# Per-record keyword occurrences (record_number, keyword, frequency).
keyword_data_path = "../data/clean/keyword_data.csv"
df_keyword_data = pd.read_csv(keyword_data_path)
df_keyword_data

Unnamed: 0,record_number,keyword,frequency
0,0,repair build finishes out,1
1,0,complete interior,3
2,0,limited exterior renovation,1
3,0,story masonry,1
4,0,masonry building,3
...,...,...,...
70707,11877,1st floor,343
70708,11877,1st floor,343
70709,11877,porch,76
70710,11877,exist dimension,3


In [110]:
# Distinct keywords with their overall frequency.
unique_keywords_path = "../data/clean/unique_keywords.csv"
df_unique_keywords = pd.read_csv(unique_keywords_path)
df_unique_keywords

Unnamed: 0,keyword,frequency
0,air sealing,782
1,insulation work,599
2,structural change,561
3,site id,477
4,2nd floor,378
...,...,...
25962,replace reinstall rails,1
25963,install interior trim doors,1
25964,install access wells windows,1
25965,install heat sump pump,1


In [111]:
# Turn the row index into an explicit keyword_id column: naming the index first
# means reset_index emits the column under that name directly.
df_unique_keywords.index.name = "keyword_id"
df_unique_keywords.reset_index(inplace=True)

In [112]:

# Attach each keyword's id from the unique-keyword lookup.
# validate="many_to_one" makes pandas raise MergeError if a keyword occurs more
# than once in df_unique_keywords, which would otherwise silently duplicate
# record/keyword rows in the join.
df_keyword_merged = df_keyword_data.merge(
    df_unique_keywords[["keyword_id", "keyword"]],
    on="keyword",
    validate="many_to_one",
)
df_keyword_merged

Unnamed: 0,record_number,keyword,frequency,keyword_id
0,0,repair build finishes out,1,8881
1,0,complete interior,3,3668
2,0,limited exterior renovation,1,19423
3,0,story masonry,1,19422
4,0,masonry building,3,3179
...,...,...,...,...
70707,11877,1st floor,343,8
70708,11877,1st floor,343,8
70709,11877,porch,76,74
70710,11877,exist dimension,3,3352


In [113]:
# Link table between records and keywords.
record_keyword_columns = ["record_number", "keyword_id", "frequency"]
df_keyword_sql = df_keyword_merged[record_keyword_columns]

# NOTE(review): this is the only table written with the DataFrame index
# (to_sql's default, spelled out here); every other seed passes index=False.
# Confirm the extra index column is wanted.
with engine.connect() as db_conn:
    df_keyword_sql.to_sql(
        name="record_keyword",
        con=db_conn,
        if_exists="replace",
        index=True,
    )

In [114]:
# The lookup table exposes the key as plain "id"; relabel that column in place
# before seeding the "keyword" table (replacing any existing table).
df_unique_keywords.columns = [
    "id" if column_name == "keyword_id" else column_name
    for column_name in df_unique_keywords.columns
]

with engine.connect() as db_conn:
    df_unique_keywords.to_sql(
        name="keyword",
        con=db_conn,
        if_exists="replace",
        index=False,
    )

### Main Dataframe (record)

In [115]:
# df_merged = df.merge(df_keyword_sql[["record_number", "keyword_id"]], on="record_number")

In [116]:
# Review the remaining columns before writing the main "record" table.
df.columns

Index(['record_number', 'status', 'issue_date', 'number_of_units',
       'current_property_use', 'description', 'isd_description',
       'size_of_new_addition', 'change_in_floor_area_or_dimensions',
       'change_in_exterior', 'discharge_to_sewer_or_storm_water_system',
       'new_or_replaced_storm_sewer', 'construction_dewatering',
       'public_right-of-way', 'basement_plumbing_fixture',
       'change_in_at_least_half_of_total_area', 'debris_disposal',
       'description_of_demolition', 'method_of_removal', 'type_of_demolition',
       'building_use', 'condo_association', 'building_construction_type',
       'bza_case', 'planning_board_special_permit', 'bicycle_parking_change',
       'issue_year', 'issue_month', 'season', 'total_cost_bins',
       'standardized_firm_name', 'keywords', 'property'],
      dtype='object')

In [117]:
# The raw keyword list is superseded by the record_keyword table; remove it in place.
del df["keywords"]

In [118]:
# Write the fully encoded main frame as the "record" table, replacing any existing table.
with engine.connect() as db_conn:
    df.to_sql(
        name="record",
        con=db_conn,
        if_exists="replace",
        index=False,
    )