In [12]:
import getpass
import os
import warnings

import kagglehub
import mysql.connector as mycon
import pandas as pd
import pymysql
import sqlalchemy
from kagglehub import KaggleDatasetAdapter
from sqlalchemy import create_engine
warnings.filterwarnings("ignore")

# This pulls in the wildfire data and reads it

In [9]:
# Only these columns are requested from the CSV (passed to pandas as `usecols`).
use_cols = [
    'OBJECTID',
    'FIRE_NAME',
    'LATITUDE',
    'LONGITUDE',
    'DISCOVERY_DATE',
    'NWCG_CAUSE_CLASSIFICATION',
    'NWCG_GENERAL_CAUSE',
    'FIRE_SIZE',
    'FIRE_SIZE_CLASS',
    'OWNER_DESCR',
    'STATE',
    'FIPS_NAME',
]

# Load data.csv from the Kaggle dataset into a pandas DataFrame.
# The pandas adapter forwards `pandas_kwargs` to the underlying reader; no
# compression option is passed.
wildfire_df = kagglehub.load_dataset(
    adapter=KaggleDatasetAdapter.PANDAS,
    handle="behroozsohrabi/us-wildfire-records-6th-edition",
    path="data.csv",
    pandas_kwargs=dict(usecols=use_cols),
)

# Quick look at the first few rows to confirm the load worked.
print(wildfire_df.head())


  result = read_function(


   OBJECTID FIRE_NAME DISCOVERY_DATE NWCG_CAUSE_CLASSIFICATION  \
0         1  FOUNTAIN       2/2/2005                     Human   
1         2    PIGEON      5/12/2004                   Natural   
2         3     SLACK      5/31/2004                     Human   
3         4      DEER      6/28/2004                   Natural   
4         5  STEVENOT      6/28/2004                   Natural   

                           NWCG_GENERAL_CAUSE  FIRE_SIZE FIRE_SIZE_CLASS  \
0  Power generation/transmission/distribution       0.10               A   
1                                     Natural       0.25               A   
2                     Debris and open burning       0.10               A   
3                                     Natural       0.10               A   
4                                     Natural       0.10               A   

    LATITUDE   LONGITUDE       OWNER_DESCR STATE         FIPS_NAME  
0  40.036944 -121.005833              USFS    CA     Plumas County  
1  38.93

In [10]:
# wildfire_df = pd.read_csv('data.csv')

# Parse DISCOVERY_DATE (stored as month/day/year text) into datetimes, then keep
# only fires discovered in MIN_DISCOVERY_YEAR or later, i.e. remove records
# prior to year 2000.
# The filter previously used `> 2000`, which also discarded every fire from
# the year 2000 itself; `>=` keeps that year, matching the stated intent.
MIN_DISCOVERY_YEAR = 2000

wildfire_df['DISCOVERY_DATE'] = pd.to_datetime(wildfire_df['DISCOVERY_DATE'], format='%m/%d/%Y')

wildfire_df = wildfire_df[wildfire_df['DISCOVERY_DATE'].dt.year >= MIN_DISCOVERY_YEAR]

In [11]:
# Give a few source columns clearer names (old name -> new name).
column_renames = {
    "NWCG_CAUSE_CLASSIFICATION": "CAUSE_CLASSIFICATION",
    "NWCG_GENERAL_CAUSE": "SPECIFIC_CAUSE",
    "OWNER_DESCR": "RESPONSIBLE_ENTITY",
    "FIPS_NAME": "COUNTY",
}

wildfire_df = wildfire_df.rename(columns=column_renames)

# This pulls in housing data

In [10]:
# Read the two housing tables from their parquet files.
top_tier_housing = pd.read_parquet('data/top_housing.parquet')
bottom_tier_housing = pd.read_parquet('data/bottom_housing.parquet')

## Remove Columns we don't use

In [11]:
# Columns that are not used downstream.
remove_cols = ['RegionID', 'RegionType', 'StateName', 'Metro', 'SizeRank']

# Drop them from both housing tables, mutating each frame in place.
for housing_frame in (bottom_tier_housing, top_tier_housing):
    housing_frame.drop(columns=remove_cols, inplace=True)

## Remove rows for states we aren't using (modifying the original DataFrames in place)

In [12]:
# The states this analysis covers.
states = ['CA', 'TX', 'GA', 'FL', 'AZ']

# For each housing table: blank out every row whose State is not in `states`,
# then drop the blanked rows, so the original frame object is modified in place.
# NOTE(review): dropna() with no arguments removes a row when ANY column is
# missing, so rows for the kept states that have a gap in any other column are
# discarded too - confirm that is intended.
for housing_frame in (bottom_tier_housing, top_tier_housing):
    unused_state_rows = ~housing_frame['State'].isin(states)
    housing_frame.loc[unused_state_rows, :] = None
    housing_frame.dropna(inplace=True)


## Combine the DataFrames and pivot the date columns into long format (one row per date and price)

In [13]:
# Stack the bottom-tier rows on top of the top-tier rows and renumber the index.
housing_df = pd.concat(
    (bottom_tier_housing, top_tier_housing),
    ignore_index=True,
)

In [14]:
# Wide -> long: every column other than the identifier columns becomes a
# (Date, Price) pair on its own row.
id_columns = ['RegionName', 'State', 'CountyName']
housing_df = pd.melt(housing_df, id_vars=id_columns, var_name='Date', value_name='Price')
# housing_df['Price'] = housing_df['Price'].astype('float').round(2).map('{:.2f}'.format)

# The former column labels are now values in Date; parse them into datetimes.
housing_df = housing_df.assign(Date=pd.to_datetime(housing_df['Date']))

## Add price_id for each record

In [15]:
# Number the records 1..N and place that id in the first column.
# insert() raises if PRICE_ID already exists, so this cell cannot be re-run
# on the same frame.
record_count = len(housing_df)
housing_df.insert(loc=0, column='PRICE_ID', value=range(1, record_count + 1))

## Map states to FIPS code

In [16]:
# State abbreviation -> state FIPS code.
# NOTE(review): the name `map` shadows the Python builtin; it is kept because
# the rent-data cell further down looks the codes up through this same name.
map = {
    'AZ': 4,
    'CA': 6,
    'FL': 12,
    'GA': 13,
    'TX': 48,
}
state_ids = housing_df['State'].map(map)
housing_df['STATE_ID'] = state_ids
# housing_df.drop(columns = 'State', inplace = True)

# Rent Data (load, then remove unused columns and states)

In [17]:
# Read the rent table from its CSV file.
rent_csv_path = "data/Observed Rent Index by City.csv"
rent_index = pd.read_csv(rent_csv_path)

In [18]:
# The rent table is trimmed with the same `remove_cols` and `states` lists
# that were used for the housing sales data.
rent_index.drop(columns=remove_cols, inplace=True)

# Blank out rows for states outside `states`, then drop the blanked rows in place.
# NOTE(review): dropna() with no arguments also removes kept-state rows that
# have a missing value in any column - confirm that is intended.
unused_state_rows = ~rent_index['State'].isin(states)
rent_index.loc[unused_state_rows, :] = None
rent_index.dropna(inplace=True)

## Pivot long the date and price data

In [19]:
# Wide -> long: every column other than the identifier columns becomes a
# (Date, Price) pair on its own row.
rent_index = rent_index.melt(id_vars=['RegionName', 'State', 'CountyName'], var_name='Date', value_name='Price')

# Parse this table's OWN Date labels into datetimes.
# The previous version converted housing_df['Date'] here; pandas aligns that
# assignment on the row index, so each rent row received whatever date the
# housing row with the same index label had (and NaT where housing_df had no
# such row) instead of the date that belongs to its rent value.
rent_index['Date'] = pd.to_datetime(rent_index['Date'])

## Add rent_id for each record

In [20]:
# Number the records 1..N and place that id in the first column.
# insert() raises if RENT_ID already exists, so this cell cannot be re-run
# on the same frame.
record_count = len(rent_index)
rent_index.insert(loc=0, column='RENT_ID', value=range(1, record_count + 1))

## Map state codes

In [21]:
# Translate each state abbreviation to its code using the `map` dict defined
# in the housing section (that name shadows the builtin `map`).
state_ids = rent_index['State'].map(map)
rent_index['STATE_ID'] = state_ids
# housing_df.drop(columns = 'State', inplace = True)

# Load Population Data into dictionary

In [22]:
# One population DataFrame per state, keyed by state abbreviation; each is read
# from "data/<STATE> City population estimates.csv".
state_population_dict = {
    state_abbr: pd.read_csv(f"data/{state_abbr} City population estimates.csv")
    for state_abbr in states
}

In [23]:
# Population-table columns that are dropped once PLACES_ID has been built.
columns_to_drop = [
    'SUMLEV',
    'COUSUB',
    'CONCIT',
    'PRIMGEO_FLAG',
    'FUNCSTAT',
    'STNAME',
    'ESTIMATESBASE2010',
    'POPESTIMATE2010',
    'STATE',
    'COUNTY',
    'PLACE',
]

## Iterate through df's and create PLACES_ID column and drop unnecessary columns

In [24]:
# Each population frame is modified in place:
#   1. build PLACES_ID as "<STATE>-<COUNTY>-<PLACE>" from the three code columns,
#   2. drop the columns listed in `columns_to_drop` (including those three),
#   3. put PLACES_ID in the first column.
# This cell is not re-runnable on the same frames: once STATE/COUNTY/PLACE are
# dropped, the column selection below raises KeyError.
for population_frame in state_population_dict.values():
    code_columns = population_frame[['STATE', 'COUNTY', 'PLACE']].astype('str')
    places_id = code_columns.agg('-'.join, axis=1)
    population_frame.drop(columns=columns_to_drop, inplace=True)
    population_frame.insert(0, 'PLACES_ID', places_id)

# MySQL server connection

In [15]:
# Connection details.
# Credentials come from the environment so that no secret is stored in the
# notebook. Set MYSQL_USER / MYSQL_PASSWORD before starting the kernel; if
# MYSQL_PASSWORD is not set, the password is prompted for interactively.
userName = os.environ.get("MYSQL_USER", "erosales")
userPass = os.environ.get("MYSQL_PASSWORD") or getpass.getpass(f"MySQL password for {userName}: ")
server = "safehaven1.mysql.database.azure.com"
database = "sh-1"

# Path to the Azure SSL Certificate (Download it if you haven't)
ssl_cert_path = "DigiCertGlobalRootCA.crt.pem"  # Ensure this file exists in your working directory

# Establish the secure connection.
# PyMySQL looks for the CA file under the top-level "ca" key of the `ssl` dict.
# The previous nested form {"ssl": {"ca": ...}} left that key unset, so TLS was
# negotiated without verifying the server certificate against this file.
# `password` / `database` are the documented parameter names (`passwd` / `db`
# are compatibility aliases).
conn = pymysql.connect(
    host=server,
    port=3306,
    user=userName,
    password=userPass,
    database=database,
    ssl={"ca": ssl_cert_path},
)

print("Connected successfully!")


Connected successfully!
