## Zillow Top Tier Housing Data


In this project we will look at top-tier home values across all 50 states.

We will be using the Zillow Home Value Index (ZHVI), a measure of the typical home value and market changes across a given region and housing type. The standard (mid-tier) ZHVI reflects the typical value of homes in the 35th to 65th percentile range.

Zillow also publishes a top-tier ZHVI (the typical value, in dollars, of homes within the 65th to 95th percentile range for a given region) and a bottom-tier ZHVI. This project uses the top-tier series.

A user guide for this data can be found at: [Zillow](https://www.zillow.com/research/zhvi-user-guide/). 

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sqlite3
import csv 
import numpy as np 
import warnings 
from pandasql import sqldf
import geopandas as gpd 
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Read the Zillow CSV export into a DataFrame.
# `data` and `df` are two names for the same object, so an in-place change
# made through either name is visible through the other.
csv_file = r"C:\Users\Wolfrank\Desktop\Zillow.csv"
df = data = pd.read_csv(csv_file)


ModuleNotFoundError: No module named 'geopandas'

### Cleaning the Data 


In [None]:

# Clean the raw frame loaded above and show the result.
#
# The cleaned frame is derived from `data` without mutating it, so this cell
# can be re-run safely: the columns are always dropped from the raw frame and
# the index / SizeRank shifts are applied exactly once. `data` keeps the raw
# columns; everything downstream should read from `df`.

# Data Cleaning: columns that are not used in the analysis.
columns_to_remove = ['RegionID', 'RegionType', 'StateName']

df = (
    data
    .drop(columns=columns_to_remove)
    # Rename column RegionName to State.
    .rename(columns={'RegionName': 'State'})
)

# Shift the row index up by one (the default index starts at 0).
df.index = df.index + 1

# Shift SizeRank up by one as well.
df['SizeRank'] = df['SizeRank'] + 1

# Show the number of columns left after cleaning.
num_columns = len(df.columns)
print("Number of columns:", num_columns)

display(df)

In [None]:
# Build one comma-separated string out of every column label and print it.
column_names = str.join(',', df.columns)

print(column_names)


In [None]:

# Write the DataFrame into a local SQLite file so it can be queried with SQL.
# The connection is left open here; it is used again and closed further down.
con = sqlite3.connect('zillow_data.db')

df.to_sql(name='zillow_data', con=con, if_exists='replace', index=False)

# SQL used for the analysis. The date columns are referenced by their quoted
# labels.
# NOTE: by default SQLite treats a double-quoted name that matches no column
# as a string literal, so a misspelled date label yields text, not an error.

# Per-state value for 7/31/2019.
query1 = """
SELECT State, AVG("7/31/2019") AS AvgHomeValue
FROM zillow_data
GROUP BY State
"""

# Per-state value for 7/31/2023.
query2 = """
SELECT State, AVG("7/31/2023") AS AvgHomeValue
FROM zillow_data
GROUP BY State
"""

# Five states with the highest 7/31/2023 value.
query3 = """
SELECT State, AVG("7/31/2023") AS AvgHomeValue
FROM zillow_data
GROUP BY State
ORDER BY AvgHomeValue DESC
LIMIT 5
"""

# Five rows with the largest difference between 7/31/2023 and 7/31/2019.
query4 = """
SELECT State, ("7/31/2023" - "7/31/2019") AS ChangeInValue
FROM zillow_data
ORDER BY ChangeInValue DESC
LIMIT 5
"""

# Row(s) holding the maximum 7/31/2023 value.
query5 = """
SELECT State, "7/31/2023" AS HomeValue
FROM zillow_data
WHERE "7/31/2023" = (SELECT MAX("7/31/2023") FROM zillow_data)
"""


# Run the queries in order; query_results[i] holds the frame for query(i+1).
query_results = [
    pd.read_sql_query(sql, con)
    for sql in (query1, query2, query3, query4, query5)
]

# Create Seaborn bar charts for the first three query results.
sns.set(style="darkgrid")

# Pair every result with the title that describes it. query_results follows
# the order in which the queries were run:
#   [0] -> 7/31/2019 values per state
#   [1] -> 7/31/2023 values per state
#   [2] -> top 5 states by 7/31/2023 value
# Keeping data and title side by side prevents a chart from being labelled
# with the wrong year.
chart_specs = [
    # Chart 1: average home values by state in July 2019
    (query_results[0], 'Average Home Values by State in July 2019'),
    # Chart 2: average home values by state in July 2023
    (query_results[1], 'Average Home Values by State in July 2023'),
    # Chart 3: top 5 states with the highest average home values in July 2023
    (query_results[2],
     'Top 5 States with the Highest Average Home Values in July 2023'),
]

for chart_data, chart_title in chart_specs:
    plt.figure(figsize=(12, 6))
    sns.barplot(data=chart_data, x='AvgHomeValue', y='State')
    plt.title(chart_title)
    plt.xlabel('Average Home Value')
    plt.ylabel('State')

# Chart 4: top 5 states by change in home value (Jul 2019 to Jul 2023).
# query4 has already been executed as the fourth entry of query_results, so
# reuse that frame instead of querying the database a second time.
result4 = query_results[3]

# Horizontal bar chart, one bar per state.
sns.set(style="whitegrid")
plt.figure(figsize=(12, 6))
sns.barplot(data=result4, x='ChangeInValue', y='State', palette='viridis')
plt.title('Top 5 States with Highest Change in Home Values (Jul 2019 to Jul 2023)')
plt.xlabel('Change in Home Value')
plt.ylabel('State')

# Chart 5: monthly trend of home values for the state with the highest
# July 2023 value.
#
# A line chart is used rather than a map: it needs only pandas and matplotlib,
# whereas highlighting a single state on a map requires state-level
# geometries, which this notebook does not load.

# query5 has already been executed as the fifth entry of query_results.
result5 = query_results[4]

# Every query result is in memory now, so release the database connection
# before plotting; a plotting error can then no longer leave it open.
con.close()

if result5.empty:
    # Happens when the 7/31/2023 column holds no values at all.
    print("No July 2023 home values found; skipping the trend chart.")
else:
    # If several states tie for the maximum, the first one returned is used.
    top_state = result5['State'].iloc[0]
    top_row = df.loc[df['State'] == top_state].iloc[0]

    # Keep only the entries whose column label parses as a date; those are the
    # per-period home values. Non-date labels become NaT and are dropped.
    monthly = top_row.drop(labels=['State', 'SizeRank'], errors='ignore')
    monthly.index = pd.to_datetime(monthly.index, errors='coerce')
    monthly = pd.to_numeric(monthly[monthly.index.notna()], errors='coerce')
    monthly = monthly.sort_index()

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.plot(monthly.index, monthly.values, linewidth=2)
    ax.set_title(f'Monthly Home Value Trend for {top_state} (Highest Value, Jul 2023)')
    ax.set_xlabel('Month')
    ax.set_ylabel('Home Value')

# Show the charts
plt.show()
