In [1]:
from dotenv import load_dotenv
import os
import requests
import json
import pandas as pd
import datetime as dt
from pathlib import Path

import numpy as np
# %matplotlib inline
# import matplotlib
# import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [2]:
# Load variables from a local .env file into the process environment so the
# API key can be read with os.getenv in the next cell.
load_dotenv() 

True

In [3]:
# ZipCodeAPI credential, taken from the environment (None when the variable is unset).
api_key = os.environ.get("ZIPCODE_API_KEY")

In [4]:
# Search parameters for the ZipCodeAPI radius lookup.
# TODO: these could become user inputs if time allows.
#
# The centre ZIP is deliberately NOT called `zip_code`: that name is rebound
# further down to the list of ZIP codes returned by the API, and sharing it
# made this cell unsafe to re-run on its own.
center_zip_code = 77002
distance = 5
units = "miles"

# HTTPS, because the API key is embedded in the request path and would
# otherwise travel in clear text.
url = f"https://www.zipcodeapi.com/rest/{api_key}/radius.json/{center_zip_code}/{distance}/{units}"

In [5]:
# Fetch the ZIP codes within the configured radius.
# timeout: without one, a stalled connection would hang the kernel indefinitely.
response_data = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx response (bad key, rate limit, ...) instead of
# letting the error payload surface later as a KeyError on "zip_codes".
response_data.raise_for_status()
data = response_data.json()
print(json.dumps(data, indent=4))

{
    "zip_codes": [
        {
            "zip_code": "77021",
            "distance": 4.071,
            "city": "Houston",
            "state": "TX"
        },
        {
            "zip_code": "77030",
            "distance": 4.082,
            "city": "Houston",
            "state": "TX"
        },
        {
            "zip_code": "77005",
            "distance": 4.396,
            "city": "Houston",
            "state": "TX"
        },
        {
            "zip_code": "77023",
            "distance": 3.714,
            "city": "Houston",
            "state": "TX"
        },
        {
            "zip_code": "77004",
            "distance": 2.211,
            "city": "Houston",
            "state": "TX"
        },
        {
            "zip_code": "77046",
            "distance": 4.39,
            "city": "Houston",
            "state": "TX"
        },
        {
            "zip_code": "77098",
            "distance": 3.418,
            "city": "Houston",
            "state": "T

In [6]:
# Pull the ZIP codes (as integers) and their city names out of the API
# response, keeping both lists in the order the API returned them.
zip_code = [int(entry["zip_code"]) for entry in data["zip_codes"]]
city = [entry["city"] for entry in data["zip_codes"]]

In [7]:
# Two-column lookup table: one row per ZIP code returned by the API.
zip_code_df = pd.DataFrame({'zip_code': zip_code, 'City': city})

In [8]:
def filter_zip_code(zip_codes_list, df, new_dataframe):
    """Collect the columns of ``df`` whose labels appear in ``zip_codes_list``.

    Every matching column becomes one row of the result, labelled with the
    ZIP code, so the result has one row per matched ZIP code and one column
    per entry of ``df``'s index. Callers transpose the result afterwards.

    Parameters
    ----------
    zip_codes_list : iterable
        ZIP codes to look for among ``df``'s column labels. Codes that are
        not columns of ``df`` are skipped silently.
    df : pandas.DataFrame
        Wide table with one column per ZIP code.
    new_dataframe : pandas.DataFrame
        Frame the matching rows are added to (normally an empty frame).
        It is never modified in place.

    Returns
    -------
    pandas.DataFrame
        ``new_dataframe`` followed by one row per matched ZIP code, in the
        order of ``zip_codes_list``. When nothing matches, ``new_dataframe``
        itself is returned unchanged.
    """
    # `label in df` tests membership among the column labels.
    matched = [label for label in zip_codes_list if label in df]
    if not matched:
        return new_dataframe

    # One slice + transpose replaces the row-by-row DataFrame.append loop
    # (DataFrame.append was removed in pandas 2.0 and was quadratic anyway).
    rows = df.loc[:, matched].transpose()
    # The old append-based result carried no axis names; keep it that way so
    # a later reset_index() still produces a column called "index".
    rows.index.name = None
    rows.columns.name = None

    # Nothing to prepend: skip concat so the column index keeps its dtype.
    if len(new_dataframe.index) == 0 and len(new_dataframe.columns) == 0:
        return rows
    return pd.concat([new_dataframe, rows])

In [9]:
# Load the cleaned tax extract, normalise the ZIP-code column and drop the
# income-bracket column.
clean_path = Path("../resources/Vu/Tax_Data/cleaned_tax_FINALs.csv")

clean_tax = (
    pd.read_csv(clean_path)
    .rename(columns={'Zip Code': 'zip_code'})
    .astype({'zip_code': 'int'})
    .drop(columns="Size of adjusted gross income")
)

# Total of every remaining column per (zip_code, date) pair.
multi_index_tax = clean_tax.groupby(["zip_code", "date"]).sum()


def _tax_table(value_column):
    """Return one tax measure as a date x zip_code table.

    The index is converted to datetime and sorted ascending.
    """
    table = pd.pivot_table(multi_index_tax, index="date", values=value_column, columns="zip_code")
    table.index = pd.to_datetime(table.index)
    return table.sort_index()


# One wide table per tax measure.
number_of_returns_df = _tax_table("Number of returns")
taxable_income_df = _tax_table("Taxable income")
income_tax_df = _tax_table("Income tax")
mortgage_tax_df = _tax_table("Mortgage")
charity_tax_df = _tax_table("Charity")
real_estates_tax_df = _tax_table("Real Estates")

In [10]:
# Load the permit counts and reshape them into a date x zip_code table.
permits_filepath = Path("../resources/dan/Permit_Data/permit_data.csv")

permits_df = pd.read_csv(permits_filepath)

# Rows with any missing value are discarded before the ZIP code is cast to int.
permits_df.dropna(inplace=True)
permits_df = permits_df.astype({'zip_code': 'int64'})

# Index by the parsed `date` column. An explicit format is given, so the
# deprecated `infer_datetime_format` flag (ignored when `format` is set) is
# no longer passed.
permits_df.set_index(pd.to_datetime(permits_df['date'], format='%Y%m'), inplace=True)
permits_df.drop('date', axis=1, inplace=True)

# Total permits per (zip_code, date), then pivot to one column per ZIP code.
permit_counts = permits_df.groupby(["zip_code", "date"]).sum()
permits_by_zip = pd.pivot_table(permit_counts, index = "date", values = "count", columns = "zip_code")
permits_by_zip = permits_by_zip.groupby(permits_by_zip.index).sum()

# A ZIP code with no rows for a date counts as zero permits.
permits_by_zip = permits_by_zip.fillna(0).astype(int)


In [11]:
# Load the listing/pricing data and build one date x postal_code table per measure.
csvpath = Path("../resources/Jack/Texas_Pricing_Data.csv")

pricing_data_df = pd.read_csv(csvpath)

# Index by the parsed `month_date_yyyymm` column and call that index "date".
# An explicit format is given, so the deprecated `infer_datetime_format`
# flag (ignored when `format` is set) is no longer passed.
pricing_data_df.set_index(pd.to_datetime(pricing_data_df['month_date_yyyymm'], format='%Y%m'), inplace=True)
pricing_data_df.index.rename('date', inplace=True)

pricing_data_df.sort_values(by=['postal_code','date'], inplace=True)

# Placeholder column; nothing in this cell ever fills it in.
pricing_data_df['pct_price_change'] = 0

# Period-over-period change in price per sqft, computed WITHIN each postal
# code. A plain Series.pct_change() on this long frame would compare the
# first row of every postal code with the last row of the previous one.
pricing_data_df['pct_price_sqft_change'] = (
    pricing_data_df
    .groupby('postal_code')['median_listing_price_per_square_feet']
    .pct_change()
)


def _pricing_table(value_column):
    """Return one pricing measure as a date x postal_code table."""
    return pd.pivot_table(pricing_data_df, index='date', columns = 'postal_code', values = value_column)


median_list_sqft_df = _pricing_table('median_listing_price_per_square_feet')

# Each column is a single postal code here, so a plain pct_change is correct.
median_list_sqft_df_pct_change = median_list_sqft_df.pct_change()

active_listings_df = _pricing_table('active_listing_count')
new_listing_df = _pricing_table('new_listing_count')
average_listing_price_df = _pricing_table('average_listing_price')
pending_ratio_df = _pricing_table('pending_ratio')

In [12]:
# Restrict every pricing table to the ZIP codes returned by the API.
# filter_zip_code yields one row per ZIP code; missing values become 0 and
# the transpose puts the ZIP codes back on the columns.
filtered_median_list_sqft_df = (
    filter_zip_code(zip_code, median_list_sqft_df, pd.DataFrame())
    .fillna(0)
    .transpose()
)

filtered_median_list_sqft_df_pct_change = (
    filter_zip_code(zip_code, median_list_sqft_df_pct_change, pd.DataFrame())
    .fillna(0)
    .transpose()
)

filtered_active_listings_df = (
    filter_zip_code(zip_code, active_listings_df, pd.DataFrame())
    .fillna(0)
    .transpose()
)

filtered_new_listing_df = (
    filter_zip_code(zip_code, new_listing_df, pd.DataFrame())
    .fillna(0)
    .transpose()
)

filtered_average_listing_price_df = (
    filter_zip_code(zip_code, average_listing_price_df, pd.DataFrame())
    .fillna(0)
    .transpose()
)

filtered_pending_ratio_df = (
    filter_zip_code(zip_code, pending_ratio_df, pd.DataFrame())
    .fillna(0)
    .transpose()
)


In [13]:
# Permit counts for the ZIP codes returned by the API: missing values become
# 0 and the transpose puts the ZIP codes on the columns.
filtered_permits_by_zip = (
    filter_zip_code(zip_code, permits_by_zip, pd.DataFrame())
    .fillna(0)
    .transpose()
)

In [14]:
# Restrict every tax table to the ZIP codes returned by the API.
# filter_zip_code yields one row per ZIP code; missing values become 0 and
# the transpose puts the ZIP codes back on the columns.
filtered_number_of_returns_df = (
    filter_zip_code(zip_code, number_of_returns_df, pd.DataFrame())
    .fillna(0)
    .transpose()
)

filtered_taxable_income_df = (
    filter_zip_code(zip_code, taxable_income_df, pd.DataFrame())
    .fillna(0)
    .transpose()
)

filtered_income_tax_df = (
    filter_zip_code(zip_code, income_tax_df, pd.DataFrame())
    .fillna(0)
    .transpose()
)

filtered_mortgage_tax_df = (
    filter_zip_code(zip_code, mortgage_tax_df, pd.DataFrame())
    .fillna(0)
    .transpose()
)

filtered_charity_tax_df = (
    filter_zip_code(zip_code, charity_tax_df, pd.DataFrame())
    .fillna(0)
    .transpose()
)

filtered_real_estates_tax_df = (
    filter_zip_code(zip_code, real_estates_tax_df, pd.DataFrame())
    .fillna(0)
    .transpose()
)



In [15]:
# Move each frame's current index into an ordinary column and give the frame
# a fresh positional index. This is done in place, so the cell is not safe
# to run twice.
_frames_to_reset = (
    # tax data
    filtered_real_estates_tax_df,
    filtered_charity_tax_df,
    filtered_mortgage_tax_df,
    filtered_income_tax_df,
    filtered_taxable_income_df,
    filtered_number_of_returns_df,
    # housing data
    filtered_pending_ratio_df,
    filtered_average_listing_price_df,
    filtered_new_listing_df,
    filtered_active_listings_df,
    filtered_median_list_sqft_df_pct_change,
    filtered_median_list_sqft_df,
    # permit data
    filtered_permits_by_zip,
)

for _frame in _frames_to_reset:
    _frame.reset_index(inplace = True)

In [16]:
# correlation calculation
#
# For every ZIP code present in all of the filtered tables, build a frame
# with one column per measure and store its correlation matrix in
# dict_of_corr. The diagonal (self-correlation, always 1) is overwritten with
# -2, a value no real correlation can take, so a later max() never picks a
# measure's correlation with itself.

# Output column label -> filtered table supplying that column.
_corr_sources = {
    'Median Listing of Sqft Pct Change': filtered_median_list_sqft_df_pct_change,
    'Active Listings': filtered_active_listings_df,
    'New Listings': filtered_new_listing_df,
    'Average Listing Price': filtered_average_listing_price_df,
    'Pending Ratio': filtered_pending_ratio_df,
    'Number of Permits': filtered_permits_by_zip,
    'Number of Tax Returns': filtered_number_of_returns_df,
    'Average Taxable Income': filtered_taxable_income_df,
    'Average Income Tax': filtered_income_tax_df,
    'Average Mortgage Tax': filtered_mortgage_tax_df,
    'Average Charity Tax': filtered_charity_tax_df,
    'Average Real Estates Tax': filtered_real_estates_tax_df,
}
# The raw price-per-sqft table is not a correlation column, but a ZIP code
# must still be present in it to qualify.
_corr_required = (filtered_median_list_sqft_df, *_corr_sources.values())

dict_of_corr = {}

for i in zip_code:
    if not all(i in frame for frame in _corr_required):
        continue

    corr = pd.DataFrame({label: frame[i] for label, frame in _corr_sources.items()}).corr()

    # Fill the diagonal on an explicit copy: writing through DataFrame.values
    # only works while it is a writable view, which pandas does not
    # guarantee (it is read-only under Copy-on-Write).
    corr_values = corr.to_numpy(copy=True)
    np.fill_diagonal(corr_values, -2)
    dict_of_corr[i] = pd.DataFrame(corr_values, index=corr.index, columns=corr.columns)

In [17]:
# For every ZIP code with a correlation matrix:
#   1. pick the measure with the highest correlation to
#      'Median Listing of Sqft Pct Change',
#   2. fit that measure against the row position and extrapolate it 12
#      positions past the last usable row,
#   3. fit the pct-change column against that measure and predict it at the
#      extrapolated value; this prediction is what gets reported as "ROI".
target_col = 'Median Listing of Sqft Pct Change'

# Output column label -> filtered table supplying that column.
feature_frames = {
    target_col: filtered_median_list_sqft_df_pct_change,
    'Active Listings': filtered_active_listings_df,
    'New Listings': filtered_new_listing_df,
    'Average Listing Price': filtered_average_listing_price_df,
    'Pending Ratio': filtered_pending_ratio_df,
    'Number of Permits': filtered_permits_by_zip,
    'Number of Tax Returns': filtered_number_of_returns_df,
    'Average Taxable Income': filtered_taxable_income_df,
    'Average Income Tax': filtered_income_tax_df,
    'Average Mortgage Tax': filtered_mortgage_tax_df,
    'Average Charity Tax': filtered_charity_tax_df,
    'Average Real Estates Tax': filtered_real_estates_tax_df,
}
# The raw price-per-sqft table is not a model column, but a ZIP code must
# still be present in it to qualify.
required_frames = (filtered_median_list_sqft_df, *feature_frames.values())

dict_of_frames = {}
lr = LinearRegression()
dict_of_roi = {}

for i in dict_of_corr:
    # Skip a ZIP code that is missing from any source table, rather than
    # failing with a KeyError further down.
    if not all(i in frame for frame in required_frames):
        print(f"Data Set Incomplete: Removing {[i]}")
        continue

    zip_frame = pd.DataFrame(
        {label: frame[i] for label, frame in feature_frames.items()}
    ).dropna()

    # The diagonal of each correlation matrix holds -2, so a maximum of -2
    # means no other measure has a usable correlation with the target.
    best_corr = dict_of_corr[i][target_col].max()
    if best_corr == -2 or zip_frame.empty:
        print(f"Data Set Incomplete: Removing {[i]}")
        continue

    strong_corr = dict_of_corr[i].index[dict_of_corr[i][target_col] == best_corr].tolist()
    print(f"{strong_corr[0]} {[i]}")
    dict_of_frames[i] = zip_frame

    # Step 2: measure ~ row position, extrapolated 12 positions ahead.
    index_df = pd.DataFrame(np.array(zip_frame.index))
    lr.fit(index_df[[0]], zip_frame[[strong_corr[0]]])
    future = lr.predict([index_df.iloc[-1]+12])

    # Step 3: the same estimator is refit as pct-change ~ measure.
    lr.fit(zip_frame[[strong_corr[0]]], zip_frame[target_col])
    roi = lr.predict(future)
    print("ROI:")
    print(roi)
    dict_of_roi[i] = roi

Pending Ratio [77021]
ROI:
[-0.00266863]
Pending Ratio [77030]
ROI:
[-0.02466863]
Average Income Tax [77005]
ROI:
[0.0055603]
Average Listing Price [77023]
ROI:
[0.00706455]
New Listings [77004]
ROI:
[-0.00069256]
Average Mortgage Tax [77046]
ROI:
[-0.01013335]
Average Listing Price [77098]
ROI:
[0.00309937]
Average Charity Tax [77027]
ROI:
[0.00870845]
Number of Permits [77006]
ROI:
[0.0002646]
Average Listing Price [77011]
ROI:
[0.03385541]
New Listings [77003]
ROI:
[0.00190372]
Pending Ratio [77019]
ROI:
[-0.00130119]
Average Charity Tax [77002]
ROI:
[0.00579749]
Average Listing Price [77007]
ROI:
[0.00519829]
Average Listing Price [77020]
ROI:
[0.02705338]
Average Listing Price [77009]
ROI:
[0.00531868]
Average Listing Price [77008]
ROI:
[0.00691096]
Pending Ratio [77026]
ROI:
[0.00462537]


In [28]:
import operator
# Scratch example: list a dict's (key, value) pairs ordered by value, largest first.
x = {1: 2, 3: 4, 4: 3, 2: 1, 0: 0}
sorted_x = sorted(x.items(), key=lambda pair: pair[1], reverse=True)

In [29]:
# Display the scratch example's pairs, ordered by value from largest to smallest.
sorted_x

[(3, 4), (4, 3), (1, 2), (2, 1), (0, 0)]

In [30]:
# Rank the ZIP codes from highest to lowest predicted ROI.
# Each stored value is the array returned by LinearRegression.predict, so the
# sort key extracts its first element as a float. That avoids comparing numpy
# arrays with each other, and avoids depending on the `operator` import that
# lives in the scratch cell above.
sorted_dict_of_roi = sorted(
    dict_of_roi.items(),
    key=lambda item: float(np.ravel(item[1])[0]),
    reverse=True,
)

In [31]:
# Display the (zip_code, predicted ROI) pairs, highest prediction first.
sorted_dict_of_roi

[(77011, array([0.03385541])),
 (77020, array([0.02705338])),
 (77027, array([0.00870845])),
 (77023, array([0.00706455])),
 (77008, array([0.00691096])),
 (77002, array([0.00579749])),
 (77005, array([0.0055603])),
 (77009, array([0.00531868])),
 (77007, array([0.00519829])),
 (77026, array([0.00462537])),
 (77098, array([0.00309937])),
 (77003, array([0.00190372])),
 (77006, array([0.0002646])),
 (77004, array([-0.00069256])),
 (77019, array([-0.00130119])),
 (77021, array([-0.00266863])),
 (77046, array([-0.01013335])),
 (77030, array([-0.02466863]))]