In [8]:
using DataFrames
using CSV
using XLSX
using Statistics
using Plots
using Dates
using GLM

┌ Info: Precompiling GLM [38e38edf-8417-5370-95a0-9cbb8c7f171a]
└ @ Base loading.jl:1260


In [4]:
# Both tables come from the same Zillow workbook; each sheet is read over columns A:ER.
workbook = "data/zillow_data_download_april2020.xlsx"
prices = XLSX.readtable(workbook, "Sales_median_price_city", "A:ER")
counts = XLSX.readtable(workbook, "Sale_counts_city", "A:ER")

# Column positions we keep from each sheet
atr = 1:4        # regionID, city, state, size attributes
years = 39:147   # monthly columns covering 2011-2020
colIndex = vcat(atr, years)

# Raw subsets: element [1] of each readtable result is indexed for the column data and
# element [2] for the matching labels (NOTE(review): layout depends on the XLSX.jl
# version in use -- confirm). These are kept so the unfiltered data can be reused
# without re-reading the workbook.
pricesDF = DataFrame(prices[1][colIndex], prices[2][colIndex])
countsDF = DataFrame(counts[1][colIndex], counts[2][colIndex])

# Filtered copies: rows containing any missing value are dropped, separately per table.
pricesData = dropmissing(pricesDF)
countsData = dropmissing(countsDF)

113-element Array{Int64,1}:
   1
   2
   3
   4
  39
  40
  41
  42
  43
  44
  45
  46
  47
   ⋮
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147

In [155]:
#Make initial plots: for each of the first N rows, save a homes-sold line plot and a
#median-price scatter plot overlaid with its linear-regression trend line.

#Vectorize input dates (one entry per month, Jan 2011 through Jan 2020)
datesInput = collect(Date(2011, 1):Dates.Month(1):Date(2020, 1))

#Top N cities by population we are going to analyze
#NOTE(review): assumes the sheet rows are already ordered by size rank -- confirm.
N = 100

#Make sure the output folders exist before any figure is saved into them
mkpath("Plots/Homes-Sold")
mkpath("Plots/Median-Price")

#countsData and pricesData had their missing rows dropped independently, so either may
#be shorter than N; stop at the shortest one instead of indexing out of bounds.
for row = 1:min(N, nrow(countsData), nrow(pricesData))
    #Vectorize outputs
    countsOutput = collect(countsData[row, 5:end])
    pricesOutput = collect(pricesData[row, 5:end])

    #Get city and state name from each table's OWN row. Because the two tables were
    #filtered separately, row i is not guaranteed to be the same city in both, and
    #labelling the price plot with the counts table's city could mislabel it.
    countsCity = countsData[row, 2]
    countsLoc = string(countsCity, ", ", countsData[row, 3])
    pricesCity = pricesData[row, 2]
    pricesLoc = string(pricesCity, ", ", pricesData[row, 3])

    #Best fit: regress median price on a month index sized from the data itself
    monthIndex = 1:length(pricesOutput)
    priceFitDF = DataFrame(t = monthIndex, medPrice = pricesOutput)
    priceFit = lm(@formula(medPrice ~ t), priceFitDF)

    #Second coefficient is the slope on t, i.e. price change per month
    slope = round(GLM.coef(priceFit)[2], digits = 3)
    reg = predict(priceFit)

    #Plot: units sold
    #NOTE(review): file names use the city name only, so two cities sharing a name in
    #different states would overwrite each other's image -- confirm this is acceptable.
    plot(datesInput, countsOutput,
        title = countsLoc, xlabel = "Month", ylabel = "Homes Sold", legend = false)
    png(string("Plots/Homes-Sold/", countsCity, "-Homes-Sold"))

    #Plot: median price and best fit
    scatter(datesInput, pricesOutput,
        title = pricesLoc, xlabel = "Month", ylabel = "Median Price (\$)", label = "Zillow Data")

    plot!(datesInput, reg, lw = 3, label = "Lin. Reg", legend = :bottomright, linestyle = :dash)
    #Slope annotation is anchored at the first date, level with the highest price
    annotate!(datesInput[1], maximum(pricesOutput), Plots.text(string("m = ", slope), 10, :black, :left))

    png(string("Plots/Median-Price/", pricesCity, "-Median-Price"))
end