In [2]:
import pandas as pd 
import numpy as np 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split,cross_val_predict
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error


In [8]:
zhvi_dataset = pd.read_csv("../Zip(zhvf_and_zhvi)_combined/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv")
zhvi_dataset.head()
zhvi_dataset_clean = zhvi_dataset.iloc[1:,[2]+list(range(9, zhvi_dataset.shape[1]))]
zhvi_dataset_clean
df_melted = zhvi_dataset_clean.melt(id_vars=['RegionName'], var_name='Month', value_name='Price')

In [4]:
#with mortgage rate 
dataset = pd.read_csv("features.csv")
df_melted = dataset[["RegionName","Price","Date","Avg_Mortgage_y"]]
df_melted['lag_mortgage_rate'] = df_melted.groupby('RegionName')['Avg_Mortgage_y'].shift(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_melted['lag_mortgage_rate'] = df_melted.groupby('RegionName')['Avg_Mortgage_y'].shift(1)


Unnamed: 0,RegionName,Price,Date,Avg_Mortgage_y,lag_mortgage_rate
0,77494,210115.649133,2000-01-31,8.21,
1,8701,138190.224310,2000-01-31,8.21,
2,77449,103030.935699,2000-01-31,8.21,
3,11368,149541.567251,2000-01-31,8.21,
4,77084,102153.208791,2000-01-31,8.21,
...,...,...,...,...,...
7822084,98934,321416.544453,2024-09-30,6.18,6.5
7822085,15731,78644.070337,2024-09-30,6.18,6.5
7822086,46799,194649.933906,2024-09-30,6.18,6.5
7822087,22731,367017.449346,2024-09-30,6.18,6.5


In [9]:
#generate lag features
for lag in range(1, 13):  # Lags 1 to 12 months
    df_melted[f'Price_t-{lag}'] = df_melted.groupby('RegionName')['Price'].shift(lag)
df_melted = df_melted.dropna()

In [10]:
def scale_data(X,Y):
    Xscaler = StandardScaler()
    Yscaler = StandardScaler()
    #Fit the scaler on the feature data and transform it
    Xscaler = Xscaler.fit(X)
    Yscaler = Yscaler.fit(Y)
    return Xscaler, Yscaler

In [30]:
def train(model, df, Xscaler,Yscaler):
    #normalize features
    models ={}
    features = [f'Price_t-{lag}' for lag in range(1, 13)]
    #features =     ["Avg_Mortgage_y","lag_mortgage_rate"] +features

    sum_rmse = 0
    sum_bias=0
    sum_var =0
    #s=1000000000000000000 
    for region in df["RegionName"].unique():
        df_region = df [df["RegionName"] == region]
        #s = min(s,df_region.shape[0])
        
        X = df_region[features]
        y = df_region[["Price"]]
        X = Xscaler.transform(X)
        y = Yscaler.transform(y)
        #poly = PolynomialFeatures(degree=1)
        X_poly = X
        X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, shuffle=True)
        
        trained_model = model.fit(X_train,y_train)
        rmse,bias = test_model(trained_model,X_test,y_test)
        sum_rmse+= rmse
        y_pred_cv = cross_val_predict(model, X_poly, y, cv=3)
        variance  = np.var(y_pred_cv)
        sum_bias+=bias
        sum_var+=variance
        models[region] = trained_model 
    l = len(df["RegionName"].unique())    
    print(f'Avg RMSE : {sum_rmse/l}, Bias : {sum_bias/l} Variance : {sum_var/l}')
    #print(f"MIN SHAPE: {s}")
    #print(f'Avg RMSE : {sum_rmse}, Bias : {sum_bias} Variance : {sum_var}')

    return models

In [41]:
#valyes for a year
def predict_vals(models,df,Xscaler, Yscaler):
    predicted_df = pd.DataFrame(columns=["RegionName","1_month_Change", "1_quarter_change","1_year_change"])
    for region in df["RegionName"].unique():
        model = models[region]
        X = df[df["RegionName"]==region]
        X = X.sort_values(by ="Month")
        value = X.iloc[-1,2]
        X = X.iloc[-1,2:-1].values.reshape(1,-1)
        
        X = Xscaler.transform(X)
        
        forecast = []
        for step in range(1, 13):  # Predict next 12 months
            next_pred = model.predict(X)
            saved_val = Yscaler.inverse_transform(next_pred)
            forecast.append(saved_val[0])
            
            # Update the known values for next prediction
            X = np.roll(X, 1)  # Shift values
            X[0, 0] = next_pred

        one_month = ( forecast[0]-value)*100/(value)
        one_quart = (forecast[2]-value)*100/(value)
        one_yr = (forecast[-1]-value)*100/(value)
        
        new_row = [int(region),one_month[0],one_quart[0],one_yr[0]]
        predicted_df.loc[len(predicted_df)] = new_row
    return predicted_df


In [23]:
def test_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test,y_pred))
    bias_test = np.mean(y_pred - y_test)
    return rmse, bias_test

In [24]:

def test_Zillow(zillow_vals, pred_vals):
    #sign accuracy
    #metrics = pd.DataFrame(columns=["RegionName","one_month_sign","quarter_sign, year_sign, month_diff,quarter_diff,year_diff"])
    metrics=[]
    for region in pred_vals["RegionName"].unique():
        preds = pred_vals[pred_vals["RegionName"]==region]
        zill = zillow_vals[zillow_vals["RegionName"]==region] 
        row = [0 for _ in range(7)]
        if (zill.size==0 or preds.size ==0):
            continue
        row[0] = region
        for i in range(3):
            row[1+i] = 1 if preds.iloc[0, i+1] * zill.iloc[0,i+1] > 0 else 0
            row[i+4] = abs(preds.iloc[0, i+1] - zill.iloc[0,i+1])
            #actual accuracy

        metrics.append(row)
    #print(metrics)
    return pd.DataFrame(metrics, columns=["RegionName","one_month_sign","quarter_sign", "year_sign", "month_diff","quarter_diff","year_diff"])


In [17]:
df_melted.iloc[:,3:]

Unnamed: 0,Avg_Mortgage_y,lag_mortgage_rate,Price_t-1,Price_t-2,Price_t-3,Price_t-4,Price_t-5,Price_t-6,Price_t-7,Price_t-8,Price_t-9,Price_t-10,Price_t-11,Price_t-12
316044,7.0325,7.382,213093.513140,211931.255870,211069.140968,210694.883579,210924.920824,210892.224390,211651.051951,211804.731094,211769.525584,210785.302020,210298.010538,210115.649133
316045,7.0325,7.382,151024.307132,149501.590152,147739.390682,146044.282089,144593.453230,143431.480283,142330.265154,141138.728786,140122.689094,139146.069961,138760.574409,138190.224310
316046,7.0325,7.382,105008.778888,104617.189884,104219.472405,103927.949065,103570.880872,103334.339399,103147.149314,102956.005800,102909.028048,102918.703258,103045.172365,103030.935699
316047,7.0325,7.382,170178.368854,167654.850307,164868.438727,162409.717867,160452.105268,158612.701558,156797.662253,155344.147796,154007.712591,152144.240684,151027.858485,149541.567251
316048,7.0325,7.382,103982.496411,103447.591620,102963.876608,102604.911713,102245.015810,102056.679873,101957.960490,101821.442052,101846.094741,101928.968853,102110.366773,102153.208791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7822084,6.1800,6.500,320764.218076,320634.651602,320524.648836,319889.190789,318021.383705,316342.572126,316036.662313,316960.510844,317575.939749,318408.474425,319302.280069,320108.460515
7822085,6.1800,6.500,79027.439166,78758.257426,78545.548531,77494.125540,76810.852760,76292.584509,75943.183415,75736.976645,75298.402346,74260.344287,72577.420014,70618.581019
7822086,6.1800,6.500,193784.325850,193407.728139,193009.236918,192174.483801,190328.300440,187934.494495,185437.183211,183994.040865,183644.769024,183797.381885,183886.612602,183682.866409
7822087,6.1800,6.500,366492.471800,365094.615463,362540.939700,358966.135405,354618.405321,350853.871976,348974.319054,348768.561788,349379.606951,349013.713142,348149.652384,346553.596872


In [None]:
 
Xscaler, Yscaler =scale_data(df_melted.iloc[:,3:], df_melted[["Price"]])

model = LinearRegression()

models = train(model,df_melted,Xscaler,Yscaler)


#deg 1 
#Avg RMSE : 0.0044223905698899365, Bias : -2.5886051662027297e-06 Variance : 0.9999761310104978
#deg 2
#Avg RMSE : 0.004333156182301406, Bias : -1.4639728318716744e-05 Variance : 1.0000407851217685
#deg 3


#seperate models:
#deg 1 
#Avg RMSE : 0.0030284515300179377, Bias : 8.325120845802212e-07 Variance : 0.16635787647935452

#deg 2 
#Avg RMSE : 0.010346305057809124, Bias : 3.1022310135396126e-05 Variance : 0.17779711232489395



Avg RMSE : 0.0030284515300179377, Bias : 8.325120845802212e-07 Variance : 0.16635787647935452


In [None]:
mew_df = predict_vals(models,df_melted,Xscaler,Yscaler)

In [47]:
mew_df.to_csv("./basic_lin_model.csv")

In [None]:
print(mew_df.iloc[1])

In [48]:
zillow_data = pd.read_csv("/home/armaan10/Desktop/UIC_MS/coursework/CS_418/project-check-in-team/Zip(zhvf_and_zhvi)_combined/Zip_zhvf_growth_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv")

In [49]:
zillow_data = zillow_data.iloc[:, [2,-3,-2,-1]]
zillow_data

Unnamed: 0,RegionName,2024-10-31,2024-12-31,2025-09-30
0,77494,0.1,-0.5,-1.1
1,8701,0.5,0.1,-0.4
2,77449,-0.2,-1.1,-1.2
3,11368,-0.2,-0.9,-0.5
4,77084,-0.2,-0.9,-1.2
...,...,...,...,...
20157,31648,-1.2,-1.1,-0.1
20158,33855,-0.1,-0.7,0.5
20159,34138,-0.9,-2.3,0.2
20160,48853,0.3,0.5,0.2


In [50]:
metrics = test_Zillow(zillow_data, mew_df)

In [8]:
def hypothesis_test(modelA,modelB):
    pA =  np.mean(modelA)#: uncomment beginning and insert  code for mean of sampleA
    pB = np.mean(modelB)# : uncomment beginning and insert  code for mean of sampleB
    n_A = len(modelA)
    n_B = len(modelB)
    comnbined = (np.sum(modelA)+np.sum(modelB))/(n_A+n_B)
    zhat = (pA-pB)/np.sqrt(comnbined*(1-comnbined)*(1/n_A + 1/n_B))
    pvalue = 2*(1-norm.cdf(abs(zhat)))
    return pvalue
    

In [3]:
mets = pd.read_csv('./linear_reg_metrics.csv')

In [53]:
metrics["one_month_sign"].mean()

np.float64(0.3440806297949155)

In [7]:
print(mets["one_month_sign"].mean())
print(mets["quarter_sign"].mean())
print(mets["year_sign"].mean())

0.3440806297949155
0.4092664092664093
0.4626184626184626


In [9]:
''' sign acc
0.3440806297949155
0.4092664092664093
0.4626184626184626
'''

' sign acc\n0.3440806297949155\n0.4092664092664093\n0.4626184626184626\n'

In [10]:
simple_line = pd.read_csv('./linear_reg_metrics.csv')
simple_mort = pd.read_csv('./mortg_metrics')

FileNotFoundError: [Errno 2] No such file or directory: './mortg_metrics'