In [128]:
# choose python 3.8 conda env
# pip install pycaret
from pycaret.regression import *
import pandas as pd
import numpy as np

df = pd.read_csv('/workspaces/D2I-Jupyter-Notebook-Tools/ml-data science tutorials/data/1980 2023 average house prices.csv')
df['Period'] = pd.to_datetime(df['Period'], format='%Y-%m')
df['Month'] = [i.month for i in df['Period']]
df['Year'] = [i.year for i in df['Period']]

df['Series'] = np.arange(1, len(df)+1)
df

Unnamed: 0,Name,Period,House price index All property types,Average price All property types,Percentage change (monthly) All property types,Percentage change (yearly) All property types,Month,Year,Series
0,United Kingdom,1980-01-01,10.11,19273,3.94,28.59,1,1980,1
1,United Kingdom,1980-02-01,10.11,19273,3.94,28.59,2,1980,2
2,United Kingdom,1980-03-01,10.11,19273,3.94,28.59,3,1980,3
3,United Kingdom,1980-04-01,10.51,20044,4.00,24.15,4,1980,4
4,United Kingdom,1980-05-01,10.51,20044,4.00,24.15,5,1980,5
...,...,...,...,...,...,...,...,...,...
518,United Kingdom,2023-03-01,148.20,282548,-1.00,3.20,3,2023,519
519,United Kingdom,2023-04-01,148.90,283871,0.50,2.50,4,2023,520
520,United Kingdom,2023-05-01,149.50,285053,0.40,1.60,5,2023,521
521,United Kingdom,2023-06-01,151.20,288281,1.10,1.90,6,2023,522


In [129]:
df.drop(['Name',	
        'Period',	
        'House price index All property types',	
        'Percentage change (monthly) All property types',	
        'Percentage change (yearly) All property types'], axis=1, inplace=True)
df = df[['Series', 'Year', 'Month', 'Average price All property types']]
df

Unnamed: 0,Series,Year,Month,Average price All property types
0,1,1980,1,19273
1,2,1980,2,19273
2,3,1980,3,19273
3,4,1980,4,20044
4,5,1980,5,20044
...,...,...,...,...
518,519,2023,3,282548
519,520,2023,4,283871
520,521,2023,5,285053
521,522,2023,6,288281


In [130]:
train = df[(df['Year'] < 2011)]
test = df[df['Year'] >= 2011]

s = setup(data = train, 
          train_size=0.7,
        #   test_data = test, 
          target = 'Average price All property types', 
          fold_strategy = 'timeseries', 
          numeric_features = ['Year'], 
          fold = 3, 
          transform_target = True, 
          session_id = 123,
          data_split_shuffle=False,
          fold_shuffle=False)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Average price All property types
2,Target type,Regression
3,Original data shape,"(372, 4)"
4,Transformed data shape,"(372, 4)"
5,Transformed train set shape,"(260, 4)"
6,Transformed test set shape,"(112, 4)"
7,Numeric features,1
8,Preprocess,True
9,Imputation type,simple


In [131]:
best = compare_models(sort = 'MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,401.4442,730330.6695,493.4,-342.7522,0.0303,0.0249,0.04
br,Bayesian Ridge,406.7168,637178.7544,460.8611,-361.6791,0.0296,0.0265,0.0267
lr,Linear Regression,406.8419,636100.1089,460.4708,-361.9967,0.0296,0.0265,0.7633
en,Elastic Net,406.8625,636242.5939,460.5224,-414.8965,0.0296,0.0265,0.0267
ridge,Ridge Regression,406.874,636153.6122,460.4902,-361.6717,0.0296,0.0265,0.0267
lar,Least Angle Regression,406.8742,636152.6535,460.4898,-367.6239,0.0296,0.0265,0.0233
lasso,Lasso Regression,406.8759,636150.7158,460.4891,-414.8965,0.0296,0.0265,0.04
llar,Lasso Least Angle Regression,406.8759,636150.7537,460.4892,-414.8965,0.0296,0.0265,0.03
omp,Orthogonal Matching Pursuit,407.1799,635667.6341,460.3143,-362.4192,0.0296,0.0266,0.0233
par,Passive Aggressive Regressor,513.0888,1381573.622,678.6823,-5137734055422490.0,0.0778,0.1171,0.03


In [135]:
future_dates = pd.date_range(start = '2011-01-01', end = '2030-01-01', freq = 'MS')
future_df = pd.DataFrame()
future_df['Month'] = [i.month for i in future_dates]
future_df['Year'] = [i.year for i in future_dates]    
max_series = df['Series'][df['Year'] < 2010].max()
future_df['Series'] = np.arange(max_series,(max_series+len(future_dates)))
future_df.head()

Unnamed: 0,Month,Year,Series
0,1,2011,360
1,2,2011,361
2,3,2011,362
3,4,2011,363
4,5,2011,364


In [136]:
predictions_future = predict_model(best, data=future_df)
predictions_future.head()

Unnamed: 0,Month,Year,Series,prediction_label
0,1,2011,360,125227.057135
1,2,2011,361,125677.136613
2,3,2011,362,126127.973384
3,4,2011,363,126579.567281
4,5,2011,364,127031.918136


In [137]:
import plotly.express as px
concat_df = pd.concat([df[df['Year'] < 2011],predictions_future], axis=0)

fig = px.line(concat_df, x=concat_df['Series'], y=["Average price All property types", "prediction_label"], template = 'plotly_dark')
fig.add_scatter(x=df['Series'], y=df['Average price All property types'])

fig.show()