In [8]:
# choose python 3.8 conda env
# pip install pycaret
from pycaret.regression import *
import pandas as pd
import numpy as np

df = pd.read_csv(r'https://raw.githubusercontent.com/data-to-insight/ERN-sessions/main/data/1980%202023%20average%20house%20prices.csv')
df['Period'] = pd.to_datetime(df['Period'], format='%Y-%m')
df['Month'] = [i.month for i in df['Period']]
df['Year'] = [i.year for i in df['Period']]

df['Series'] = np.arange(1, len(df)+1)
df

Unnamed: 0,Name,Period,House price index All property types,Average price All property types,Percentage change (monthly) All property types,Percentage change (yearly) All property types,Month,Year,Series
0,United Kingdom,1980-01-01,10.11,19273,3.94,28.59,1,1980,1
1,United Kingdom,1980-02-01,10.11,19273,3.94,28.59,2,1980,2
2,United Kingdom,1980-03-01,10.11,19273,3.94,28.59,3,1980,3
3,United Kingdom,1980-04-01,10.51,20044,4.00,24.15,4,1980,4
4,United Kingdom,1980-05-01,10.51,20044,4.00,24.15,5,1980,5
...,...,...,...,...,...,...,...,...,...
518,United Kingdom,2023-03-01,148.20,282548,-1.00,3.20,3,2023,519
519,United Kingdom,2023-04-01,148.90,283871,0.50,2.50,4,2023,520
520,United Kingdom,2023-05-01,149.50,285053,0.40,1.60,5,2023,521
521,United Kingdom,2023-06-01,151.20,288281,1.10,1.90,6,2023,522


In [9]:
df.drop(['Name',	
        'Period',	
        'House price index All property types',	
        'Percentage change (monthly) All property types',	
        'Percentage change (yearly) All property types'], axis=1, inplace=True)
df = df[['Series', 'Year', 'Month', 'Average price All property types']]
df

Unnamed: 0,Series,Year,Month,Average price All property types
0,1,1980,1,19273
1,2,1980,2,19273
2,3,1980,3,19273
3,4,1980,4,20044
4,5,1980,5,20044
...,...,...,...,...
518,519,2023,3,282548
519,520,2023,4,283871
520,521,2023,5,285053
521,522,2023,6,288281


In [10]:
train = df[(df['Year'] < 2011)]
test = df[df['Year'] >= 2011]

s = setup(data = train, 
          train_size=0.7,
        #   test_data = test, 
          target = 'Average price All property types', 
          fold_strategy = 'timeseries', 
          numeric_features = ['Series'], 
          fold = 3, 
          transform_target = True, 
          session_id = 123,
          data_split_shuffle=False,
          fold_shuffle=False)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Average price All property types
2,Target type,Regression
3,Original data shape,"(372, 4)"
4,Transformed data shape,"(372, 4)"
5,Transformed train set shape,"(260, 4)"
6,Transformed test set shape,"(112, 4)"
7,Numeric features,1
8,Preprocess,True
9,Imputation type,simple


In [11]:
best = compare_models(sort = 'MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lar,Least Angle Regression,9829.0773,232442879.3111,11060.2536,-1.0809,0.2679,0.1846,0.03
knn,K Neighbors Regressor,13027.7733,325006951.3185,15615.4873,-2.8868,0.2979,0.2055,0.0433
et,Extra Trees Regressor,13818.8636,357096144.239,16287.6308,-2.8828,0.3271,0.2254,0.12
lightgbm,Light Gradient Boosting Machine,14183.4899,367663870.7282,16614.876,-3.3447,0.3356,0.2322,0.0767
dt,Decision Tree Regressor,14286.1864,378922380.5572,16703.854,-2.9053,0.3465,0.237,0.03
gbr,Gradient Boosting Regressor,14463.6225,390190086.0551,16858.4292,-2.7515,0.355,0.2414,0.08
rf,Random Forest Regressor,14639.9834,391218773.0602,17007.8831,-3.1839,0.3572,0.2444,0.1567
ada,AdaBoost Regressor,14724.0112,410322073.7861,17001.8684,-2.1882,0.3576,0.2435,0.1733
en,Elastic Net,18367.5013,451724158.3268,19609.905,-131.0969,0.4731,0.3377,0.03
lasso,Lasso Regression,18367.5644,451720322.3938,19609.6659,-131.0969,0.4731,0.3377,0.03


In [12]:
future_dates = pd.date_range(start = '2011-01-01', end = '2030-01-01', freq = 'MS')
future_df = pd.DataFrame()
future_df['Month'] = [i.month for i in future_dates]
future_df['Year'] = [i.year for i in future_dates]    
max_series = df['Series'][df['Year'] < 2010].max()
future_df['Series'] = np.arange(max_series,(max_series+len(future_dates)))
future_df.head()

Unnamed: 0,Month,Year,Series
0,1,2011,360
1,2,2011,361
2,3,2011,362
3,4,2011,363
4,5,2011,364


In [13]:
predictions_future = predict_model(best, data=future_df)

predictions_future.head()

Unnamed: 0,Month,Year,Series,prediction_label,corrected value
0,1,2011,360,126068.601057,415892.601057
1,2,2011,361,126563.381137,416387.381137
2,3,2011,362,127059.070138,416883.070138
3,4,2011,363,127555.667838,417379.667838
4,5,2011,364,128053.174018,417877.174018


In [14]:
import plotly.express as px
concat_df = pd.concat([df[df['Year'] < 2011],predictions_future], axis=0)

fig = px.line(concat_df, x=concat_df['Series'], y=["Average price All property types", "prediction_label"], template = 'plotly_dark')
fig.add_scatter(x=df['Series'], y=df['Average price All property types'])

fig.show()

In [26]:
final_real_val = train['Average price All property types'].iloc[-1]
first_predicted_value = predictions_future['prediction_label'].iloc[1]

print(final_real_val)
predictions_future['corrected value'] = predictions_future['prediction_label'] - first_predicted_value + final_real_val

import plotly.express as px
concat_df = pd.concat([df[df['Year'] < 2011],predictions_future], axis=0)

fig = px.line(concat_df, x=concat_df['Series'], y=["Average price All property types", "corrected value"], template = 'plotly_dark')
fig.add_scatter(x=df['Series'], y=df['Average price All property types'])

fig.show()

168703
