In [5]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import numpy as np
import joblib

In [3]:
df = pd.read_csv('../data/export.csv')
df

Unnamed: 0,date,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,2021-01-01,4.0,,,,,,15.3,,,
1,2021-01-02,6.5,3.0,12.0,0.0,,,9.6,,1021.5,
2,2021-01-03,9.5,3.0,18.0,0.0,,,9.2,,1016.1,
3,2021-01-04,10.5,4.9,18.0,0.0,,,5.7,,1018.6,
4,2021-01-05,11.2,3.6,19.0,0.0,,,10.3,,1020.0,
...,...,...,...,...,...,...,...,...,...,...,...
1656,2025-07-15,29.6,26.0,35.0,8.3,,,8.4,,1015.1,
1657,2025-07-16,30.3,26.0,35.0,0.0,,,8.5,,,
1658,2025-07-17,31.1,24.8,37.5,0.0,,,10.1,,1013.6,
1659,2025-07-18,32.6,25.1,39.5,0.0,,,8.3,,1014.1,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1661 entries, 0 to 1660
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    1661 non-null   object 
 1   tavg    1644 non-null   float64
 2   tmin    1640 non-null   float64
 3   tmax    1640 non-null   float64
 4   prcp    1639 non-null   float64
 5   snow    0 non-null      float64
 6   wdir    0 non-null      float64
 7   wspd    1644 non-null   float64
 8   wpgt    3 non-null      float64
 9   pres    1637 non-null   float64
 10  tsun    0 non-null      float64
dtypes: float64(10), object(1)
memory usage: 142.9+ KB


In [229]:
df['date'] = pd.to_datetime(df['date'])
df.set_index(df['date'],inplace=True)
df.drop(columns=['date','snow','wdir','wpgt','tsun'],inplace=True)
df.dropna(subset=['tavg','tmin','tmax','pres','prcp','wspd'],inplace=True)
df['target'] = df.shift(-1)['tmax']
df = df.ffill()
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1635 entries, 2021-01-02 to 2025-07-18
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tavg    1635 non-null   float64
 1   tmin    1635 non-null   float64
 2   tmax    1635 non-null   float64
 3   prcp    1635 non-null   float64
 4   wspd    1635 non-null   float64
 5   pres    1635 non-null   float64
 6   target  1635 non-null   float64
dtypes: float64(7)
memory usage: 102.2 KB


In [230]:
models = {
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.1),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42)
}

In [231]:
X = df[['tavg', 'tmax', 'tmin', 'pres','prcp','wspd']]
y = df['target']
ts = TimeSeriesSplit(n_splits=5)
mae_scores = {}

for name, model in models.items():
    scores = cross_val_score(
        model,
        X,
        y,
        cv=ts,
        scoring=make_scorer(mean_absolute_error, greater_is_better=False)
    )
    mae_scores[name] = -np.mean(scores)  # Flip sign to get positive MAE

for name, score in mae_scores.items():
    print(f"{name}: MAE = {score:.2f}")

Ridge: MAE = 2.78
Lasso: MAE = 2.77
RandomForest: MAE = 2.83
XGBoost: MAE = 3.08


In [232]:
X = df[['tavg', 'tmax', 'tmin', 'pres']]
y = df['target']
lasso = Lasso(alpha=0.1)
lasso.fit(X, y)
y_pred = lasso.predict(X)
mae = mean_absolute_error(y, y_pred)
r2 = r2_score(y, y_pred)

print(f"MAE  = {mae:.2f}")
print(f"R²   = {r2:.3f}")

MAE  = 2.72
R²   = 0.842


In [121]:
# coef = pd.Series(lasso.coef_, index=X)
# coef_sorted = coef.reindex(coef.abs().sort_values(ascending=False).index)
# print(coef_sorted)

tavg    0.682214
tmax    0.309500
pres    0.192187
tmin    0.001802
dtype: float64


In [233]:
diff = pd.DataFrame({
    'Actual': y,
    'Predicted': y_pred,
    'Difference': y - y_pred
})

print(diff)

            Actual  Predicted  Difference
date                                     
2021-01-02    18.0  12.822651    5.177349
2021-01-03    18.0  15.691562    2.308438
2021-01-04    19.0  16.856698    2.143302
2021-01-05    15.0  17.911696   -2.911696
2021-01-06     8.0  15.641981   -7.641981
...            ...        ...         ...
2025-07-13    33.0  31.743912    1.256088
2025-07-14    35.0  32.644279    2.355721
2025-07-15    37.5  34.511802    2.988198
2025-07-17    39.5  36.020378    3.479622
2025-07-18    39.5  37.760226    1.739774

[1635 rows x 3 columns]


In [223]:
df

Unnamed: 0_level_0,tavg,tmin,tmax,prcp,wspd,pres,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-02,6.5,3.0,12.0,0.0,9.6,1021.5,18.0
2021-01-03,9.5,3.0,18.0,0.0,9.2,1016.1,18.0
2021-01-04,10.5,4.9,18.0,0.0,5.7,1018.6,19.0
2021-01-05,11.2,3.6,19.0,0.0,10.3,1020.0,15.0
2021-01-06,11.6,6.9,15.0,4.1,22.2,1013.2,8.0
...,...,...,...,...,...,...,...
2025-07-13,26.6,24.0,32.0,10.2,10.4,1016.2,33.0
2025-07-14,27.3,22.0,33.0,2.9,8.3,1016.8,35.0
2025-07-15,29.6,26.0,35.0,8.3,8.4,1015.1,37.5
2025-07-17,31.1,24.8,37.5,0.0,10.1,1013.6,39.5


In [235]:
df.to_csv('../data/meteostat.csv',index=False)

In [236]:
joblib.dump(lasso,'../app/model/lasso_model.joblib')

['../app/model/lasso_model.joblib']