In [62]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor

In [63]:
df = pd.read_csv('../data/export.csv')
df

Unnamed: 0,date,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,2021-01-01,4.0,,,,,,15.3,,,
1,2021-01-02,6.5,3.0,12.0,0.0,,,9.6,,1021.5,
2,2021-01-03,9.5,3.0,18.0,0.0,,,9.2,,1016.1,
3,2021-01-04,10.5,4.9,18.0,0.0,,,5.7,,1018.6,
4,2021-01-05,11.2,3.6,19.0,0.0,,,10.3,,1020.0,
...,...,...,...,...,...,...,...,...,...,...,...
1656,2025-07-15,29.6,26.0,35.0,8.3,,,8.4,,1015.1,
1657,2025-07-16,30.3,26.0,35.0,0.0,,,8.5,,,
1658,2025-07-17,31.1,24.8,37.5,0.0,,,10.1,,1013.6,
1659,2025-07-18,32.6,25.1,39.5,0.0,,,8.3,,1014.1,


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1661 entries, 0 to 1660
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    1661 non-null   object 
 1   tavg    1644 non-null   float64
 2   tmin    1640 non-null   float64
 3   tmax    1640 non-null   float64
 4   prcp    1639 non-null   float64
 5   snow    0 non-null      float64
 6   wdir    0 non-null      float64
 7   wspd    1644 non-null   float64
 8   wpgt    3 non-null      float64
 9   pres    1637 non-null   float64
 10  tsun    0 non-null      float64
dtypes: float64(10), object(1)
memory usage: 142.9+ KB


In [64]:
df['date'] = pd.to_datetime(df['date'])
df.set_index(df['date'],inplace=True)
df.drop(columns=['date','snow','wdir','wpgt','tsun'],inplace=True)
df.dropna(subset=['tavg','tmin','tmax','pres','prcp','wspd'],inplace=True)

for i in range(1,8):
    df[f"target_{i}"] = df['tmax'].shift(-i)
df = df.ffill()
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1636 entries, 2021-01-02 to 2025-07-19
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tavg      1636 non-null   float64
 1   tmin      1636 non-null   float64
 2   tmax      1636 non-null   float64
 3   prcp      1636 non-null   float64
 4   wspd      1636 non-null   float64
 5   pres      1636 non-null   float64
 6   target_1  1636 non-null   float64
 7   target_2  1636 non-null   float64
 8   target_3  1636 non-null   float64
 9   target_4  1636 non-null   float64
 10  target_5  1636 non-null   float64
 11  target_6  1636 non-null   float64
 12  target_7  1636 non-null   float64
dtypes: float64(13)
memory usage: 178.9 KB


In [71]:
def pct_diff(old,new):
    return (new - old) / old

def compute_rolling(df,horizon,col):
    label = f"rolling_{horizon}_{col}"
    df[label] = df[col].rolling(horizon).mean()
    df[f"{label}_pct"] = pct_diff(df[label],df[col])
    return df
rolling_horizon = [3,14]
for horizon in rolling_horizon:
    for col in ['tmax','tmin','prcp']:
        df = compute_rolling(df,horizon,col)

df = df.iloc[14:,:]
df = df.ffill()

def expand_mean(df):
    return df.expanding().mean()

for col in ['tmax','tmin','prcp']:
    df[f"month_avg_{col}"] = df[col].groupby(df.index.month,group_keys=False).transform(expand_mean)
    df[f"day_avg_{col}"] = df[col].groupby(df.index.day_of_year,group_keys=False).transform(expand_mean)

df = df.replace([np.inf, -np.inf],np.nan)
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1615 entries, 2021-01-19 to 2025-07-19
Data columns (total 31 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   tavg                 1615 non-null   float64
 1   tmin                 1615 non-null   float64
 2   tmax                 1615 non-null   float64
 3   prcp                 1615 non-null   float64
 4   wspd                 1615 non-null   float64
 5   pres                 1615 non-null   float64
 6   target_1             1615 non-null   float64
 7   target_2             1615 non-null   float64
 8   target_3             1615 non-null   float64
 9   target_4             1615 non-null   float64
 10  target_5             1615 non-null   float64
 11  target_6             1615 non-null   float64
 12  target_7             1615 non-null   float64
 13  rolling_3_tmax       1615 non-null   float64
 14  rolling_3_tmax_pct   1615 non-null   float64
 15  rolling_3_tmin      

In [72]:
target_cols = [f"target_{i}" for i in range(1,8)]
X = df.drop(columns=target_cols)
y = df[target_cols]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [73]:
models = {
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1,max_iter=10000),
    "XGBoost": XGBRegressor(n_estimators=100,random_state=42),
    "RandomForest": RandomForestRegressor(n_estimators=100,random_state=42)
}

In [74]:
ts = TimeSeriesSplit(n_splits=5)
mae_scores = {}

for name, model in models.items():
    scores = cross_val_score(
        model,
        X_scaled,
        y,
        cv=ts,
        scoring=make_scorer(mean_absolute_error, greater_is_better=False)
    )
    mae_scores[name] = -np.mean(scores)  # Flip sign to get positive MAE

for name, score in mae_scores.items():
    print(f"{name}: MAE = {score:.2f}")

Ridge: MAE = 4.09
Lasso: MAE = 3.97
XGBoost: MAE = 4.40
RandomForest: MAE = 4.28


In [75]:
lasso = MultiOutputRegressor(Lasso(alpha=0.1))
lasso.fit(X_scaled, y)
y_pred = lasso.predict(X_scaled)
for i, target in enumerate(target_cols):
    mae = mean_absolute_error(y[target], y_pred[:, i])
    r2 = r2_score(y[target], y_pred[:, i])
    print(f"Day {i+1}: MAE = {mae:.2f}, R² = {r2:.3f}")

Day 1: MAE = 2.59, R² = 0.855
Day 2: MAE = 3.39, R² = 0.753
Day 3: MAE = 3.71, R² = 0.705
Day 4: MAE = 3.86, R² = 0.684
Day 5: MAE = 3.95, R² = 0.667
Day 6: MAE = 4.03, R² = 0.653
Day 7: MAE = 4.08, R² = 0.645


In [54]:
for i, est in enumerate(lasso.estimators_):
    print(f"\nDay {i+1}:")
    importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Coefficient': est.coef_
    }).sort_values(by='Coefficient', key=abs, ascending=False)
    print(importance_df)


Day 1:
                Feature  Coefficient
0                  tavg     5.170387
2                  tmax     1.783077
4                  wspd    -0.703305
18       month_avg_tmax     0.619056
12      rolling_14_tmax     0.373322
21         day_avg_tmin     0.315813
11   rolling_3_prcp_pct    -0.299216
13  rolling_14_tmax_pct     0.262971
17  rolling_14_prcp_pct    -0.250924
5                  pres     0.207538
3                  prcp    -0.186232
20       month_avg_tmin     0.181855
15  rolling_14_tmin_pct     0.045094
9    rolling_3_tmin_pct    -0.012800
22       month_avg_prcp     0.003940
19         day_avg_tmax     0.000000
14      rolling_14_tmin     0.000000
16      rolling_14_prcp     0.000000
6        rolling_3_tmax     0.000000
1                  tmin     0.000000
10       rolling_3_prcp    -0.000000
8        rolling_3_tmin     0.000000
7    rolling_3_tmax_pct    -0.000000
23         day_avg_prcp     0.000000

Day 2:
                Feature  Coefficient
0                  tav

In [56]:
features_to_drop = importance_df[importance_df['Coefficient'] <= 0]['Feature']
df_reduced = df.drop(columns=features_to_drop)
target_cols = [f"target_{i}" for i in range(1, 8)]
X_reduced = df_reduced.drop(columns=target_cols)
y = df_reduced[target_cols]

lasso = MultiOutputRegressor(Lasso(alpha=0.1))
lasso.fit(X_reduced, y)
y_pred = lasso.predict(X_reduced)
for i, target in enumerate(target_cols):
    mae = mean_absolute_error(y[target], y_pred[:, i])
    r2 = r2_score(y[target], y_pred[:, i])
    print(f"Day {i+1}: MAE = {mae:.2f}, R² = {r2:.3f}")

Day 1: MAE = 2.72, R² = 0.841
Day 2: MAE = 3.42, R² = 0.747
Day 3: MAE = 3.74, R² = 0.701
Day 4: MAE = 3.86, R² = 0.684
Day 5: MAE = 3.95, R² = 0.668
Day 6: MAE = 4.02, R² = 0.653
Day 7: MAE = 4.09, R² = 0.644


In [58]:
df_reduced

Unnamed: 0_level_0,tavg,pres,target_1,target_2,target_3,target_4,target_5,target_6,target_7,rolling_3_prcp,rolling_14_tmax,rolling_14_tmax_pct,day_avg_tmax,month_avg_tmin,day_avg_tmin,month_avg_prcp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2021-01-19,11.7,1025.6,9.6,13.0,12.4,13.0,16.0,20.0,14.0,2.266667,12.242857,0.257876,15.40,4.500000,10.00,1.700000
2021-01-20,7.7,1026.6,13.0,12.4,13.0,16.0,20.0,14.0,10.0,4.066667,11.857143,-0.190361,9.60,5.000000,7.00,2.440000
2021-01-21,10.8,1014.8,12.4,13.0,16.0,20.0,14.0,10.0,9.0,4.166667,12.214286,0.064327,13.00,5.366667,7.20,2.083333
2021-01-22,10.3,1016.1,13.0,16.0,20.0,14.0,10.0,9.0,15.0,1.966667,12.457143,-0.004587,12.40,5.785714,8.30,1.814286
2021-01-23,10.8,1018.0,16.0,20.0,14.0,10.0,9.0,15.0,23.0,0.333333,12.814286,0.014493,13.00,6.162500,8.80,1.650000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-07-14,27.3,1016.8,35.0,37.5,39.5,37.2,37.2,37.2,37.2,4.666667,33.142857,-0.004310,34.58,25.708696,25.10,1.005072
2025-07-15,29.6,1015.1,37.5,39.5,37.2,37.2,37.2,37.2,37.2,7.133333,33.285714,0.051502,35.64,25.710791,25.62,1.057554
2025-07-17,31.1,1013.6,39.5,37.2,37.2,37.2,37.2,37.2,37.2,3.733333,33.607143,0.115834,37.70,25.704286,26.30,1.050000
2025-07-18,32.6,1014.1,37.2,37.2,37.2,37.2,37.2,37.2,37.2,2.766667,34.214286,0.154489,37.74,25.700000,26.80,1.042553


In [59]:
df_reduced.to_csv('../data/7_days.csv',index=False)

In [60]:
joblib.dump(lasso,'../app/model/7_days.joblib')

['../app/model/7_days.joblib']