In [5]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor

In [58]:
df = pd.read_csv('../data/export(1).csv')
df

Unnamed: 0,date,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,2021-01-01 00:00:00,3.0,1.7,4.4,,,,22.4,,1012.6,
1,2021-01-02 00:00:00,6.3,2.2,12.8,0.0,,,12.6,,1020.1,
2,2021-01-03 00:00:00,9.4,2.2,18.3,0.0,,,16.4,,1014.7,
3,2021-01-04 00:00:00,10.2,3.9,18.3,0.0,,,7.8,,1018.0,
4,2021-01-05 00:00:00,10.6,2.8,18.9,0.0,,,13.0,,1018.8,
...,...,...,...,...,...,...,...,...,...,...,...
1659,2025-07-18 00:00:00,30.0,24.4,35.0,,,,16.0,,1014.8,
1660,2025-07-19 00:00:00,30.0,25.0,35.0,,,,21.6,,1015.2,
1661,2025-07-20 00:00:00,30.7,27.0,35.0,,,,19.5,,1013.9,
1662,2025-07-21 00:00:00,30.7,25.6,36.2,,,,21.2,,1012.9,


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1664 entries, 0 to 1663
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    1664 non-null   object 
 1   tavg    1664 non-null   float64
 2   tmin    1664 non-null   float64
 3   tmax    1664 non-null   float64
 4   prcp    1221 non-null   float64
 5   snow    0 non-null      float64
 6   wdir    0 non-null      float64
 7   wspd    1664 non-null   float64
 8   wpgt    0 non-null      float64
 9   pres    1664 non-null   float64
 10  tsun    0 non-null      float64
dtypes: float64(10), object(1)
memory usage: 143.1+ KB


In [60]:
df['date'] = pd.to_datetime(df['date'])
df.set_index(df['date'],inplace=True)
df.drop(columns=['date','snow','wdir','wpgt','tsun'],inplace=True)
df['prcp'].ffill().bfill()
df['prcp'] = df['prcp'].fillna(0)
for i in range(1,8):
    df[f"target_{i}"] = df['tmax'].shift(-i)
df = df.ffill()
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1664 entries, 2021-01-01 to 2025-07-22
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tavg      1664 non-null   float64
 1   tmin      1664 non-null   float64
 2   tmax      1664 non-null   float64
 3   prcp      1664 non-null   float64
 4   wspd      1664 non-null   float64
 5   pres      1664 non-null   float64
 6   target_1  1664 non-null   float64
 7   target_2  1664 non-null   float64
 8   target_3  1664 non-null   float64
 9   target_4  1664 non-null   float64
 10  target_5  1664 non-null   float64
 11  target_6  1664 non-null   float64
 12  target_7  1664 non-null   float64
dtypes: float64(13)
memory usage: 182.0 KB


In [61]:
def pct_diff(old,new):
    return (new - old) / old

def compute_rolling(df,horizon,col):
    label = f"rolling_{horizon}_{col}"
    df[label] = df[col].rolling(horizon).mean()
    df[f"{label}_pct"] = pct_diff(df[label],df[col])
    return df
rolling_horizon = [7,14]
for horizon in rolling_horizon:
    for col in ['tavg','tmax','tmin','prcp']:
        df = compute_rolling(df,horizon,col)

df = df.iloc[14:,:]
df = df.ffill()

def expand_mean(df):
    return df.expanding().mean()

for col in ['tavg','tmax','tmin','prcp']:
    df[f"month_avg_{col}"] = df[col].groupby(df.index.month,group_keys=False).transform(expand_mean)
    df[f"day_avg_{col}"] = df[col].groupby(df.index.day_of_year,group_keys=False).transform(expand_mean)

df = df.replace([np.inf, -np.inf],np.nan)
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1583 entries, 2021-03-23 to 2025-07-22
Data columns (total 37 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   tavg                 1583 non-null   float64
 1   tmin                 1583 non-null   float64
 2   tmax                 1583 non-null   float64
 3   prcp                 1583 non-null   float64
 4   wspd                 1583 non-null   float64
 5   pres                 1583 non-null   float64
 6   target_1             1583 non-null   float64
 7   target_2             1583 non-null   float64
 8   target_3             1583 non-null   float64
 9   target_4             1583 non-null   float64
 10  target_5             1583 non-null   float64
 11  target_6             1583 non-null   float64
 12  target_7             1583 non-null   float64
 13  rolling_7_tavg       1583 non-null   float64
 14  rolling_7_tavg_pct   1583 non-null   float64
 15  rolling_7_tmax      

In [62]:
target_cols = [f"target_{i}" for i in range(1,8)]
X = df.drop(columns=target_cols)
y = df[target_cols]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [63]:
models = {
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1,max_iter=10000),
    "XGBoost": XGBRegressor(n_estimators=100,random_state=42),
    "RandomForest": RandomForestRegressor(n_estimators=100,random_state=42)
}

In [64]:
ts = TimeSeriesSplit(n_splits=5)
mae_scores = {}

for name, model in models.items():
    scores = cross_val_score(
        model,
        X_scaled,
        y,
        cv=ts,
        scoring=make_scorer(mean_absolute_error, greater_is_better=False)
    )
    mae_scores[name] = -np.mean(scores)  # Flip sign to get positive MAE

for name, score in mae_scores.items():
    print(f"{name}: MAE = {score:.2f}")

Ridge: MAE = 4.14
Lasso: MAE = 3.97
XGBoost: MAE = 4.54
RandomForest: MAE = 4.24


In [65]:
lasso = MultiOutputRegressor(Lasso(alpha=0.1))
lasso.fit(X_scaled, y)
y_pred = lasso.predict(X_scaled)
for i, target in enumerate(target_cols):
    mae = mean_absolute_error(y[target], y_pred[:, i])
    r2 = r2_score(y[target], y_pred[:, i])
    print(f"Day {i+1}: MAE = {mae:.2f}, R² = {r2:.3f}")

Day 1: MAE = 2.65, R² = 0.836
Day 2: MAE = 3.36, R² = 0.739
Day 3: MAE = 3.66, R² = 0.696
Day 4: MAE = 3.77, R² = 0.682
Day 5: MAE = 3.81, R² = 0.675
Day 6: MAE = 3.84, R² = 0.667
Day 7: MAE = 3.89, R² = 0.660


In [66]:
for i, est in enumerate(lasso.estimators_):
    print(f"\nDay {i+1}:")
    importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Coefficient': est.coef_
    }).sort_values(by='Coefficient', key=abs, ascending=False)
    print(importance_df)


Day 1:
                Feature  Coefficient
0                  tavg     3.548840
2                  tmax     3.459321
22       month_avg_tavg     0.733547
4                  wspd    -0.709219
5                  pres     0.423183
16      rolling_14_tmax     0.257780
7    rolling_7_tavg_pct     0.174918
27         day_avg_tmin     0.163380
13   rolling_7_prcp_pct    -0.150673
26       month_avg_tmin     0.135078
28       month_avg_prcp     0.068125
21  rolling_14_prcp_pct    -0.042808
20      rolling_14_prcp    -0.000000
25         day_avg_tmax     0.000000
23         day_avg_tavg     0.000000
24       month_avg_tmax     0.000000
18      rolling_14_tmin     0.000000
19  rolling_14_tmin_pct    -0.000000
15  rolling_14_tavg_pct    -0.000000
17  rolling_14_tmax_pct     0.000000
1                  tmin     0.000000
14      rolling_14_tavg     0.000000
12       rolling_7_prcp     0.000000
11   rolling_7_tmin_pct     0.000000
10       rolling_7_tmin     0.000000
9    rolling_7_tmax_pct     0.

In [67]:
joblib.dump(lasso,'../app/model/dallas_fw.joblib')

['../app/model/dallas_fw.joblib']