In [1]:
import pandas as pd
from sklearn.datasets import fetch_openml

# Download the data
digital_currency = fetch_openml(
    "Digital-currency---Time-series", as_frame=True, parser="pandas"
)

# Convert the data to a dataframe
data = (
    digital_currency
    .frame
    .drop(columns=["open_SAR", "high_SAR", "low_SAR", "close_SAR"])
    .rename(columns={"Unnamed:_0": "date"})
    .set_index("date")
)

# Separate features and target
target = "close_USD"
y = data[target]
X = data.drop(target, axis=1)

# Print full data
data.head()

Unnamed: 0_level_0,open_USD,high_USD,low_USD,close_USD,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-01-30,34246.28,34933.0,32825.0,34218.54,43072
2021-01-29,33368.18,38531.9,31915.4,34252.2,231827
2021-01-28,30362.19,33783.98,29842.1,33364.86,92621
2021-01-27,32464.01,32557.29,29241.72,30366.15,95911
2021-01-26,32254.19,32921.88,30837.37,32467.77,84972


In [15]:
data_soy = pd.read_csv("../data library/Soja_v1.csv", sep = ";")
data_soy.set_index("Data", inplace=True)
data_soy.drop(columns=["Cultivo", "Id"], inplace=True)

target = "Estadual"
y = data_soy[target]
X = data_soy.drop(target, axis = 1)

data_soy.head()

Unnamed: 0_level_0,Estadual,País,Último,Abertura,Máxima,Mínima
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-09-01,123.7153,123.6574,1.06888,1.00012,1.06962,99500.0
2024-08-01,117.5516,119.2007,98200.0,1.03,1.045,93625.0
2024-07-01,120.0369,121.7345,1.0285,1.14825,1.1915,1.0205
2024-06-01,118.9727,121.3981,1.1505,1.20425,1.2055,1.14625
2024-05-01,114.0022,118.2335,1.2035,1.161,1.2565,1.15625


In [16]:
from sklearn.feature_selection import SelectKBest, f_regression

# Select top k features using f_regression
k = 2  # You can adjust this number based on your domain knowledge
selector = SelectKBest(score_func=f_regression, k=k)
X_selected = selector.fit_transform(X, y)

In [17]:
# Indices of top k features
top_indices = selector.get_support(indices=True)

# Top k feature names
top_features = selector.feature_names_in_[top_indices]

# Top k scores
top_scores = selector.scores_[top_indices]

# Print the names and scores of top k features
print(f"{'Feature':<10} Score")
print(f"{'-------':<10} ---------")
for feature, score in sorted(zip(top_features, top_scores)):
    print(f"{feature:<10} {score:.2f}")

Feature    Score
-------    ---------
Mínima     40.08
País       38150.27


In [18]:
from sklearn.model_selection import TimeSeriesSplit, cross_validate
from sklearn.linear_model import LinearRegression

# Splitting the data into train and test sets
train_size = int(X_selected.shape[0] * 0.8)
X_train, y_train = X_selected[:train_size], y[:train_size]
X_test, y_test = X_selected[train_size:], y[train_size:]

# Splitting the train set into train and validation sets
tscv = TimeSeriesSplit(n_splits=5)  # You can adjust the number of splits

# Model
model = LinearRegression()

# Cross-validation
cv = cross_validate(
    model,
    X_train,
    y_train,
    cv=tscv,
    scoring=[
        "neg_root_mean_squared_error",
        "r2",
        "neg_mean_absolute_percentage_error"
    ],
)

# Printing the results
print("RMSE")
for i, val in enumerate(cv["test_neg_root_mean_squared_error"]):
    print(f"Fold {i+1}: {-val:.3f}")
print(f"Mean: {-cv['test_neg_root_mean_squared_error'].mean():.3f}")

RMSE
Fold 1: 7.957
Fold 2: 3.218
Fold 3: 2.558
Fold 4: 3.079
Fold 5: 2.245
Mean: 3.811


In [19]:
from sklearn.linear_model import Lasso, Ridge

# Create Lasso and Ridge models
lasso = Lasso(alpha=0.1, max_iter=3000, random_state=42)
ridge = Ridge(alpha=0.1, random_state=42)

# Fit models to training data
lasso.fit(X_train, y_train)
ridge.fit(X_train, y_train)

# Print the coefficients and intercepts from both models
print("Lasso")
print(f"Coefficients: {lasso.coef_}")
print(f"Intercept: {lasso.intercept_:.4f}\n")

print("Ridge")
print(f"Coefficients: {ridge.coef_}")
print(f"Intercept: {ridge.intercept_:.4f}")

Lasso
Coefficients: [9.82709732e-01 5.22940389e-06]
Intercept: -2.6662

Ridge
Coefficients: [9.82781868e-01 5.25484406e-06]
Intercept: -2.6732


In [20]:
import math
import numpy as np
from sklearn.metrics import mean_squared_error

# Initializations
best_alpha = 1.0
best_mse = math.inf

# Create a list of alphas to test against
alpha_values = np.linspace(0.1, 1.0, 10)

for alpha in alpha_values:
    # Model
    model = Ridge(alpha=alpha, random_state=42)
    # Fit
    model.fit(X_train, y_train)
    # Predict
    y_pred = model.predict(X_test)
    # Evaluate
    mse = mean_squared_error(y_test, y_pred)
    # Update results if a better one is achieved
    if mse < best_mse:
        best_alpha = alpha
        best_mse = mse

print(f"Best alpha: {best_alpha}")

Best alpha: 0.1
