In [1]:
import pandas as pd

# Load the spreadsheet (its first two rows form a two-level column header)
# and keep only the 'norm' sub-column of each quantity of interest.
DATA_PATH = 'data/DatArticle_orig.xls'
NORM_COLUMNS = [('Temperature', 'norm'), ('mu', 'norm'), ('qp', 'norm')]

raw_df = pd.read_excel(DATA_PATH, header=[0, 1])
df = raw_df[NORM_COLUMNS]
print(f'Dataset shape: {df.shape}')
df.head()

Dataset shape: (67, 3)


Unnamed: 0_level_0,Temperature,mu,qp
Unnamed: 0_level_1,norm,norm,norm
0,0.0,0.311549,0.145117
1,0.0,0.304558,0.127066
2,0.0,0.293051,0.088543
3,0.0,0.28092,0.065512
4,0.0,0.269754,0.100429


In [12]:
# Column keys are (top level, sub level) tuples because the frame has a
# two-level header.
features = [('Temperature', 'norm'), ('mu', 'norm')]
target = [('qp', 'norm')]

# Selecting with a list of keys keeps both X and y as DataFrames
# (y therefore has a single column rather than being a Series).
X, y = df[features], df[target]

### ML modeling

In [13]:
# Imports for model fitting, hyperparameter search and evaluation
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error

In [21]:
# Baseline estimator: a LinearRegression with default settings, stored in the
# notebook-global name `model`.
model = LinearRegression()

In [27]:
# Manual K-fold evaluation of `model`: refit on each training split and
# record the validation MSE of that fold.
n_splits = 10
kf = KFold(n_splits=n_splits)

MSE_ERROR = []
for train_index, val_index in kf.split(X):
    # Positional row selection for this fold
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]

    # fit() returns the estimator itself, so prediction can be chained
    y_pred = model.fit(X_train, y_train).predict(X_val)
    mse_error = mean_squared_error(y_val, y_pred)
    MSE_ERROR.append(mse_error)

mean_mse = sum(MSE_ERROR) / n_splits
print(f'MSE error: {MSE_ERROR}')
print(f'Mean MSE error: {mean_mse}')

MSE error: [0.019954861360688626, 0.03421201515663465, 0.013810253340213019, 0.0037748765935606385, 0.023751700297432343, 0.04386578287907599, 0.07576159263792535, 0.1949377283826763, 0.22165901754759043, 0.009139955974456729]
Mean MSE error: 0.06408677841702541


In [24]:
# Manual K-fold cross-validation that does not rely on sklearn's
# cross-validation helpers (only numpy is needed).
def cross_validation(model, X, y, cv=5):
    """Return the mean validation MSE of ``model`` over ``cv`` contiguous folds.

    The rows are cut, in their existing order, into ``cv`` consecutive folds of
    near-equal size. Each fold is used once as the validation set while the
    model is refit on all remaining rows. No shuffling is done, so the result
    is deterministic for a deterministic model.

    Parameters
    ----------
    model : object
        Anything exposing ``fit(X, y)`` and ``predict(X)``. It is refit in
        place on every fold, so on return it holds the fit from the last fold.
    X, y : DataFrame / Series / array-like
        Features and target with the same number of rows. Objects with
        ``.iloc`` are sliced positionally; anything else goes through
        ``numpy.asarray``.
    cv : int, default 5
        Number of folds; must satisfy ``2 <= cv <= len(X)``.

    Returns
    -------
    float
        Mean of the per-fold mean squared errors.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``cv`` is out of range.
    """
    # Local import: the notebook's import cells do not bring numpy into scope.
    import numpy as np

    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            f'X and y must have the same number of rows, got {n_samples} and {len(y)}'
        )
    if not 2 <= cv <= n_samples:
        raise ValueError(
            f'cv must be between 2 and the number of samples ({n_samples}), got {cv}'
        )

    def _rows(data, idx):
        # Positional row selection that works for pandas objects and array-likes
        return data.iloc[idx] if hasattr(data, 'iloc') else np.asarray(data)[idx]

    indices = np.arange(n_samples)
    scores = []
    for val_idx in np.array_split(indices, cv):
        # Training rows are everything outside the current validation fold
        in_train = np.ones(n_samples, dtype=bool)
        in_train[val_idx] = False
        train_idx = indices[in_train]

        model.fit(_rows(X, train_idx), _rows(y, train_idx))
        y_pred = np.asarray(model.predict(_rows(X, val_idx)), dtype=float)
        y_true = np.asarray(_rows(y, val_idx), dtype=float)

        # Bring (n,) and (n, 1) layouts to the same 2-D shape so the
        # subtraction cannot silently broadcast to an (n, n) matrix.
        n_val = len(val_idx)
        residual = y_true.reshape(n_val, -1) - y_pred.reshape(n_val, -1)
        scores.append(np.mean(residual ** 2))
    return np.mean(scores)

SyntaxError: positional argument follows keyword argument (1812637844.py, line 5)

In [25]:
# Hyperparameter tuning helper shared by the model factories
def tune_model(model, X, y, params, cv=10, scoring='neg_mean_squared_error'):
    """Grid-search ``params`` for ``model`` on ``(X, y)`` and return the winner.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    X, y : array-like
        Data the search is fitted on.
    params : dict or list of dict
        Parameter grid passed to ``GridSearchCV``.
    cv : int, default 10
        Forwarded as the ``cv`` argument of ``GridSearchCV``.
    scoring : str, default 'neg_mean_squared_error'
        Forwarded as the ``scoring`` argument of ``GridSearchCV``.

    Returns
    -------
    estimator
        The ``best_estimator_`` attribute of the fitted search.
    """
    search = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, cv=cv)
    # fit() returns the search object itself
    return search.fit(X, y).best_estimator_

# Factory for a tuned support-vector regressor
def svr_model(X, y):
    """Tune an ``SVR`` over C, gamma, epsilon and kernel and return the result.

    The search itself is delegated to ``tune_model`` with its default ``cv``
    and ``scoring``; whatever ``tune_model`` returns is passed straight back.
    """
    search_space = {
        'C': [0.1, 1, 10, 100],
        'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
        'epsilon': [0.01, 0.1, 1, 10, 100],
        'kernel': ['rbf', 'sigmoid', 'linear'],
    }
    return tune_model(SVR(), X, y, search_space)

# Create a function for Random Forest model
def rf_model(X, y, random_state=42):
    """Tune a ``RandomForestRegressor`` by grid search and return the best one.

    Parameters
    ----------
    X, y : array-like
        Training features and target, forwarded to ``tune_model``.
    random_state : int or None, default 42
        Seed given to ``RandomForestRegressor``. A fixed seed makes the
        selected hyperparameters and any later scores reproducible across
        runs; pass ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    estimator
        Whatever ``tune_model`` returns for this grid (called with its
        default ``cv`` and ``scoring``).
    """
    # Forest construction is randomised, so the seed is set on the estimator
    # that the grid search will copy for every candidate.
    model = RandomForestRegressor(random_state=random_state)

    # Define a grid of parameters
    param_grid = {
        'n_estimators': [10, 50],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4, 10]
    }

    return tune_model(model, X, y, param_grid)

# Factory for a fitted linear-regression baseline (nothing to tune)
def lr_model(X, y):
    """Fit a default ``LinearRegression`` on ``(X, y)`` and return it."""
    # fit() hands back the estimator, so construction and fitting chain
    return LinearRegression().fit(X, y)

def cv_scores(model, X, y, cv=10, scoring='neg_mean_squared_error'):
    """Cross-validate ``model`` and return the per-fold scores as a table.

    The values from ``cross_val_score`` are sign-flipped and placed in a
    single column that is always labelled ``'MSE'``, whatever ``scoring`` is.
    Rows are numbered from 1 so the index reads as the fold number.
    """
    raw_scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    fold_numbers = pd.RangeIndex(1, len(raw_scores) + 1)
    return pd.DataFrame({'MSE': -raw_scores}, index=fold_numbers)

In [None]:
# Grid-search an SVR on the full data set, then print per-fold MSE for the
# selected configuration.
# NOTE(review): the hyperparameters are chosen on the same (X, y) and with the
# same cv=10 that cv_scores uses afterwards, so these scores are likely
# optimistic (no nested cross-validation) - confirm whether that is intended.
svr = svr_model(X, y)
scores = cv_scores(svr, X, y)
print(scores)

         MSE
1   0.017778
2   0.056292
3   0.001196
4   0.000420
5   0.003862
6   0.007472
7   0.061502
8   0.183767
9   0.153050
10  0.014392


In [None]:
# Grid-search a random forest on the full data set, then print per-fold MSE
# for the selected configuration.
# NOTE(review): tuning and scoring use the same (X, y) and the same cv=10, so
# these scores are likely optimistic - confirm whether that is intended.
rf = rf_model(X, y)
scores = cv_scores(rf, X, y)
print(scores)


         MSE
1   0.008818
2   0.052411
3   0.000818
4   0.001156
5   0.066159
6   0.030264
7   0.106721
8   0.182675
9   0.177633
10  0.005027


In [26]:
# Linear-regression baseline: lr_model fits on all rows, then cv_scores
# evaluates the estimator with 10-fold cross-validation.
# The printed per-fold values agree with the manual KFold loop earlier in the
# notebook (same numbers, rounded).
lr = lr_model(X, y)
scores = cv_scores(lr, X, y)
print(scores)

         MSE
1   0.019955
2   0.034212
3   0.013810
4   0.003775
5   0.023752
6   0.043866
7   0.075762
8   0.194938
9   0.221659
10  0.009140
