In [8]:
#imports for data exploration and analysis
import pandas as pd
import numpy as np
#imports for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
#importing models
from sklearn.ensemble import RandomForestRegressor, ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor

from sklearn.tree import DecisionTreeClassifier

from sklearn.svm import SVR

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score


from sklearn.preprocessing import OneHotEncoder

import pickle

from urllib.parse import urlparse

In [10]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("wine-track")

AttributeError: partially initialized module 'mlflow' has no attribute 'version' (most likely due to a circular import)

In [None]:
#reading the data with a function to avoid too much repetition
def read_dafaframe(filename):
    df = pd.read_csv(filename)

    encoder = OneHotEncoder(sparse=False)

    encoded_types = encoder.fit_transform(df['type'].values.reshape(-1, 1))

    label= ['red','white']
    wine_types = pd.DataFrame(encoded_types, columns= label)

    df['red_wine'] = wine_types['red']

    for col, value in df.items():
        if col != 'type':
            df[col] = df[col].fillna(df[col].mean())
        

    return df

In [None]:
train_df = read_dafaframe('./wine_data/train_wine_data.csv')
validation_df = read_dafaframe('./wine_data/test_wine_data.csv')

In [None]:
trained_dict =  train_df.drop(columns=['type','quality'])
val_dict =  validation_df.drop(columns=['type','quality'])

In [None]:
X_train = trained_dict
X_val = val_dict

In [None]:
target = 'quality'
y_train = train_df[target].values

y_val = validation_df[target].values

In [None]:
rfc = RandomForestClassifier(n_estimators=100)

rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_val)

rmse = mean_squared_error(y_val,y_pred_rfc,squared=False)
accuracy = r2_score(y_val, y_pred_rfc)

print('RandomForestClassifier')
print(f'RMSE: {rmse}')
print(f'Accuracy: {accuracy}')

RandomForestClassifier
RMSE: 0.06681016826772575
Accuracy: 0.9941457731172564


In [None]:
etc = ExtraTreesClassifier(n_estimators=100)

etc.fit(X_train, y_train)
y_pred_etc = etc.predict(X_val)

rmse = mean_squared_error(y_val,y_pred_etc,squared=False)
accuracy = r2_score(y_val,y_pred_etc)

print('ExtraTreesClassifier')
print (f'RMSE: ',{rmse})
print(f'Accuracy: ',{accuracy})

ExtraTreesClassifier
RMSE:  {0.06446521696900595}
Accuracy:  {0.9945495129022732}


In [None]:
with open('models/etc.bin','wb') as f_out:
    pickle.dump((etc),f_out)

In [None]:
with mlflow.start_run():
    mlflow.set_tag("developer","Sven")
    mlflow.log_param("train-data-path", "../wine_data/train_wine_data.csv")
    mlflow.log_param("valid-data-path", "../wine_data/test_wine_data.csv")

    n_estimators = 100
    mlflow.log_param('n_estimators',n_estimators)

    etc = ExtraTreesClassifier(n_estimators=n_estimators)
    
    etc.fit(X_train, y_train)
    y_pred_etc = etc.predict(X_val)

    rmse = mean_squared_error(y_val,y_pred_etc,squared=False)
    accuracy = r2_score(y_val,y_pred_etc)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("accuracy", accuracy)

In [None]:
import xgboost as xgb

from hyperopt import  fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [None]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

NameError: name 'X_train' is not defined

In [None]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        accuracy = r2_score(y_val,y_pred)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("accuracy", accuracy)

    return {'loss': rmse, 'accuracy':accuracy, 'status': STATUS_OK}




In [None]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:linear',
    'seed': 42
}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

job exception: name 'mlflow' is not defined



  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]


NameError: name 'mlflow' is not defined

In [None]:
params ={
'learning_rate':	0.20830135987468326,
'max_depth':	10,
'min_child_weight':	9.327167178212004,
'objective':	'reg:linear',
'reg_alpha':	0.012980494736910292,
'reg_lambda':	0.0559890740248936,
'seed':	42
}

mlflow.xgboost.autolog()

booster = xgb.train(
    params=params,
    dtrain=train,
    num_boost_round=1000,
    evals=[(valid, 'validation')],
    early_stopping_rounds=50
)


mlflow.xgboost.log_model(booster, artifact_path="models")

2024/01/18 17:40:47 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '8b4f4fbb54514bdebea181d89612aa4c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:4.28088
[1]	validation-rmse:3.40449
[2]	validation-rmse:2.71342
[3]	validation-rmse:2.16790
[4]	validation-rmse:1.73820
[5]	validation-rmse:1.39954
[6]	validation-rmse:1.13357
[7]	validation-rmse:0.92946
[8]	validation-rmse:0.76743
[9]	validation-rmse:0.64408
[10]	validation-rmse:0.54921
[11]	validation-rmse:0.47781
[12]	validation-rmse:0.42098
[13]	validation-rmse:0.38219
[14]	validation-rmse:0.35126
[15]	validation-rmse:0.32875
[16]	validation-rmse:0.31109
[17]	validation-rmse:0.29743
[18]	validation-rmse:0.28597
[19]	validation-rmse:0.27277
[20]	validation-rmse:0.26657
[21]	validation-rmse:0.25887
[22]	validation-rmse:0.25434
[23]	validation-rmse:0.25194
[24]	validation-rmse:0.24965
[25]	validation-rmse:0.24844
[26]	validation-rmse:0.24239
[27]	validation-rmse:0.24083
[28]	validation-rmse:0.23946
[29]	validation-rmse:0.23356
[30]	validation-rmse:0.22863
[31]	validation-rmse:0.22292
[32]	validation-rmse:0.21590
[33]	validation-rmse:0.21080
[34]	validation-rmse:0.2

<mlflow.models.model.ModelInfo at 0x158080e4c40>