In [None]:
import pandas as pd

def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Location of the CSV; forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(file_path)

# Relative path of the CSV to load (resolved against the kernel's working directory)
file_path = 'data/literacy_rates_clean.csv'
df = load_data(file_path)
# Last expression of the cell: renders the first rows of the loaded frame
df.head()

In [None]:
def eda(df):
    """Print a quick data-quality report for ``df``.

    The report consists of, in order: the ``DataFrame.info()`` summary
    (column dtypes and non-null counts), a separator line, the number of
    fully duplicated rows, and the per-column count of missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to inspect. It is not modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    duplicates = df.duplicated().sum()
    missing_values = df.isnull().sum()

    # DataFrame.info() writes its report to stdout itself and returns None,
    # so it is called directly; printing its return value would only add a
    # stray "None" line to the report.
    df.info()

    print('--' * 20)
    print(f"Number of duplicates: {duplicates}")
    print(f"\nNumber of missing values: {missing_values}")
    
# Print the data-quality report for the loaded DataFrame
eda(df)

In [3]:
from sklearn.model_selection import train_test_split

# Target is the 'Literacy rate' column; the features are every other column of df
y = df['Literacy rate']
X = df.drop(columns='Literacy rate')

# Hold out 20% of the rows for testing; random_state=42 fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline

# Column selectors: numeric dtypes feed the numeric branch,
# object (string) dtypes feed the categorical branch
num_cols = make_column_selector(dtype_include='number')
cat_cols = make_column_selector(dtype_include='object')

# Module-level transformer instances that preprocessing_pipeline() wires together
scaler = StandardScaler()
# handle_unknown='ignore': categories not seen during fit do not raise at transform time;
# sparse_output=False asks for a dense array instead of a sparse matrix
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Missing values are filled in from the 2 nearest neighbours
imputer = KNNImputer(n_neighbors=2)

# Create preprocessing pipeline
def preprocessing_pipeline():
    """Assemble the feature preprocessor from the notebook-level transformers.

    Numeric columns (selected by ``num_cols``) go through ``imputer`` and then
    ``scaler``; categorical columns (selected by ``cat_cols``) go through
    ``encoder``. All five objects are read from the notebook's global scope.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        An unfitted transformer with a 'numeric' and a 'categorical' branch.
    """
    numeric_steps = [('imputer', imputer), ('scaler', scaler)]
    categorical_steps = [('encoder', encoder)]

    # One sub-pipeline per column family, routed by the dtype selectors
    return ColumnTransformer([
        ('numeric', Pipeline(numeric_steps), num_cols),
        ('categorical', Pipeline(categorical_steps), cat_cols),
    ])

# Fit the preprocessor on the training split only, then reuse the fitted
# transformer on the test split so test rows never influence the fitted statistics
preprocessor = preprocessing_pipeline()
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [5]:
import warnings
from lazypredict.Supervised import LazyRegressor

# Silence every Python warning from here on (global side effect on the kernel session)
warnings.filterwarnings('ignore')

# Instantiate the model
lazy_model = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)

# Fit the model
# NOTE(review): the raw X_train / X_test splits are passed here, not the
# *_transformed arrays; presumably LazyRegressor does its own preprocessing — confirm
models, predictions = lazy_model.fit(X_train, X_test, y_train, y_test)

# Keep the best rows of the leaderboard ranked by adjusted R-squared
# (DataFrame.head() with no argument returns the first 5 rows)
top_5 = models.sort_values('Adjusted R-Squared', ascending=False).head()
top_5

100%|██████████| 42/42 [00:04<00:00,  8.84it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000053 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 190
[LightGBM] [Info] Number of data points in the train set: 2642, number of used features: 15
[LightGBM] [Info] Start training from score 0.816639





Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
XGBRegressor,0.94,0.94,0.05,0.19
LGBMRegressor,0.87,0.87,0.08,0.08
HistGradientBoostingRegressor,0.87,0.87,0.08,0.25
RandomForestRegressor,0.79,0.79,0.1,0.53
BaggingRegressor,0.75,0.76,0.11,0.08


In [6]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluate models
def evaluate_models(y_true, y_pred):
    """Compute, print and return four regression metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2)`` in that order. Each value is also printed
        on its own line, rounded to two decimals.
    """
    squared_error = mean_squared_error(y_true, y_pred)

    # Insertion order doubles as both the print order and the return order
    scores = {
        'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
        'Mean Squared Error': squared_error,
        'Root Mean Squared Error': np.sqrt(squared_error),
        'R-Squared': r2_score(y_true, y_pred),
    }

    for label, value in scores.items():
        print(f'{label}: {value:.2f}')

    return tuple(scores.values())

In [7]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

# Candidate estimators to compare on the preprocessed features.
# Kept in `candidate_models` so the `models` leaderboard DataFrame returned by
# LazyRegressor.fit earlier in the notebook is not overwritten by a list.
candidate_models = [
    XGBRegressor(),
    RandomForestRegressor(random_state=42)
]

# The loop variable is an estimator object, not a string, hence `estimator`
# rather than `model_name` (which the registry cell uses for a string).
for estimator in candidate_models:
    estimator.fit(X_train_transformed, y_train)
    y_pred = estimator.predict(X_test_transformed)
    evaluate_models(y_test, y_pred)

    # The label is printed after the metrics it belongs to, followed by a separator
    print(f'Model: {estimator.__class__.__name__}')
    print('--' * 20)

Mean Absolute Error: 0.04
Mean Squared Error: 0.00
Root Mean Squared Error: 0.06
R-Squared: 0.92
Model: XGBRegressor
----------------------------------------
Mean Absolute Error: 0.04
Mean Squared Error: 0.01
Root Mean Squared Error: 0.08
R-Squared: 0.88
Model: RandomForestRegressor
----------------------------------------


In [8]:
from dotenv import load_dotenv
import os
import dagshub

# SECURITY: credentials must never appear in the notebook, not even inside
# commented-out code. An MLflow tracking token was previously committed in
# this cell; treat it as compromised and revoke/rotate it on DagsHub.
# Repository coordinates are read from environment variables instead; keep
# the .env file that defines them out of version control.

# Load environment variables from .env file
load_dotenv()

# Initialize dagshub with values from .env
# (os.getenv returns None for a variable that is not set)
dagshub.init(
    repo_owner=os.getenv('DAGSHUB_REPO_OWNER'),
    repo_name=os.getenv('DAGSHUB_REPO_NAME'),
    mlflow=True
)

In [9]:
import mlflow
import mlflow.sklearn
import mlflow.xgboost
from mlflow.tracking import MlflowClient

# Make 'literacy-rate' the active MLflow experiment for the runs started in the following cells
mlflow.set_experiment('literacy-rate')
# mlflow.set_tracking_uri('https://dagshub.com/erwincarlogonzales/mlflow-basics.mlflow')

# To browse the runs with a local MLflow UI:
#   1. in a terminal, run: mlflow ui
#   2. open http://127.0.0.1:5000

<Experiment: artifact_location='mlflow-artifacts:/01dd4db9f9664444976b43ab86f9165c', creation_time=1733667369822, experiment_id='0', last_update_time=1733667369822, lifecycle_stage='active', name='literacy-rate', tags={}>

In [10]:
# Train and track models
with mlflow.start_run(run_name='XGBRegressor'):

    # Log parameters
    # (these mirror the train/test split settings; XGBRegressor itself is built with defaults)
    mlflow.log_param('test_size', 0.2)
    mlflow.log_param('random_state', 42)

    # Train model
    xgb_regressor = XGBRegressor()
    xgb_regressor.fit(X_train_transformed, y_train)

    # Make predictions and evaluate
    y_pred = xgb_regressor.predict(X_test_transformed)
    mae, mse, rmse, r2 = evaluate_models(y_test, y_pred)

    # Log metrics
    mlflow.log_metrics({
        'Mean Absolute Error': mae,
        'Mean Squared Error': mse,
        'Root Mean Squared Error': rmse,
        'R-Squared': r2
    })

    # Store the fitted model in the run under the 'model' artifact path.
    # register_model() builds the URI 'runs:/<run_id>/model', which only
    # resolves when the run actually contains a model logged at that path.
    mlflow.xgboost.log_model(xgb_regressor, artifact_path='model')

In [11]:
# Train and track the random forest in its own MLflow run
with mlflow.start_run(run_name='RandomForestRegressor'):

    # Log parameters
    # (same values as the train/test split settings; the forest is also seeded with 42)
    mlflow.log_param('test_size', 0.2)
    mlflow.log_param('random_state', 42)

    # Train model
    rf_regressor = RandomForestRegressor(random_state=42)
    rf_regressor.fit(X_train_transformed, y_train)

    # Make predictions and evaluate
    y_pred = rf_regressor.predict(X_test_transformed)
    mae, mse, rmse, r2 = evaluate_models(y_test, y_pred)

    # Log metrics
    mlflow.log_metrics({
        'Mean Absolute Error': mae,
        'Mean Squared Error': mse,
        'Root Mean Squared Error': rmse,
        'R-Squared': r2
    })

    # Store the fitted model in the run under the 'model' artifact path so a
    # 'runs:/<run_id>/model' URI (as built by register_model()) can resolve it.
    mlflow.sklearn.log_model(rf_regressor, artifact_path='model')

In [12]:
from mlflow.tracking import MlflowClient

# Model registry
def register_model(run_id, model_name):
    """Register the model stored under a run's 'model' artifact path.

    Parameters
    ----------
    run_id : str
        ID of the MLflow run; the source URI is ``runs:/<run_id>/model``.
    model_name : str
        Name passed to ``mlflow.register_model`` as the registered-model name.

    Returns
    -------
    The object returned by ``mlflow.register_model`` (the notebook reads its
    ``.version`` attribute).
    """
    return mlflow.register_model(
        model_uri=f'runs:/{run_id}/model',
        name=model_name,
    )

# Promote model
def promote_challenger_to_production(model_name, prod_name):
    """Copy the '@challenger' version of ``model_name`` into ``prod_name``.

    Parameters
    ----------
    model_name : str
        Registered model whose version carrying the 'challenger' alias is the source.
    prod_name : str
        Registered-model name passed as ``dst_name`` to ``copy_model_version``.

    Returns
    -------
    None
        The result of ``copy_model_version`` is not returned.
    """
    challenger_uri = f"models:/{model_name}@challenger"
    MlflowClient().copy_model_version(
        src_model_uri=challenger_uri,
        dst_name=prod_name,
    )

# Retrieve production model
def get_production_champion(prod_name):
    """Load the '@champion' version of the registered model ``prod_name``.

    Parameters
    ----------
    prod_name : str
        Registered-model name; the URI loaded is ``models:/<prod_name>@champion``.

    Returns
    -------
    The model object returned by ``mlflow.xgboost.load_model``; the loader is
    XGBoost-specific, so the champion is expected to be an XGBoost model.
    """
    champion_uri = f"models:/{prod_name}@champion"
    champion = mlflow.xgboost.load_model(champion_uri)
    return champion

In [13]:
# ID of the run whose logged model is registered below.
# NOTE(review): this value is copied by hand, so a fresh Restart & Run All
# (which starts new runs) presumably requires updating it — confirm.
run_id = '26bbc91b56a542c38d5b5afe473601ff' # Get this from MLflow UI
# Registered-model name for the candidate, and the name used for the production copy
model_name = 'XGBRegressor'
prod_name = 'literacy-rate-production'

# Register model and report the version read from the returned object
model_details = register_model(run_id, model_name)
print(f'Registered model version: {model_details.version}')

In [14]:
# Promote the challenger to production. Prerequisite: assign the 'challenger' alias to a version of the registered model before running this.
# promote_challenger_to_production(model_name, prod_name)

In [15]:
# Load the champion model. Prerequisite: assign the 'champion' alias to a version of the 'literacy-rate-production' model before running this.
# champion_model = get_production_champion(prod_name)