In [1]:
import pandas as pd

def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Source of the CSV data; handed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(file_path)

# Relative path to the cleaned literacy-rate CSV
file_path = 'data/literacy_rates_clean.csv'
# Load the file into the DataFrame that the following cells work on
df = load_data(file_path)
# Last expression of the cell: the notebook displays the first rows
df.head()

Unnamed: 0,Region,Country,Year,Age,Gender,Literacy rate
0,Central and Southern Asia,Afghanistan,2011,15+,female,0.176121
1,Central and Southern Asia,Afghanistan,2011,15+,male,0.454171
2,Central and Southern Asia,Afghanistan,2011,15-24,female,0.321132
3,Central and Southern Asia,Afghanistan,2011,15-24,male,0.618791
4,Central and Southern Asia,Afghanistan,2011,25-64,female,0.084128


In [2]:
def eda(df):
    """Print a quick structural summary of ``df``.

    The report contains, in order: the ``DataFrame.info()`` overview
    (column names, non-null counts, dtypes), a separator line, the number
    of fully duplicated rows, and the per-column count of missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    duplicates = df.duplicated().sum()
    missing_values = df.isnull().sum()

    # DataFrame.info() prints its report itself and returns None, so it is
    # called for its side effect only; printing its return value would add
    # a stray "None" line to the output.
    df.info()

    print('--' * 20)
    print(f"Number of duplicates: {duplicates}")
    # Start the per-column counts on their own line so the rows stay aligned
    print(f"\nNumber of missing values:\n{missing_values}")
    
# Print the structure, duplicate and missing-value report for the loaded data
eda(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3303 entries, 0 to 3302
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Region         3303 non-null   object 
 1   Country        3303 non-null   object 
 2   Year           3303 non-null   int64  
 3   Age            3303 non-null   object 
 4   Gender         3303 non-null   object 
 5   Literacy rate  3303 non-null   float64
dtypes: float64(1), int64(1), object(4)
memory usage: 155.0+ KB
None
----------------------------------------
Number of duplicates: 0

Number of missing values: Region           0
Country          0
Year             0
Age              0
Gender           0
Literacy rate    0
dtype: int64


In [3]:
from sklearn.model_selection import train_test_split

# Separate the target column from the remaining feature columns
target_column = 'Literacy rate'
X = df.drop(columns=target_column)
y = df[target_column]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline

# Column selectors, split by dtype: text (object) columns vs. numeric columns
cat_cols = make_column_selector(dtype_include='object')
num_cols = make_column_selector(dtype_include='number')

# Transformer instances consumed by preprocessing_pipeline()
imputer = KNNImputer(n_neighbors=2)
scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Create preprocessing pipeline
def preprocessing_pipeline():
    """Assemble the column-wise preprocessing transformer.

    Columns picked by ``num_cols`` go through ``imputer`` and then
    ``scaler``; columns picked by ``cat_cols`` go through ``encoder``.
    All five names are module-level objects that must already be defined
    when this function is called.

    Returns
    -------
    ColumnTransformer
        Transformer with a 'numeric' and a 'categorical' branch; it is
        not fitted here.
    """
    # Numeric branch: impute first, then scale
    numeric_steps = [('imputer', imputer), ('scaler', scaler)]
    # Categorical branch: encoding only
    categorical_steps = [('encoder', encoder)]

    return ColumnTransformer([
        ('numeric', Pipeline(numeric_steps), num_cols),
        ('categorical', Pipeline(categorical_steps), cat_cols),
    ])

# Apply the preprocessing pipeline
preprocessor = preprocessing_pipeline()
# Fit on the training features only, and transform them in the same step
X_train_transformed = preprocessor.fit_transform(X_train)
# Reuse the already-fitted preprocessor on the test features (no refit)
X_test_transformed = preprocessor.transform(X_test)

In [5]:
import warnings
from lazypredict.Supervised import LazyRegressor

# Silence all Python warnings. This filter is installed process-wide, so it
# also applies to every cell executed after this one.
warnings.filterwarnings('ignore')

# Instantiate the model
lazy_model = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)

# Fit the model
# NOTE(review): the raw X_train / X_test are passed here, not the
# X_*_transformed arrays — presumably LazyRegressor does its own
# preprocessing; confirm against the lazypredict documentation.
models, predictions = lazy_model.fit(X_train, X_test, y_train, y_test)

# Rank the results by adjusted R-squared (best first) and keep the first
# five rows; the bare `top_5` expression makes the notebook display them.
top_5 = models.sort_values('Adjusted R-Squared', ascending=False).head()
top_5

100%|██████████| 42/42 [00:04<00:00,  8.98it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000046 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 190
[LightGBM] [Info] Number of data points in the train set: 2642, number of used features: 15
[LightGBM] [Info] Start training from score 0.816639





Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
XGBRegressor,0.94,0.94,0.05,0.19
LGBMRegressor,0.87,0.87,0.08,0.08
HistGradientBoostingRegressor,0.87,0.87,0.08,0.23
RandomForestRegressor,0.79,0.79,0.1,0.53
BaggingRegressor,0.75,0.76,0.11,0.08


In [11]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluate models
def evaluate_models(y_true, y_pred):
    """Compute and print regression metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2)`` in that order.
    """
    # RMSE is derived from MSE, so compute MSE once up front
    mse = mean_squared_error(y_true, y_pred)
    scores = {
        'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
        'Mean Squared Error': mse,
        'Root Mean Squared Error': np.sqrt(mse),
        'R-Squared': r2_score(y_true, y_pred),
    }

    # One line per metric, rounded to two decimals
    for label, value in scores.items():
        print(f'{label}: {value:.2f}')

    # Dict insertion order gives (mae, mse, rmse, r2)
    return tuple(scores.values())

In [12]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

# Fit each candidate regressor on the transformed training data and report
# its metrics on the transformed test data
models = [XGBRegressor(), RandomForestRegressor(random_state=42)]

for estimator in models:
    # fit() returns the estimator itself, so the prediction can be chained
    y_pred = estimator.fit(X_train_transformed, y_train).predict(X_test_transformed)
    evaluate_models(y_test, y_pred)

    # The model label and separator are printed after the metrics they belong to
    print(f'Model: {type(estimator).__name__}')
    print('--' * 20)

Mean Absolute Error: 0.04
Mean Squared Error: 0.00
Root Mean Squared Error: 0.06
R-Squared: 0.92
Model: XGBRegressor
----------------------------------------
Mean Absolute Error: 0.04
Mean Squared Error: 0.01
Root Mean Squared Error: 0.08
R-Squared: 0.88
Model: RandomForestRegressor
----------------------------------------


In [13]:
import mlflow
import mlflow.sklearn
import mlflow.xgboost
from mlflow.tracking import MlflowClient

# Set up MLflow tracking
# Prerequisite (presumably needed before this cell runs): start the tracking
# server by running `mlflow ui` in a terminal, then open
# http://127.0.0.1:5000 to browse the logged runs.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Literacy Rate Prediction")

<Experiment: artifact_location='mlflow-artifacts:/599839046319326436', creation_time=1733566381090, experiment_id='599839046319326436', last_update_time=1733566381090, lifecycle_stage='active', name='Literacy Rate Prediction', tags={}>

In [15]:
# Train the XGBoost regressor inside its own MLflow run and record the results
with mlflow.start_run(run_name='XGBRegressor'):

    # Record the experiment settings with the run
    for param_key, param_value in (('test_size', 0.2), ('random_state', 42)):
        mlflow.log_param(param_key, param_value)

    # Fit on the preprocessed training features
    xgb_regressor = XGBRegressor()
    xgb_regressor.fit(X_train_transformed, y_train)

    # Score the held-out set; evaluate_models also prints the metrics
    y_pred = xgb_regressor.predict(X_test_transformed)
    mae, mse, rmse, r2 = evaluate_models(y_test, y_pred)

    # Store the metrics under the same labels that were printed
    metric_labels = (
        'Mean Absolute Error',
        'Mean Squared Error',
        'Root Mean Squared Error',
        'R-Squared',
    )
    mlflow.log_metrics(dict(zip(metric_labels, (mae, mse, rmse, r2))))

Mean Absolute Error: 0.04
Mean Squared Error: 0.00
Root Mean Squared Error: 0.06
R-Squared: 0.92
🏃 View run XGBRegressor at: http://localhost:5000/#/experiments/599839046319326436/runs/b9916c0d12274b4798bc1877eb408795
🧪 View experiment at: http://localhost:5000/#/experiments/599839046319326436


In [16]:
# Train the random forest regressor inside its own MLflow run and record the results
with mlflow.start_run(run_name='RandomForestRegressor'):

    # Record the experiment settings with the run
    for param_key, param_value in (('test_size', 0.2), ('random_state', 42)):
        mlflow.log_param(param_key, param_value)

    # Fit on the preprocessed training features (seeded for repeatable trees)
    rf_regressor = RandomForestRegressor(random_state=42)
    rf_regressor.fit(X_train_transformed, y_train)

    # Score the held-out set; evaluate_models also prints the metrics
    y_pred = rf_regressor.predict(X_test_transformed)
    mae, mse, rmse, r2 = evaluate_models(y_test, y_pred)

    # Store the metrics under the same labels that were printed
    metric_labels = (
        'Mean Absolute Error',
        'Mean Squared Error',
        'Root Mean Squared Error',
        'R-Squared',
    )
    mlflow.log_metrics(dict(zip(metric_labels, (mae, mse, rmse, r2))))

Mean Absolute Error: 0.04
Mean Squared Error: 0.01
Root Mean Squared Error: 0.08
R-Squared: 0.88
🏃 View run RandomForestRegressor at: http://localhost:5000/#/experiments/599839046319326436/runs/149ae4c01f224e39a38fc463f2867a1d
🧪 View experiment at: http://localhost:5000/#/experiments/599839046319326436
