# Ploomber: First Steps

In [1]:
# Import required libraries
import os
from ploomber.micro import dag_from_functions, grid

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve,
)
import joblib

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create folders
os.makedirs('data', exist_ok=True)
os.makedirs('models', exist_ok=True)

## Declare Steps

In [3]:
# Constants
features_cols = [
    "PassengerId",
    "Pclass",
    "Name",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Ticket",
    "Fare",
    "Cabin",
    "Embarked"
]

target_col = "Survived"
should_be_index = ["PassengerId"]

In [4]:
# Utility functions
def preprocess_generator(X_train):
    # Example of numerical and categorical variables
    numeric_features = X_train.select_dtypes(
        include=["int64", "float64"]
    ).columns.tolist()
    categorical_features = X_train.select_dtypes(include=["object"]).columns.tolist()

    # Create transformers for numerical and categorical variables
    numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
    )
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    # Create ColumnTransformer to apply transformations in a pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )

    # Fit preprocessor on training data
    preprocessor.fit(X_train)

    return preprocessor


def preprocess_applier(preprocessor, X_data):
    # Apply preprocessing to the data
    X_data_processed = preprocessor.transform(X_data)

    # Get column names after preprocessing
    numeric_feature_names = preprocessor.transformers_[0][-1]
    categorical_feature_names = preprocessor.transformers_[1][-1]

    # Get unique categories of categorical variables
    unique_categories = preprocessor.named_transformers_["cat"]["onehot"].categories_

    # Create column names after OneHotEncoding
    encoded_categorical_feature_names = []
    for i, categories in enumerate(unique_categories):
        for category in categories:
            encoded_categorical_feature_names.append(
                f"{categorical_feature_names[i]}_{category}"
            )

    # Convert sparse matrix to Pandas DataFrame
    transformed_df = pd.DataFrame(
        X_data_processed.toarray(),
        columns=numeric_feature_names + encoded_categorical_feature_names,
    )

    return transformed_df

def evaluate_model(y_pred, y_test):
    """
    Evaluates a classification model using accuracy, precision, recall, and F1-score.

    Parameters:
        y_pred: numpy array or pandas Series, predicted labels.
        y_test: numpy array or pandas Series, true labels.

    Returns:
        dict: dictionary containing evaluation metrics.
    """

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Return evaluation metrics
    return {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
    }


In [5]:
# Read Data
def data():
    df = pd.read_csv("https://raw.githubusercontent.com/fralfaro/ploomber-example/main/data/train.csv")
    
    assert not df.empty, "empty dataframe"
    assert set(df.columns.tolist()) == set(features_cols + [target_col]), "invalid columns"
    assert df.duplicated().sum() == 0, "duplicated data"

    
    return df.set_index(should_be_index)

# Preprocessing Data
def preprocessing_pandas(data):
    # Drop irrelevant columns:
    data = data.drop(columns=["Name", "Ticket"],axis=1)
    data['Cabin'] = data['Cabin'].fillna('-').apply(lambda x: x[0])

    # Handle missing values in Age:
    average_age = round(data["Age"].mean())
    data["Age"] = data["Age"].fillna(average_age)

    # Convert specified columns to string type:
    columns_to_convert = ["Pclass", "SibSp", "Parch"]
    data[columns_to_convert] = data[columns_to_convert].astype(str)
    
    return data

# Split Dataset
def split_train_test(preprocessing_pandas):
    # Select features and target variable
    features = [col for col in preprocessing_pandas.columns if col != target_col]
    X = preprocessing_pandas[features]
    y = preprocessing_pandas[target_col]

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    
    return X_train, X_test, y_train, y_test

# Models preprocessing
def preprocessing_sklearn(split_train_test):
    
    X_train, X_test, y_train, y_test = split_train_test
    
    preprocessor = preprocess_generator(X_train)
    X_train_norm = preprocess_applier(preprocessor, X_train)
    X_test_norm = preprocess_applier(preprocessor, X_test)
    
    return X_train_norm, X_test_norm, y_train, y_test, preprocessor
    

# Machine Learning Models
@grid(model=[RandomForestClassifier],
      n_estimators=[50, 100, 200],
      criterion=["gini", "entropy"],
)
def fit_random_forest(preprocessing_sklearn, model, n_estimators, criterion):    
    X_train, X_test, y_train, y_test, _ = preprocessing_sklearn
    clf = model(n_estimators=n_estimators, criterion=criterion)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    result = evaluate_model(y_test, y_pred)
    result['clf'] = clf
    return result

@grid(model=[AdaBoostClassifier],
      n_estimators=[50, 100, 200],
      learning_rate=[1.0, 2.0],
)
def fit_ada_boost(preprocessing_sklearn, model, n_estimators, learning_rate):    
    X_train, X_test, y_train, y_test, _ = preprocessing_sklearn
    clf = model(n_estimators=n_estimators, learning_rate=learning_rate)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    result = evaluate_model(y_test, y_pred)
    result['clf'] = clf
    return result

@grid(model=[lightgbm.LGBMClassifier],
      n_estimators=[50, 100, 200],
      learning_rate=[0.1, 0.01])  # Common learning rates for LightGBM
def fit_lightgbm(preprocessing_sklearn, model, n_estimators, learning_rate):
    X_train, X_test, y_train, y_test, _ = preprocessing_sklearn
    clf = model(n_estimators=n_estimators, learning_rate=learning_rate)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    result = evaluate_model(y_test, y_pred)
    result['clf'] = clf
    return result


## Create Pipeline

In [6]:
dag = dag_from_functions([
    data,
    preprocessing_pandas,
    split_train_test,
    preprocessing_sklearn,
    fit_random_forest, fit_ada_boost,

])

In [7]:
dag.plot()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 15966.90it/s]


## Run Pipeline

In [8]:
dag.build()

0it [00:00, ?it/s]


name,Ran?,Elapsed (s),Percentage
data,False,0,0
preprocessing_pandas,False,0,0
split_train_test,False,0,0
preprocessing_sklearn,False,0,0
fit_random_forest-0,False,0,0
fit_random_forest-1,False,0,0
fit_random_forest-2,False,0,0
fit_random_forest-3,False,0,0
fit_random_forest-4,False,0,0
fit_random_forest-5,False,0,0


In [9]:
temp = [(name,task) for name, task in dag.items() ]
temp

[('data', _PythonCallableNoValidation: data -> File('output\\data')),
 ('preprocessing_pandas',
  _PythonCallableNoValidation: preprocessing_pandas -> File('output\\preprocessing_pandas')),
 ('split_train_test',
  _PythonCallableNoValidation: split_train_test -> File('output\\split_train_test')),
 ('preprocessing_sklearn',
  _PythonCallableNoValidation: preprocessing_sklearn -> File('output\\preprocessing_sklearn')),
 ('fit_random_forest-0',
  _PythonCallableNoValidation: fit_random_forest-0 -> File('output\\fit_random_forest-0')),
 ('fit_random_forest-1',
  _PythonCallableNoValidation: fit_random_forest-1 -> File('output\\fit_random_forest-1')),
 ('fit_random_forest-2',
  _PythonCallableNoValidation: fit_random_forest-2 -> File('output\\fit_random_forest-2')),
 ('fit_random_forest-3',
  _PythonCallableNoValidation: fit_random_forest-3 -> File('output\\fit_random_forest-3')),
 ('fit_random_forest-4',
  _PythonCallableNoValidation: fit_random_forest-4 -> File('output\\fit_random_forest-

In [10]:
outputs = [task.load() for name, task in dag.items() if name.startswith('fit_')]
metrics = pd.DataFrame(outputs).sort_values('Precision',ascending= False)
metrics

Unnamed: 0,Accuracy,Precision,Recall,F1-Score,clf
6,0.815642,0.783784,0.773333,0.778523,"(DecisionTreeClassifier(max_depth=1, random_st..."
8,0.821229,0.783784,0.783784,0.783784,"(DecisionTreeClassifier(max_depth=1, random_st..."
10,0.815642,0.77027,0.780822,0.77551,"(DecisionTreeClassifier(max_depth=1, random_st..."
2,0.798883,0.743243,0.763889,0.753425,"(DecisionTreeClassifier(max_features='sqrt', r..."
0,0.798883,0.716216,0.779412,0.746479,"(DecisionTreeClassifier(max_features='sqrt', r..."
3,0.793296,0.716216,0.768116,0.741259,"(DecisionTreeClassifier(criterion='entropy', m..."
4,0.782123,0.716216,0.746479,0.731034,"(DecisionTreeClassifier(max_features='sqrt', r..."
5,0.787709,0.716216,0.757143,0.736111,"(DecisionTreeClassifier(criterion='entropy', m..."
1,0.782123,0.702703,0.753623,0.727273,"(DecisionTreeClassifier(criterion='entropy', m..."
7,0.782123,0.702703,0.753623,0.727273,"(DecisionTreeClassifier(max_depth=1, random_st..."


In [13]:
# get model
model = metrics.iloc[0]['clf']

# get preprocessor
outputs = [task.load() for name, task in dag.items() if name == 'preprocessing_sklearn']
preprocessor = outputs[0][4]

In [14]:
# get training values
X_test  = temp[3][1].load()[1]
y_test  = temp[3][1].load()[3]

In [15]:
y_pred = model.predict(X_test)

In [16]:
y_score = model.predict_proba(X_test)[:, 1]

importances = model.feature_importances_
feature_names  = X_test.columns

df_importances = pd.DataFrame({
    'feature_names':feature_names,
    'importances':importances
}).sort_values('importances',ascending=False)

In [17]:
# guardar informacion
metrics.to_csv('data/metrics.csv',sep=',',index=False)
df_importances.to_csv('data/feature_importances.csv',sep=',',index=False)
joblib.dump(model, 'models/best_model.joblib')
joblib.dump(preprocessor, 'models/preprocessor.joblib')

['models/preprocessor.joblib']

## Predictions

In [18]:
best_model =  joblib.load('models/best_model.joblib')
preprocessor =  joblib.load('models/preprocessor.joblib')

In [19]:
def data_testing():
    path = "data/"
    df = pd.read_csv("https://raw.githubusercontent.com/fralfaro/ploomber-example/main/data/test.csv")
    
    assert not df.empty, "empty dataframe"
    assert set(df.columns.tolist()) == set(features_cols ), "invalid columns"
    assert df.duplicated().sum() == 0, "duplicated data"

    
    return df.set_index(should_be_index)

def preprocessing_pandas_testing(data_testing):
    # Drop irrelevant columns:
    data = data_testing.drop(columns=["Name", "Ticket"],axis=1)

    # Handle missing values in Age:
    average_age = round(data["Age"].mean())
    data["Age"] = data["Age"].fillna(average_age)

    # Convert specified columns to string type:
    columns_to_convert = ["Pclass", "SibSp", "Parch"]
    data[columns_to_convert] = data[columns_to_convert].astype(str)
    
    return data


def preprocessing_sklearn_testing(preprocessing_pandas_testing):
    X_testing = preprocess_applier(preprocessor, preprocessing_pandas_testing)
    return X_testing

def predictions(data_testing,preprocessing_sklearn_testing):
    predictions = best_model.predict(preprocessing_sklearn_testing)
    data_testing["Survived"] = predictions
    
    return data_testing

In [20]:
dag_testing = dag_from_functions([
    data_testing,
    preprocessing_pandas_testing,
    preprocessing_sklearn_testing,
    predictions
])

In [21]:
dag_testing.plot()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 3957.82it/s]


In [22]:
dag_testing.build()

Building task 'predictions': 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 10.10it/s]


name,Ran?,Elapsed (s),Percentage
data_testing,True,0.330653,87.3283
preprocessing_pandas_testing,True,0.011476,3.03091
preprocessing_sklearn_testing,True,0.011698,3.08954
predictions,True,0.024805,6.55122


In [23]:
temp_testing = [(name,task) for name, task in dag_testing.items() ]
temp_testing

[('data_testing',
  _PythonCallableNoValidation: data_testing -> File('output\\data_testing')),
 ('preprocessing_pandas_testing',
  _PythonCallableNoValidation: preprocessing_pandas_testing -> File('output\\preprocessing_pandas_testing')),
 ('preprocessing_sklearn_testing',
  _PythonCallableNoValidation: preprocessing_sklearn_testing -> File('output\\preprocessing_sklearn_testing')),
 ('predictions',
  _PythonCallableNoValidation: predictions -> File('output\\predictions'))]

In [24]:
temp_testing[3][1].load()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,0
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1
...,...,...,...,...,...,...,...,...,...,...,...
1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,0
1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1
1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0
1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,0


In [25]:
# guardar resultados
cols = [
    'Survived','Pclass', 'Name', 
    'Sex', 'Age', 'SibSp', 
    'Parch', 'Ticket', 'Fare',
    'Cabin', 'Embarked'
]
predictions = temp_testing[3][1].load()[cols]

predictions.to_csv('data/predictions.csv',sep=',')

In [26]:
predictions

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,0,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S
