# Preprocesamiento

In [1]:
import mlflow
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


def _save_and_log_csv(data, path):
    """Write *data* to *path* as CSV (index omitted) and log it as an MLflow artifact."""
    data.to_csv(path, index=False)
    mlflow.log_artifact(path)


def preprocessing(filename, *, test_size=0.2, random_state=42):
    """Clean a CSV dataset, split it into train/test sets and track it in MLflow.

    Everything runs inside a single MLflow run named ``'preprocessing'``.
    The steps are:

    1. Read ``filename`` with ``pd.read_csv``.
    2. Drop the ``'specimen_number'`` column (when present), cast the
       remaining columns to ``float`` and the ``'species'`` column to ``int``.
    3. Write the cleaned table to ``preprocessed_data.csv`` and log it.
    4. Split features/target with ``train_test_split`` and write/log
       ``X_train.csv``, ``X_test.csv``, ``y_train.csv`` and ``y_test.csv``.

    All CSV files are written with relative paths, i.e. into the current
    working directory, and are left there after being logged.

    Args:
        filename: Location of the input CSV, passed straight to
            ``pd.read_csv``.
        test_size: Value forwarded to ``train_test_split`` as ``test_size``
            and logged as ``test_percentage``.
        random_state: Seed forwarded to ``train_test_split`` and also used
            to seed NumPy's global random generator; logged as
            ``random_state_split``.

    Returns:
        None. Results are available through the written CSV files and the
        MLflow run.
    """
    # Name of the target column; every other remaining column is a feature.
    class_name = 'species'

    with mlflow.start_run(run_name='preprocessing'):
        raw = pd.read_csv(filename)

        # errors='ignore' keeps the original behaviour of silently accepting
        # input that has no 'specimen_number' column.
        df_cleaned = raw.drop(columns='specimen_number', errors='ignore').astype(float)
        df_cleaned[class_name] = df_cleaned[class_name].astype(int)
        _save_and_log_csv(df_cleaned, 'preprocessed_data.csv')

        # Dataset-level description.
        mlflow.log_param(key='n_samples', value=len(df_cleaned))
        # The target column is not a feature, hence the -1.
        mlflow.log_param(key='n_features', value=len(df_cleaned.columns) - 1)
        mlflow.log_param(key='n_classes', value=df_cleaned[class_name].nunique())
        mlflow.log_param(key='problem_type', value='classification')

        # Separate features from the target and split.
        X = df_cleaned.drop(columns=class_name)
        y = df_cleaned[class_name].copy()

        # The split itself is made reproducible by ``random_state``; the
        # global seed is kept because code executed after this function may
        # rely on NumPy's global generator having been seeded here.
        np.random.seed(random_state)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        _save_and_log_csv(X_train, 'X_train.csv')
        _save_and_log_csv(X_test, 'X_test.csv')
        _save_and_log_csv(y_train, 'y_train.csv')
        _save_and_log_csv(y_test, 'y_test.csv')

        # Split-level description.
        mlflow.log_param(key='x_train_len', value=len(X_train))
        mlflow.log_param(key='x_test_len', value=len(X_test))
        mlflow.log_param(key='test_percentage', value=test_size)
        mlflow.log_param(key='random_state_split', value=random_state)

In [2]:
# Run the preprocessing step on the input CSV; the path is relative, so it is
# resolved against the current working directory.
filename = "data.csv"
preprocessing(filename)