In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk '/kaggle/input' recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the file name to get the full path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

In [None]:
# Load pokemon.csv from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/pokemon/pokemon.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Collect the label of every column that holds at least one missing value.
cols_missing_val = [
    label for label, has_null in df.isnull().any().items() if has_null
]
print(cols_missing_val)

In [None]:
import matplotlib.pyplot as plt

In [None]:
def plot_hist(column, df):
    """Plot a histogram of a single DataFrame column.

    The column name is used as the figure title and x-axis label so the
    figure is self-describing (this replaces the bare ``print`` that
    previously served as a caption).

    Parameters
    ----------
    column : str
        Label of the column to plot.
    df : pandas.DataFrame
        Frame that contains ``column``.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    df[column].hist(ax=ax)
    ax.set_title(f'Distribution of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Count')
    plt.show()
    # Release the figure explicitly: this function is called in a loop and
    # open figures otherwise accumulate on non-inline backends.
    plt.close(fig)

In [None]:
# Draw one histogram for each column listed in cols_missing_val.
for col in cols_missing_val:
    plot_hist(col, df)

In [None]:
from sklearn.model_selection import train_test_split
from abc import ABC, abstractmethod
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
# Relative frequency of each value of the 'is_legendary' column.
df['is_legendary'].value_counts(normalize=True)

In [None]:
def split_dataset(dataset, split_instance):
    """
    Delegate the splitting of ``dataset`` to ``split_instance``.

    ``split_instance`` may be any object exposing a ``split(dataset)``
    method; whatever that method returns is handed back unchanged.
    """
    splits = split_instance.split(dataset)
    return splits

class Split(ABC):
    """Abstract base class for dataset-splitting strategies."""

    @abstractmethod
    def split(self, dataset):
        """Return the data splits for ``dataset``.

        Concrete subclasses must override this method.
        """

class SplitDefault(Split):
    """Split a DataFrame into train, validation, full_train and test sets.

    The frame is first split into ``full_train`` / ``test``; ``full_train``
    is then split again into ``train`` / ``valid``. Both splits are
    stratified on the target column and use the same seed and train fraction.

    Parameters
    ----------
    split_col : str, default 'is_legendary'
        Name of the target column. It is removed from the features and used
        both as ``y`` and as the stratification variable.
    random_state : int, default 77
        Seed passed to both ``train_test_split`` calls.
    train_size : float, default 0.8
        Fraction kept on the training side of each of the two splits.
    """

    def __init__(self, split_col='is_legendary', random_state=77, train_size=.8):
        self._split_col = split_col
        self._random_state = random_state
        self._train_size = train_size

    def split(self, df):
        """Return a dict mapping split name to an ``(X, y)`` tuple.

        Keys are ``'train'``, ``'valid'``, ``'full_train'`` and ``'test'``.
        """
        # Honour the configured target column instead of a hard-coded name.
        X = df.drop(self._split_col, axis=1)
        y = df[self._split_col]

        X_full_train, X_test, y_full_train, y_test = train_test_split(
            X, y,
            random_state=self._random_state,
            stratify=y,
            train_size=self._train_size,
        )

        X_train, X_valid, y_train, y_valid = train_test_split(
            X_full_train, y_full_train,
            random_state=self._random_state,
            stratify=y_full_train,
            train_size=self._train_size,
        )

        return {
            'train': (X_train, y_train),
            'valid': (X_valid, y_valid),
            'full_train': (X_full_train, y_full_train),
            'test': (X_test, y_test),
        }

In [None]:
class ColumnSelectorTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that drops a fixed list of columns from a DataFrame.

    Parameters
    ----------
    cols_to_eliminate : list
        Labels of the unwanted columns to remove in ``transform``.
    """

    def __init__(self, cols_to_eliminate):
        # scikit-learn's get_params()/clone() look up an attribute with the
        # same name as each __init__ argument, so the value must be stored
        # unmodified under that exact name (not under a private alias).
        self.cols_to_eliminate = cols_to_eliminate  # list of unwanted columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` (a pandas DataFrame) without the configured columns."""
        return X.drop(columns=self.cols_to_eliminate)

In [None]:
def processing_1():
    """Build a pipeline whose only step drops the 'abilities' column."""
    steps = [
        ('column_selector', ColumnSelectorTransformer(cols_to_eliminate=['abilities'])),
    ]
    return Pipeline(steps)

def processing_2():
    """Build a pipeline that drops 'abilities', then imputes with the most frequent value."""
    drop_abilities = ColumnSelectorTransformer(cols_to_eliminate=['abilities'])
    mode_imputer = SimpleImputer(strategy='most_frequent')
    steps = [
        ('column_selector', drop_abilities),
        ('simple_imputer', mode_imputer),
    ]
    return Pipeline(steps)

def processing_3():
    """Build a pipeline that drops 'abilities', then fills missing values with the constant 'hormann'."""
    drop_abilities = ColumnSelectorTransformer(cols_to_eliminate=['abilities'])
    constant_imputer = SimpleImputer(strategy='constant', fill_value='hormann')
    steps = [
        ('column_selector', drop_abilities),
        ('simple_imputer', constant_imputer),
    ]
    return Pipeline(steps)

In [None]:
def _transform_split(pipeline, X, as_frame):
    """Apply a fitted pipeline to ``X``, optionally returning a labelled DataFrame.

    When ``as_frame`` is true the steps are applied one at a time so that the
    column labels of the last DataFrame-valued intermediate can be reused if a
    later step returns a bare array. The row index of ``X`` is preserved so
    the result stays aligned with the matching target Series.
    """
    if not as_frame:
        return pipeline.transform(X)

    current = X
    columns = list(X.columns)
    for _, step in pipeline.steps:
        # Skip None / 'passthrough' placeholders, as Pipeline itself does.
        if step is None or isinstance(step, str):
            continue
        current = step.transform(current)
        if isinstance(current, pd.DataFrame):
            columns = list(current.columns)

    if isinstance(current, pd.DataFrame):
        return current
    # Array-producing steps are assumed to keep column count and order; if
    # they do not, the constructor raises instead of silently mislabelling.
    return pd.DataFrame(current, columns=columns, index=X.index)


def run_pipeline(df, func_pipe_to_run, convert_to_pandas=False):
    """Split ``df`` and run a preprocessing pipeline over every split.

    The pipeline is fitted on ``train`` and applied to ``train`` and
    ``valid``; it is then re-fitted on ``full_train`` and applied to
    ``full_train`` and ``test``, so no transformation is ever fitted on the
    data it is evaluated against.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the target column expected by ``SplitDefault``.
    func_pipe_to_run : callable
        Zero-argument factory returning an unfitted pipeline.
    convert_to_pandas : bool, default False
        If true, array outputs are wrapped in DataFrames carrying the column
        labels that survive the pipeline and the original row index.

    Returns
    -------
    dict
        Maps ``'train'``, ``'valid'``, ``'full_train'`` and ``'test'`` to
        ``(X_transformed, y)`` tuples.
    """
    pipeline = func_pipe_to_run()
    # Work on a copy so the caller's frame is never touched by the split.
    split_data = split_dataset(df.copy(), SplitDefault())

    X_train, y_train = split_data['train']
    X_valid, y_valid = split_data['valid']
    X_full_train, y_full_train = split_data['full_train']
    X_test, y_test = split_data['test']

    # Fit on train only, then transform train and validation.
    pipeline.fit(X_train, y_train)
    X_train_new = _transform_split(pipeline, X_train, convert_to_pandas)
    X_valid_new = _transform_split(pipeline, X_valid, convert_to_pandas)

    # Re-fit on the full training set, then transform full_train and test.
    pipeline.fit(X_full_train, y_full_train)
    X_full_train_new = _transform_split(pipeline, X_full_train, convert_to_pandas)
    X_test_new = _transform_split(pipeline, X_test, convert_to_pandas)

    return {
        'train': (X_train_new, y_train),
        'valid': (X_valid_new, y_valid),
        'full_train': (X_full_train_new, y_full_train),
        'test': (X_test_new, y_test),
    }

### Predictions

In [None]:
# Run the processing_1 pipeline over every split of df.
data_after_pipeline = run_pipeline(df, processing_1)

In [None]:
# Preview the transformed training features.
data_after_pipeline['train'][0].head()

In [None]:
# Run the processing_2 pipeline and ask for DataFrame outputs.
# NOTE(review): this rebinds data_after_pipeline, replacing the processing_1 result.
data_after_pipeline = run_pipeline(df, processing_2, convert_to_pandas=True)

In [None]:
# Preview the transformed training features.
data_after_pipeline['train'][0].head()