# Imports

In [1]:
import numpy as np
import pandas as pd

import sklearn as sk
from sklearn import base
from sklearn import calibration
from sklearn import cluster
from sklearn import compose
from sklearn import covariance
from sklearn import cross_decomposition
from sklearn import datasets
from sklearn import decomposition
from sklearn import discriminant_analysis
from sklearn import dummy
from sklearn import ensemble
from sklearn import exceptions
from sklearn import experimental
from sklearn import feature_extraction
from sklearn import feature_selection
from sklearn import gaussian_process
from sklearn import impute
from sklearn import inspection
from sklearn import isotonic
from sklearn import kernel_approximation
from sklearn import kernel_ridge
from sklearn import linear_model
from sklearn import manifold
from sklearn import metrics
from sklearn import mixture
from sklearn import model_selection
from sklearn import multiclass
from sklearn import multioutput
from sklearn import naive_bayes
from sklearn import neighbors
from sklearn import neural_network
from sklearn import pipeline
from sklearn import preprocessing
from sklearn import random_projection
from sklearn import semi_supervised
from sklearn import svm
from sklearn import tree
from sklearn import utils

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Listing

- `sklearn`
  - config_context
  - get_config
  - set_config
  - show_versions
- `base`
  - BaseEstimator
  - BiclusterMixin
  - ClassNamePrefixFeaturesOutMixin
  - ClassifierMixin
  - ClusterMixin
  - DensityMixin
  - MetaEstimatorMixin
  - OneToOneFeatureMixin
  - OutlierMixin
  - RegressorMixin
  - TransformerMixin
  - clone
  - is_classifier
  - is_clusterer
  - is_regressor
  - is_outlier_detector
- `calibration`
  - CalibratedClassifierCV
  - calibration_curve
  - CalibrationDisplay
- `cluster`
  - AffinityPropagation
  - AgglomerativeClustering
  - Birch
  - BisectingKMeans
  - DBSCAN
  - FeatureAgglomeration
  - HDBSCAN
  - KMeans
  - MeanShift
  - MiniBatchKMeans
  - OPTICS
  - SpectralBiclustering
  - SpectralClustering
  - SpectralCoclustering
  - affinity_propagation
  - cluster_optics_dbscan
  - cluster_optics_xi
  - compute_optics_graph
  - dbscan
  - estimate_bandwidth
  - k_means
  - kmeans_plusplus
  - mean_shift
  - spectral_clustering
  - ward_tree
- `compose`
  - ColumnTransformer
  - TransformedTargetRegressor
  - make_column_selector
  - make_column_transformer
- `covariance`
  - EllipticEnvelope
  - EmpiricalCovariance
  - GraphicalLasso
  - GraphicalLassoCV
  - LedoitWolf
  - MinCovDet
  - OAS
  - ShrunkCovariance
  - empirical_covariance
  - graphical_lasso
  - ledoit_wolf
  - ledoit_wolf_shrinkage
  - oas
  - shrunk_covariance
- `cross_decomposition`
  - CCA
  - PLSCanonical
  - PLSRegression
  - PLSSVD
- `datasets`
  - clear_data_home
  - dump_svmlight_file
  - fetch_20newsgroups
  - fetch_20newsgroups_vectorized
  - fetch_california_housing
  - fetch_covtype
  - fetch_file
  - fetch_kddcup99
  - fetch_lfw_pairs
  - fetch_lfw_people
  - fetch_olivetti_faces
  - fetch_openml
  - fetch_rcv1
  - fetch_species_distributions
  - get_data_home
  - load_breast_cancer
  - load_diabetes
  - load_digits
  - load_files
  - load_iris
  - load_linnerud
  - load_sample_image
  - load_sample_images
  - load_svmlight_file
  - load_svmlight_files
  - load_wine
  - make_biclusters
  - make_blobs
  - make_checkerboard
  - make_circles
  - make_classification
  - make_friedman1
  - make_friedman2
  - make_friedman3
  - make_gaussian_quantiles
  - make_hastie_10_2
  - make_low_rank_matrix
  - make_moons
  - make_multilabel_classification
  - make_regression
  - make_s_curve
  - make_sparse_coded_signal
  - make_sparse_spd_matrix
  - make_sparse_uncorrelated
  - make_spd_matrix
  - make_swiss_roll
- `decomposition`
  - DictionaryLearning
  - FactorAnalysis
  - FastICA
  - IncrementalPCA
  - KernelPCA
  - LatentDirichletAllocation
  - MiniBatchDictionaryLearning
  - MiniBatchNMF
  - MiniBatchSparsePCA
  - NMF
  - PCA
  - SparseCoder
  - SparsePCA
  - TruncatedSVD
  - dict_learning
  - dict_learning_online
  - fastica
  - non_negative_factorization
  - sparse_encode
- `discriminant_analysis`
  - LinearDiscriminantAnalysis
  - QuadraticDiscriminantAnalysis
- `dummy`
  - DummyClassifier
  - DummyRegressor
- `ensemble`
  - AdaBoostClassifier
  - AdaBoostRegressor
  - BaggingClassifier
  - BaggingRegressor
  - ExtraTreesClassifier
  - ExtraTreesRegressor
  - GradientBoostingClassifier
  - GradientBoostingRegressor
  - HistGradientBoostingClassifier
  - HistGradientBoostingRegressor
  - IsolationForest
  - RandomForestClassifier
  - RandomForestRegressor
  - RandomTreesEmbedding
  - StackingClassifier
  - StackingRegressor
  - VotingClassifier
  - VotingRegressor
- `exceptions`
  - ConvergenceWarning
  - DataConversionWarning
  - DataDimensionalityWarning
  - EfficiencyWarning
  - FitFailedWarning
  - InconsistentVersionWarning
  - NotFittedError
  - UndefinedMetricWarning
  - EstimatorCheckFailedWarning
- `experimental`
  - enable_halving_search_cv
  - enable_iterative_imputer
- `feature_extraction`
  - DictVectorizer
  - FeatureHasher
  - PatchExtractor
  - extract_patches_2d
  - grid_to_graph
  - img_to_graph
  - reconstruct_from_patches_2d
  - CountVectorizer
  - HashingVectorizer
  - TfidfTransformer
  - TfidfVectorizer
- `feature_selection`
  - GenericUnivariateSelect
  - RFE
  - RFECV
  - SelectFdr
  - SelectFpr
  - SelectFromModel
  - SelectFwe
  - SelectKBest
  - SelectPercentile
  - SelectorMixin
  - SequentialFeatureSelector
  - VarianceThreshold
  - chi2
  - f_classif
  - f_regression
  - mutual_info_classif
  - mutual_info_regression
  - r_regression
- `gaussian_process`
  - GaussianProcessClassifier
  - GaussianProcessRegressor
  - CompoundKernel
  - ConstantKernel
  - DotProduct
  - ExpSineSquared
  - Exponentiation
  - Hyperparameter
  - Kernel
  - Matern
  - PairwiseKernel
  - Product
  - RBF
  - RationalQuadratic
  - Sum
  - WhiteKernel
- `impute`
  - IterativeImputer
  - KNNImputer
  - MissingIndicator
  - SimpleImputer
- `inspection`
  - partial_dependence
  - permutation_importance
  - DecisionBoundaryDisplay
  - PartialDependenceDisplay
- `isotonic`
  - IsotonicRegression
  - check_increasing
  - isotonic_regression
- `kernel_approximation`
  - AdditiveChi2Sampler
  - Nystroem
  - PolynomialCountSketch
  - RBFSampler
  - SkewedChi2Sampler
- `kernel_ridge`
  - KernelRidge
- `linear_model`
  - LogisticRegression
  - LogisticRegressionCV
  - PassiveAggressiveClassifier
  - Perceptron
  - RidgeClassifier
  - RidgeClassifierCV
  - SGDClassifier
  - SGDOneClassSVM
  - LinearRegression
  - Ridge
  - RidgeCV
  - SGDRegressor
  - ElasticNet
  - ElasticNetCV
  - Lars
  - LarsCV
  - Lasso
  - LassoCV
  - LassoLars
  - LassoLarsCV
  - LassoLarsIC
  - OrthogonalMatchingPursuit
  - OrthogonalMatchingPursuitCV
  - ARDRegression
  - BayesianRidge
  - MultiTaskElasticNet
  - MultiTaskElasticNetCV
  - MultiTaskLasso
  - MultiTaskLassoCV
  - HuberRegressor
  - QuantileRegressor
  - RANSACRegressor
  - TheilSenRegressor
  - GammaRegressor
  - PoissonRegressor
  - TweedieRegressor
  - PassiveAggressiveRegressor
  - enet_path
  - lars_path
  - lars_path_gram
  - lasso_path
  - orthogonal_mp
  - orthogonal_mp_gram
  - ridge_regression
- `manifold`
  - Isomap
  - LocallyLinearEmbedding
  - MDS
  - SpectralEmbedding
  - TSNE
  - locally_linear_embedding
  - smacof
  - spectral_embedding
  - trustworthiness
- `metrics`
  - check_scoring
  - get_scorer
  - get_scorer_names
  - make_scorer
  - accuracy_score
  - auc
  - average_precision_score
  - balanced_accuracy_score
  - brier_score_loss
  - class_likelihood_ratios
  - classification_report
  - cohen_kappa_score
  - confusion_matrix
  - d2_log_loss_score
  - dcg_score
  - det_curve
  - f1_score
  - fbeta_score
  - hamming_loss
  - hinge_loss
  - jaccard_score
  - log_loss
  - matthews_corrcoef
  - multilabel_confusion_matrix
  - ndcg_score
  - precision_recall_curve
  - precision_recall_fscore_support
  - precision_score
  - recall_score
  - roc_auc_score
  - roc_curve
  - top_k_accuracy_score
  - zero_one_loss
  - d2_absolute_error_score
  - d2_pinball_score
  - d2_tweedie_score
  - explained_variance_score
  - max_error
  - mean_absolute_error
  - mean_absolute_percentage_error
  - mean_gamma_deviance
  - mean_pinball_loss
  - mean_poisson_deviance
  - mean_squared_error
  - mean_squared_log_error
  - mean_tweedie_deviance
  - median_absolute_error
  - r2_score
  - root_mean_squared_error
  - root_mean_squared_log_error
  - coverage_error
  - label_ranking_average_precision_score
  - label_ranking_loss
  - adjusted_mutual_info_score
  - adjusted_rand_score
  - calinski_harabasz_score
  - contingency_matrix
  - pair_confusion_matrix
  - completeness_score
  - davies_bouldin_score
  - fowlkes_mallows_score
  - homogeneity_completeness_v_measure
  - homogeneity_score
  - mutual_info_score
  - normalized_mutual_info_score
  - rand_score
  - silhouette_samples
  - silhouette_score
  - v_measure_score
  - consensus_score
  - DistanceMetric
  - additive_chi2_kernel
  - chi2_kernel
  - cosine_distances
  - cosine_similarity
  - distance_metrics
  - euclidean_distances
  - haversine_distances
  - kernel_metrics
  - laplacian_kernel
  - linear_kernel
  - manhattan_distances
  - nan_euclidean_distances
  - paired_cosine_distances
  - paired_distances
  - paired_euclidean_distances
  - paired_manhattan_distances
  - pairwise_kernels
  - polynomial_kernel
  - rbf_kernel
  - sigmoid_kernel
  - pairwise_distances
  - pairwise_distances_argmin
  - pairwise_distances_argmin_min
  - pairwise_distances_chunked
  - ConfusionMatrixDisplay
  - DetCurveDisplay
  - PrecisionRecallDisplay
  - PredictionErrorDisplay
  - RocCurveDisplay
- `mixture`
  - BayesianGaussianMixture
  - GaussianMixture
- `model_selection`
  - GroupKFold
  - GroupShuffleSplit
  - KFold
  - LeaveOneGroupOut
  - LeaveOneOut
  - LeavePGroupsOut
  - LeavePOut
  - PredefinedSplit
  - RepeatedKFold
  - RepeatedStratifiedKFold
  - ShuffleSplit
  - StratifiedGroupKFold
  - StratifiedKFold
  - StratifiedShuffleSplit
  - TimeSeriesSplit
  - check_cv
  - train_test_split
  - GridSearchCV
  - HalvingGridSearchCV
  - HalvingRandomSearchCV
  - ParameterGrid
  - ParameterSampler
  - RandomizedSearchCV
  - FixedThresholdClassifier
  - TunedThresholdClassifierCV
  - cross_val_predict
  - cross_val_score
  - cross_validate
  - learning_curve
  - permutation_test_score
  - validation_curve
  - LearningCurveDisplay
  - ValidationCurveDisplay
- `multiclass`
  - OneVsOneClassifier
  - OneVsRestClassifier
  - OutputCodeClassifier
- `multioutput`
  - ClassifierChain
  - MultiOutputClassifier
  - MultiOutputRegressor
  - RegressorChain
- `naive_bayes`
  - BernoulliNB
  - CategoricalNB
  - ComplementNB
  - GaussianNB
  - MultinomialNB
- `neighbors`
  - BallTree
  - KDTree
  - KNeighborsClassifier
  - KNeighborsRegressor
  - KNeighborsTransformer
  - KernelDensity
  - LocalOutlierFactor
  - NearestCentroid
  - NearestNeighbors
  - NeighborhoodComponentsAnalysis
  - RadiusNeighborsClassifier
  - RadiusNeighborsRegressor
  - RadiusNeighborsTransformer
  - kneighbors_graph
  - radius_neighbors_graph
  - sort_graph_by_row_values
- `neural_network`
  - BernoulliRBM
  - MLPClassifier
  - MLPRegressor
- `pipeline`
  - FeatureUnion
  - Pipeline
  - make_pipeline
  - make_union
- `preprocessing`
  - Binarizer
  - FunctionTransformer
  - KBinsDiscretizer
  - KernelCenterer
  - LabelBinarizer
  - LabelEncoder
  - MaxAbsScaler
  - MinMaxScaler
  - MultiLabelBinarizer
  - Normalizer
  - OneHotEncoder
  - OrdinalEncoder
  - PolynomialFeatures
  - PowerTransformer
  - QuantileTransformer
  - RobustScaler
  - SplineTransformer
  - StandardScaler
  - TargetEncoder
  - add_dummy_feature
  - binarize
  - label_binarize
  - maxabs_scale
  - minmax_scale
  - normalize
  - power_transform
  - quantile_transform
  - robust_scale
  - scale
- `random_projection`
  - GaussianRandomProjection
  - SparseRandomProjection
  - johnson_lindenstrauss_min_dim
- `semi_supervised`
  - LabelPropagation
  - LabelSpreading
  - SelfTrainingClassifier
- `svm`
  - LinearSVC
  - LinearSVR
  - NuSVC
  - NuSVR
  - OneClassSVM
  - SVC
  - SVR
  - l1_min_c
- `tree`
  - DecisionTreeClassifier
  - DecisionTreeRegressor
  - ExtraTreeClassifier
  - ExtraTreeRegressor
  - export_graphviz
  - export_text
  - plot_tree
- `utils`
  - Bunch
  - _safe_indexing
  - as_float_array
  - assert_all_finite
  - deprecated
  - estimator_html_repr
  - gen_batches
  - gen_even_slices
  - indexable
  - murmurhash3_32
  - resample
  - safe_mask
  - safe_sqr
  - shuffle
  - Tags
  - InputTags
  - TargetTags
  - ClassifierTags
  - RegressorTags
  - TransformerTags
  - get_tags
  - check_X_y
  - check_array
  - check_consistent_length
  - check_random_state
  - check_scalar
  - check_is_fitted
  - check_memory
  - check_symmetric
  - column_or_1d
  - has_fit_parameter
  - validate_data
  - available_if
  - compute_class_weight
  - compute_sample_weight
  - is_multilabel
  - type_of_target
  - unique_labels
  - density
  - fast_logdet
  - randomized_range_finder
  - randomized_svd
  - safe_sparse_dot
  - weighted_mode
  - incr_mean_variance_axis
  - inplace_column_scale
  - inplace_csr_column_scale
  - inplace_row_scale
  - inplace_swap_column
  - inplace_swap_row
  - mean_variance_axis
  - inplace_csr_row_normalize_l1
  - inplace_csr_row_normalize_l2
  - single_source_shortest_path_length
  - sample_without_replacement
  - min_pos
  - MetadataRequest
  - MetadataRouter
  - MethodMapping
  - get_routing_for_object
  - process_routing
  - all_displays
  - all_estimators
  - all_functions
  - check_estimator
  - parametrize_with_checks
  - estimator_checks_generator
  - Parallel
  - delayed

# Analysis

## Data loading

In [None]:
# Default call: the first line of the file is used as header (header=0).
df = pd.read_csv('filepath', sep=',')
# or: the file has no header row, so the column names are given explicitly
df = pd.read_csv(
    'filepath',
    sep=',',
    names=['feature1_name', 'feature2_name', 'feature3_name'],
    header=None,
)
# or: use the first column as the row index
df = pd.read_csv('filepath', sep=',', index_col=0)

## Rows, cols and null data

In [None]:
# df.shape is a (rows, columns) tuple
n_rows, n_cols = df.shape
print('Shape:', df.shape)
print('Rows (instances):', n_rows)
print('Cols (features + target):', n_cols)

In [None]:
# Number of missing values in each column
missing_per_column = df.isnull().sum()
missing_per_column
# or: total number of missing values in the whole frame
missing_per_column.sum()

In [None]:
# Drop every row that has at least one missing value
df = df.dropna(axis=0, how='any')

In [None]:
# Replace missing values with the mean of their own column.
# numeric_only=True restricts the means to numeric columns: without it,
# pandas >= 2.0 raises a TypeError as soon as the frame also contains
# non-numeric (e.g. string) columns. Missing values in non-numeric columns
# are left untouched because no mean exists for them.
df = df.fillna(df.mean(numeric_only=True))

## Check balance of target class

In [None]:
# Samples per class: very uneven counts mean the target is imbalanced
target_counts = df['target'].value_counts()
target_counts

## Mapping

In [None]:
# Lookup table from the raw string labels to their numeric codes
mapping = dict(yes=1, no=0)

# Values that are not keys of `mapping` become NaN
df['feature'] = df['feature'].map(mapping)
# or: lower-case the strings first so that 'Yes' / 'YES' are matched as well
lowercased = df['feature'].str.lower()
df['feature'] = lowercased.map(mapping)

## Discretization and Binning

In [None]:
ages = [20, 22, 25, 61, 27, 21, 23, 37, 31, 59, 45, 41, 32]
bins = [18, 25, 35, 60, 100]  # 5 edges => 4 bins/intervals
groups_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

# Explicit edges; intervals are right-closed by default: (18, 25], (25, 35], ...
binned_ages = pd.cut(ages, bins=bins)
# or: left-closed intervals instead: [18, 25), [25, 35), ...
binned_ages = pd.cut(ages, bins=bins, right=False)
# or: name the bins instead of showing the intervals
binned_ages = pd.cut(ages, bins=bins, labels=groups_names)
# or: an integer splits the range [min, max] of the data into that many
# equal-width bins (4 in this case)
binned_ages = pd.cut(ages, bins=4)

# Index of the bin each value fell into (available because pd.cut on a list
# returns a Categorical)
binned_ages.codes

# Number of values in each bin
binned_ages.value_counts()

In [None]:
# Bin a numerical feature into 4 equal-width intervals
df['feature'] = pd.cut(df['feature'], bins=4)
# or (alternative to the line above, not a follow-up step): same binning,
# but with a label assigned to each bin
df['feature'] = pd.cut(df['feature'], bins=4, labels=groups_names)

## Encodings

In [None]:
# Encode every categorical (object-dtype) column as ordinal numeric codes
cat_features = [col for col in df.columns if df[col].dtype == object]
if cat_features:  # nothing to encode (and nothing to fit on) otherwise
    ord_enc = preprocessing.OrdinalEncoder()
    ord_enc.fit(df[cat_features])
    enc_features = ord_enc.transform(df[cat_features])
    # Assign through df[...] rather than df.loc[:, ...]: since pandas 2.0 the
    # .loc form writes into the existing columns in place, so they would keep
    # dtype=object even though they now hold numbers, and later
    # `dtype == object` checks would still treat them as categorical.
    df[cat_features] = enc_features

# Plot

## Plot in a bar chart the value distribution of the target column

In [None]:
# pandas: count the classes first, then draw the counts as bars
class_counts = df['target'].value_counts()
class_counts.plot(kind='bar')
# seaborn: countplot does the counting itself
sns.countplot(x=df['target'])

# Transformation and Prediction

## X and y

In [None]:
# Features: every column except the target
X = df.drop(columns=['target_col'])
# Target: the column to predict
y = df['target_col']

## Train/test split

In [None]:
# Default split: test_size=0.25, train_size=0.75
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y)
# or: choose the size of the test set explicitly
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.4,
)
# or: stratify on y to keep the same target distribution in both parts
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    stratify=y,
)

## Preprocessing

### Standardization

In [None]:
# Scalers work feature by feature (column-wise).
# Pick ONE of them: when the cell is run as is, the last assignment wins.
scaler = preprocessing.StandardScaler()  # afterwards each feature has mean=0 and stddev=1
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))  # scales each feature to a range
scaler = preprocessing.RobustScaler()  # performs better when the data contains many outliers

# Learn the scaling on the training set only, then reuse it on the test set
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Normalization

In [None]:
# Normalization works sample by sample (row-wise): each row is rescaled
# to unit norm.
normalizer = preprocessing.Normalizer(norm='l2')  # 'l2' is the default norm

X_train = normalizer.fit_transform(X_train)
X_test = normalizer.transform(X_test)

### Encodings

In [None]:
# Pick ONE encoder: when the cell is run as is, the last assignment wins.
enc = preprocessing.OneHotEncoder(handle_unknown='ignore')  # default 'error'
# handle_unknown='use_encoded_value' must be paired with unknown_value,
# otherwise fit() raises. Categories not seen during fit are encoded as -1.
enc = preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)  # default 'error'

names_cols = X_train.columns

X_train = enc.fit_transform(X_train)
X_test = enc.transform(X_test)

# Put the column names back. NOTE: reusing the original names is only valid
# for an encoder that outputs one column per input column (OrdinalEncoder);
# a one-hot encoding produces a different set of columns.
X_train = pd.DataFrame(X_train, columns=names_cols)
# Keep the test set in the same format (same column names) as the train set
X_test = pd.DataFrame(X_test, columns=names_cols)

In [None]:
# LabelEncoder handles a single 1-d array, so the object-dtype columns are
# encoded one at a time
categorical_cols = [name for name in df.columns if df[name].dtype == object]
enc = preprocessing.LabelEncoder()
for name in categorical_cols:
    # the same encoder object is refitted on every column
    encoded_column = enc.fit_transform(df[name])
    df[name] = encoded_column

### Discretization

In [None]:
# Pick ONE configuration: when the cell is run as is, the last assignment wins.
# Same number of bins for every feature
discr = preprocessing.KBinsDiscretizer(n_bins=5)
# One entry per feature: 4 bins for feature1, 3 bins for feature2 and 5 bins
# for feature3 of input X. Input X must have 3 features.
discr = preprocessing.KBinsDiscretizer(n_bins=[4, 3, 5])
# How the bin index is encoded in the output
discr = preprocessing.KBinsDiscretizer(encode='onehot')
# How the bin edges are chosen
discr = preprocessing.KBinsDiscretizer(strategy='uniform')

# Learn the bin edges on the training set only, then reuse them on the test set
X_train = discr.fit_transform(X_train)
X_test = discr.transform(X_test)

In [None]:
# Binarizer: turns each value into 0 or 1 depending on the threshold
discr = preprocessing.Binarizer(threshold=0)

# Same fit-on-train / transform-both pattern as the other preprocessors
discr.fit(X_train)
X_train = discr.transform(X_train)
X_test = discr.transform(X_test)

## Model instantiation

In [None]:
# SUPERVISED
# Every line is an alternative: keep the ONE model you need. When the cell is
# run as is, the last assignment wins.

# --- classification ---
model = linear_model.LogisticRegression(penalty='l2')
model = neighbors.KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',  # or weights='distance'
)
model = tree.DecisionTreeClassifier(criterion='gini')
model = ensemble.RandomForestClassifier(n_estimators=100, criterion='gini')
model = svm.SVC(kernel='rbf', C=1, gamma='scale')
# baseline only, for testing rules of thumb
model = dummy.DummyClassifier(strategy='uniform')

# --- regression ---
model = linear_model.LinearRegression()
model = neighbors.KNeighborsRegressor(
    n_neighbors=5,
    weights='uniform',  # or weights='distance'
)
model = tree.DecisionTreeRegressor(criterion='squared_error')
model = ensemble.RandomForestRegressor(n_estimators=100, criterion='squared_error')
# baseline only, for testing rules of thumb
model = dummy.DummyRegressor(strategy='mean')

## Training and prediction

In [None]:
# Train the selected model on the training split
model.fit(X_train, y_train)

In [None]:
# Hard predictions for the test split
y_pred = model.predict(X_test)
# Predicted probabilities for the test split; only models that implement
# predict_proba support this call
y_prob = model.predict_proba(X_test)

## Grid search

In [None]:
# Example of grid search for SVC classifier. Obviously, parameters change with the model.
# A list of dicts describes several independent grids: here one for the rbf
# kernel and one for the linear kernel.
parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]
model = svm.SVC()
scores = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# With more than one metric, refit must name the metric used to select the
# best candidate: the default refit=True raises a ValueError in that case.
# best_params_, best_estimator_ and best_score_ then refer to that metric.
gridsearch = model_selection.GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring=scores,
    refit='f1',
    cv=5,
)

gridsearch.fit(X_train, y_train)
print('CV results\n', gridsearch.cv_results_)
print('Best params:', gridsearch.best_params_)
print('Best estimator:', gridsearch.best_estimator_)
print('Best score:', gridsearch.best_score_)

# predict() uses the estimator refitted on the whole training set
y_pred = gridsearch.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

## Pipeline

In [None]:
# Pipeline with one hot encoder and standard scaler
X = df.drop(['target_col'], axis=1)
y = df['target_col']
# Object-dtype columns are treated as categorical, all the others as numerical
categorical_features = [col for col in X.columns if X[col].dtype == object]
numerical_features = [col for col in X.columns if col not in categorical_features]
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y)

# One-hot encode the categorical columns, standardize the numerical ones
coltran = compose.ColumnTransformer(
    transformers=[
        ("onehot", preprocessing.OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ("std", preprocessing.StandardScaler(), numerical_features),
    ],
    remainder='passthrough',
)

my_pipeline = pipeline.Pipeline(steps=[("coltran", coltran),
                                       ("estimator", ensemble.RandomForestRegressor())])

my_pipeline.fit(X=X_train, y=y_train)

# Predict on the held-out test split (not on the data the pipeline was
# trained on), so that y_pred can be compared with y_test
y_pred = my_pipeline.predict(X=X_test)

In [None]:
# Pipeline with k bins discretizer and standard scaler
# Each entry is (name, transformer, columns it is applied to)
column_steps = [
    ('discr', preprocessing.KBinsDiscretizer(n_bins=5), ['feature1', 'feature3', 'feature4']),
    ('std', preprocessing.StandardScaler(), ['feature5', 'feature6']),
]
# Columns not listed above are passed through unchanged
coltran = compose.ColumnTransformer(transformers=column_steps, remainder='passthrough')

# Preprocessing first, then the classifier
my_pipeline = pipeline.Pipeline(
    steps=[
        ('coltran', coltran),
        ('estimator', ensemble.RandomForestClassifier()),
    ]
)

my_pipeline.fit(X=X_train, y=y_train)

y_pred = my_pipeline.predict(X=X_test)

In [None]:
# Example of pipeline
# Each entry is (name, transformer, columns it is applied to)
column_steps = [
    ('discr', preprocessing.KBinsDiscretizer(n_bins=5), ['feature1', 'feature3', 'feature4']),
    ('std', preprocessing.StandardScaler(), ['feature5', 'feature6']),
]
# Columns not listed above are passed through unchanged
coltran = compose.ColumnTransformer(transformers=column_steps, remainder='passthrough')

# Preprocessing first, then the regressor
my_pipeline = pipeline.Pipeline(
    steps=[
        ('coltran', coltran),
        ('estimator', ensemble.RandomForestRegressor()),
    ]
)

my_pipeline.fit(X=X_train, y=y_train)

y_pred = my_pipeline.predict(X=X_test)

### Grid search with a pipeline

In [None]:
# Example of grid search with a pipeline

# Split the columns by dtype: object => categorical, int64/float64 => numerical
categorical_cols = [cname for cname in X_train.columns if X_train[cname].dtype == "object"]
numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]

# Numerical data: only imputation of the missing values
numerical_transformer = impute.SimpleImputer(strategy='most_frequent')

# Categorical data: imputation followed by one-hot encoding
categorical_steps = [
    ('imputer', impute.SimpleImputer(strategy='most_frequent')),
    ('onehot', preprocessing.OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = pipeline.Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer
preprocessor = compose.ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

model = ensemble.RandomForestRegressor(n_estimators=10, random_state=0)

# Preprocessing and model chained in a single estimator
my_pipeline = pipeline.Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

# Nested parameters are addressed as <step name>__<sub-step name>__<parameter>
parameters = {
    'model__n_estimators': [1, 5, 10],
    'preprocessor__num__strategy': ['most_frequent', 'constant', 'mean'],
    'preprocessor__cat__imputer__strategy': ['most_frequent', 'constant'],
}

gridsearch = model_selection.GridSearchCV(
    my_pipeline,
    parameters,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
)

gridsearch.fit(X_train, y_train)

print(gridsearch.best_params_)
print(gridsearch.best_score_)

## Statistics

### Classification

In [None]:
# Metrics for a BINARY classifier.
# y_pred holds the hard predictions (model.predict); y_prob is the matrix
# returned by model.predict_proba, with one column per class in the order of
# model.classes_. The score-based binary metrics expect a 1-d array with the
# probability of the positive class, i.e. the second column.
y_score = y_prob[:, 1]

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Balanced Accuracy:", metrics.balanced_accuracy_score(y_test, y_pred))
print("Brier Score Loss:", metrics.brier_score_loss(y_test, y_score))
# log_loss is computed from predicted probabilities, not from hard labels,
# and returns the loss itself (lower is better), not its negation
print("Log Loss:", metrics.log_loss(y_test, y_prob))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
print("Jaccard:", metrics.jaccard_score(y_test, y_pred))
print("F1 Score:", metrics.f1_score(y_test, y_pred))

print('Roc Auc Score:', metrics.roc_auc_score(y_test, y_score))

# Formatting hint, to print a statistic with two decimals:
# print('example: {:0.2f}'.format(statistic))

In [None]:
# Roc Curve (binary classification)
# roc_curve expects a 1-d array of scores for the positive class. y_prob comes
# from predict_proba and has one column per class, so take the second column.
y_score = y_prob[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
print(fpr, tpr, thresholds)

plt.plot(fpr, tpr, 'r-', label='Test')
# Optional reference curves: the diagonal of a random classifier and the
# corner-shaped curve of a perfect one
#plt.plot([0,1],[0,1],'k-',label='Random')
#plt.plot([0,0,1,1],[0,1,1,1],'g-',label='Perfect')
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

In [None]:
# Confusion Matrix (binary classification)
# Rows are the true classes, columns the predicted classes, both in sorted
# label order, so the negative class comes first.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)

# Flattened row by row, the 2x2 matrix reads: tn, fp, fn, tp
tn, fp, fn, tp = cnf_matrix.ravel()
print('True Positive:', tp)
print('True Negative:', tn)
print('False Positive:', fp)
print('False Negative:', fn)

# plot
sns.heatmap(cnf_matrix, annot=True, fmt="d")

### Regression

In [None]:
# Label -> metric function, in the order the scores are printed.
# Every metric is called as metric(y_true, y_pred).
regression_metrics = {
    'Explained variance score:': metrics.explained_variance_score,
    'MAE:': metrics.mean_absolute_error,
    'MSE:': metrics.mean_squared_error,
    'RMSE:': metrics.root_mean_squared_error,
    'MSLE:': metrics.mean_squared_log_error,
    'MedAE:': metrics.median_absolute_error,
    'R2 Score:': metrics.r2_score,
}
for label, metric in regression_metrics.items():
    print(label, metric(y_test, y_pred))