# Generating toy dataset

In [433]:
from sklearn.datasets import make_classification

In [408]:
X, y = make_classification(n_samples=100,n_features=5, n_classes=3, n_clusters_per_class=1)

# Making it into a pandas DataFrame

In [409]:
import pandas as pd
df = pd.DataFrame(X, columns=['A', 'B', 'C', 'D', 'E'])

# Putting a random Categorical column feature

In [410]:
import numpy as np
colors = ["red", "green", "blue"]
color_categry = np.random.choice(colors, size=100).reshape(-1, 1)
df['colors'] = color_categry

# Joining the target column to the DF

In [411]:
df['target'] = y
df['target'] = df['target'].replace({
    0: "cat",
    1: "dog",
    2: "bird"
})

# Making first 20 values in "A" column an NA values

In [432]:
df['A'][0:20] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['A'][0:20] = np.nan


# Train Test split

In [413]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop(['target'], axis=1), df['target'],
                                                    test_size=0.2, random_state=42)

# Creating ColumnTransformer to:

1. Pipeline in Numeric columns like A,B,C,D and E:    
    1. Using SimpleImputer() to handle NA values 
    2. Applying StandardScaler 
2. Applying OneHotEncoder on Categorical column lke "colors"

In [434]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

preprocessing = ColumnTransformer(transformers=[
    #  Pipeline in Numeric columns like A,B,C,D and E
    ("num_col", Pipeline([
        # Using SimpleImputer() to handle NA values 
         ("imputer_fornumeric", SimpleImputer(strategy="mean")),
        # Applying StandardScaler
         ('scaling', StandardScaler()),
    ]), ['A', 'B', 'C', 'D', 'E']),
    
    #  Applying OneHotEncoder on Categorical column lke "colors"
    ('category', OneHotEncoder(sparse=False, drop="first"), ['colors'])
])

# Creating main Pipeline to preprocess the ColumnTransformer and model selection (Logistic Regression)

In [435]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
pipeline = Pipeline(steps=[
    ("preprocessing", preprocessing),
    ("model", LogisticRegression())
])

# Training the model

In [436]:
pipeline.fit(X_train, y_train)



In [437]:
pipeline.score(X_test,y_test)

0.95

In [441]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predict)

0.95

In [439]:
predict = pipeline.predict(X_test)

In [440]:
np.sum(predict != y_test)

1