In [None]:
import pandas as pd

In [None]:
# Location of the raw crx.data file in the workshop's GitHub repository;
# the file itself is read in the next cell.
filename = (
    'https://raw.githubusercontent.com/PacktWorkshops/'
    'The-Data-Science-Workshop/master/Chapter15/Dataset/crx.data'
)



In [None]:
# Read the comma-separated file into a DataFrame. The file has no header row,
# so the columns receive integer labels, and "?" entries are parsed as missing (NaN).
credData = pd.read_csv(
    filename,
    sep=",",
    header=None,
    na_values="?",
)
credData.head()

In [None]:
# Changing the classes in column 15 to integers: '+' -> 1 and '-' -> 0.
# The whole column is mapped and cast to int64 so that the label has a real
# integer dtype; assigning 1/0 into the string column through .loc leaves an
# object-dtype column of Python ints, which scikit-learn classifiers can reject
# as an unknown label type.
# The 1 -> 1 and 0 -> 0 entries keep this cell safe to re-run on an already
# encoded column. Any other (or missing) label makes the int64 cast fail loudly.
credData[15] = credData[15].map({'+': 1, '-': 0, 1: 1, 0: 0}).astype('int64')
credData.head()

In [None]:
# Keep only the complete rows: any row containing a missing value is discarded
newcred = credData.dropna(axis="index")
newcred.shape

In [None]:
# Separating the X and y variables: the features are the columns labelled 0 through 14
# (label-based slicing with .loc includes both endpoints)
X = newcred.loc[:, slice(0, 14)]
X.shape

In [None]:
# The target is the class label stored in column 15
y = newcred[15]
y.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

**Creating the preprocessing engine**

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Categorical columns are one-hot encoded; categories not seen during fit are
# encoded as all zeros instead of raising an error
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Numeric columns are standardised with StandardScaler
numeric_transformer = Pipeline([
    ('scaler', StandardScaler()),
])

In [None]:
# Column labels for the two preprocessing branches.
# Selecting on the generic 'number' dtype (rather than the exact names
# 'int64' / 'float64' / 'object') puts every column of X in exactly one of the
# two lists, so no column is silently dropped by the ColumnTransformer just
# because it has an unexpected dtype (e.g. int32 or a dedicated string dtype).
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(exclude='number').columns

In [None]:
from sklearn.compose import ColumnTransformer
# Route each group of columns through its own transformer.
# sparse_threshold=0 forces a dense output array: the one-hot branch produces
# sparse matrices, and by default the combined output becomes sparse whenever
# its overall density falls below 0.3, which would hand the downstream PCA step
# a sparse matrix that it may not accept.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    sparse_threshold=0,
)

**Modelling and prediction with the pipeline**

In [None]:
# Importing necessary libraries
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression


In [None]:
# Creating the estimator pipeline for model building:
# preprocessing -> PCA down to 10 components -> logistic regression.
# Both PCA and the classifier are seeded: PCA's automatic solver choice depends
# on the data shape and may be a randomized one, so seeding it keeps the
# results reproducible from run to run.
estimator = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('dimred', PCA(10, random_state=123)),
    ('clf', LogisticRegression(random_state=123)),
])

In [None]:
# Train every step of the pipeline (preprocessing, PCA, classifier) on the training split
estimator.fit(X=X_train, y=y_train)


In [None]:
# Score the fitted pipeline on the held-out test set
# (for a classifier, score reports the mean accuracy)
estimator.score(X=X_test, y=y_test)

In [None]:
# Predicted class labels for the test set
pred = estimator.predict(X=X_test)

In [None]:
# Printing the classification report.
# classification_report takes the true labels first and the predictions second;
# with the order reversed, precision and recall would be swapped in the report
# and the support column would count predicted rather than actual labels.
from sklearn.metrics import classification_report

print(classification_report(y_test, pred))

In [None]:
# Confusion matrix: rows are the true classes, columns are the predicted classes
from sklearn.metrics import confusion_matrix

confusionMatrix = confusion_matrix(y_true=y_test, y_pred=pred)
print(confusionMatrix)

