In [None]:
import pandas as pd

In [None]:
# Relative path of the processed Cleveland data file (read in a later cell).
filename = "../Dataset/processed.cleveland.data"



In [None]:
# Read the comma-separated file into a DataFrame.
# header=None: the first line is treated as data, not column names.
# na_values="?": "?" entries are parsed as missing values.
heartData = pd.read_csv(
    filename,
    sep=",",
    header=None,
    na_values="?",
)
heartData.head()

In [None]:
# Assign names to the 14 columns; the last one, 'label', is the target.
heartData.columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
    'label',
]
heartData.head()

In [None]:
# Set every label greater than 0 to 1; all other values are left unchanged.
positive_label = heartData['label'] > 0
heartData.loc[positive_label, 'label'] = 1

heartData.head()


In [None]:
# Keep only the rows that contain no missing value; heartData itself is not modified.
newheart = heartData.dropna(axis="index")
newheart.shape

In [None]:
# Seperating X and y variables

y = newheart.pop('label')
y.shape

In [None]:
X = newheart
X.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

**Creating the preprocessing engine**

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Single-step pipeline that applies StandardScaler to numeric columns.
numeric_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
    ]
)

In [None]:
# Labels of the columns in X whose dtype is int64 or float64.
numeric_features = (
    X
    .select_dtypes(include=['int64', 'float64'])
    .columns
)


In [None]:
from sklearn.compose import ColumnTransformer
# Route the columns listed in numeric_features through numeric_transformer.
num_step = ('num', numeric_transformer, numeric_features)
preprocessor = ColumnTransformer(transformers=[num_step])

**Spot checking different models**

In [None]:
# Importing necessary libraries
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [None]:
# Candidate models for the spot check. Each uses its default settings,
# except that random_state is fixed to 123 where it is passed.
classifiers = [
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=123),
    AdaBoostClassifier(random_state=123),
    LogisticRegression(random_state=123),
]

In [None]:
# Spot check: for each candidate, fit preprocessor -> PCA (10 components) ->
# classifier on the training split and print its score on the test split.
# NOTE(review): candidates are compared on X_test, the same split that is
# used for the final report; consider cross-validating on the training
# data for model selection instead.
for classifier in classifiers:
    spot_check_steps = [
        ('preprocessor', preprocessor),
        ('dimred', PCA(10)),
        ('classifier', classifier),
    ]
    estimator = Pipeline(steps=spot_check_steps)
    estimator.fit(X_train, y_train)
    test_score = estimator.score(X_test, y_test)
    print(classifier)
    print("model score: %.2f" % test_score)

**Grid Search**

In [None]:
# Pipeline to be tuned: preprocessor -> PCA -> logistic regression.
# PCA's number of components is left unset here; it is supplied by the
# parameter grid.
pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('dimred', PCA()),
        ('classifier', LogisticRegression(random_state=123)),
    ]
)



In [None]:

# Search space, keyed as <pipeline step name>__<parameter>:
# the PCA component count plus the penalty, C and solver of the classifier.
param_grid = {
    'dimred__n_components': [10, 11, 12, 13],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': [1, 3, 5],
    'classifier__solver': ['liblinear'],
}
    

In [None]:
from sklearn.model_selection import GridSearchCV
# Build (but do not yet fit) a 10-fold cross-validated grid search over
# param_grid for the pipeline.
estimator = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=10,
)


In [None]:
# Run the grid search on the training split only.
estimator.fit(X=X_train, y=y_train)


In [None]:
# Report the best cross-validation score and the parameter combination
# that produced it.
best_summary = "Best: %f using %s" % (estimator.best_score_, estimator.best_params_)
print(best_summary)

In [None]:
# Predict labels for the held-out test split with the fitted grid search.
pred = estimator.predict(X=X_test)

In [None]:
# Printing the classification report
from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred): ground truth first,
# predictions second. Passing them the other way round swaps precision and
# recall and reports support from the predicted counts.
print(classification_report(y_test, pred))