# **Stacking Algorithm Implementation**


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

In [None]:
# Load the Breast Cancer dataset and carve it into train / validation (holdout) / test sets.
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()
X, y = data.data, data.target

# Feature table with named columns, kept for inspection.
data_df = pd.DataFrame(data=data.data, columns=data.feature_names)

# 60% of the rows are used to fit the base learners; the remaining 40% is split again below.
X_train, X_rem, y_train, y_rem = train_test_split(
    X, y, train_size=0.6, random_state=97
)

# 30% of the remainder becomes the test set, the other 70% the validation (holdout) set.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rem, y_rem, test_size=0.3, random_state=97
)

# Total number of elements in X (rows * columns).
print(X.size)

17070


In [None]:
# Base (level-0) learners of the stacking ensemble, each fitted on the training split.

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

models = {
    'lr': LogisticRegression(max_iter=100000),
    # Seeded so the fitted tree (and every score derived from it) is the same
    # on each Restart & Run All; 97 matches the seed used for the data splits.
    'cart': DecisionTreeClassifier(random_state=97),
    'bayes': GaussianNB(),
}

for learner in models.values():
  learner.fit(X_train, y_train)

In [None]:
# Base-learner predictions on the validation (holdout) set and on the test set.
# The validation predictions are the inputs the blender is trained on;
# the test predictions are the inputs the blender is evaluated on.
pred1 = models['lr'].predict(X_valid)
pred2 = models['cart'].predict(X_valid)
pred3 = models['bayes'].predict(X_valid)

test_preds1 = models['lr'].predict(X_test)
test_preds2 = models['cart'].predict(X_test)
test_preds3 = models['bayes'].predict(X_test)

# Validation features plus one extra column per base learner's prediction.
data_df_new = pd.DataFrame(data=X_valid, columns=data.feature_names)
data_df_new['lr'] = pred1
data_df_new['cart'] = pred2
data_df_new['bayes'] = pred3

# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the summary.
data_df_new.info()
print(data_df_new.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              159 non-null    float64
 1   mean texture             159 non-null    float64
 2   mean perimeter           159 non-null    float64
 3   mean area                159 non-null    float64
 4   mean smoothness          159 non-null    float64
 5   mean compactness         159 non-null    float64
 6   mean concavity           159 non-null    float64
 7   mean concave points      159 non-null    float64
 8   mean symmetry            159 non-null    float64
 9   mean fractal dimension   159 non-null    float64
 10  radius error             159 non-null    float64
 11  texture error            159 non-null    float64
 12  perimeter error          159 non-null    float64
 13  area error               159 non-null    float64
 14  smoothness error         1

In [None]:
# Show the first five rows of the original feature table.
print(data_df.head(n=5))

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst radius  worst texture  worst perimeter  \
0           

In [None]:

# One row per sample, one column per base learner: these matrices are the
# blender's inputs (validation predictions to fit on, test predictions to score on).
train_stack = np.stack((pred1, pred2, pred3), axis=1)
test_stack = np.stack((test_preds1, test_preds2, test_preds3), axis=1)

In [None]:
# Blender (meta-learner): a logistic regression fitted on the base learners'
# validation-set predictions against the validation labels.
final_model = LogisticRegression(max_iter=100000)

final_model.fit(X=train_stack, y=y_valid)

LogisticRegression(max_iter=100000)

In [None]:
# Blender output for the test set, computed from the base learners' test predictions.
final_predictions = final_model.predict(X=test_stack)

In [None]:
# Score the hand-built stacking ensemble on the test set.

from sklearn import metrics

for label, scorer in (
    ("Accuracy: ", metrics.accuracy_score),
    ("Precision: ", metrics.precision_score),
    ("Recall: ", metrics.recall_score),
):
  print(label, scorer(y_test, final_predictions))

Accuracy:  0.9855072463768116
Precision:  0.9791666666666666
Recall:  1.0


In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

def model_Evaluate(model, y_test, final_predictions, plot_confusion=False):
  """Print a classification report and optionally plot the confusion matrix.

  Args:
    model: Not used by this function; kept so existing calls keep working.
    y_test: True labels.
    final_predictions: Predicted labels for the same samples as ``y_test``.
    plot_confusion: When True, additionally draw a confusion-matrix heatmap.
      The heatmap annotations are laid out as a 2x2 grid, so this option
      only works for binary labels.

  Returns:
    None. The report is printed and the optional figure is shown.
  """
  print(classification_report(y_test, final_predictions))
  if not plot_confusion:
    return

  # Built from the predictions passed in, not from model.predict(X_test):
  # the stacked model's inputs are base-learner predictions, not raw features.
  cf_matrix = confusion_matrix(y_test, final_predictions)
  categories = ['Negative', 'Positive']
  # Order matches the row-major flattening of a binary confusion matrix.
  group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
  group_percentages = ['{0:.2%}'.format(value)
                       for value in cf_matrix.flatten() / np.sum(cf_matrix)]
  labels = [f'{v1}\n{v2}' for v1, v2 in zip(group_names, group_percentages)]
  labels = np.asarray(labels).reshape(2, 2)

  fig, ax = plt.subplots()
  sns.heatmap(cf_matrix, annot=labels, cmap='Blues', fmt='',
              xticklabels=categories, yticklabels=categories, ax=ax)
  ax.set_xlabel("Predicted values", fontdict={'size': 14}, labelpad=10)
  ax.set_ylabel("Actual values", fontdict={'size': 14}, labelpad=10)
  ax.set_title("Confusion Matrix", fontdict={'size': 18}, pad=20)
  plt.show()

model_Evaluate(final_model, y_test, final_predictions)

              precision    recall  f1-score   support

           0       1.00      0.95      0.98        22
           1       0.98      1.00      0.99        47

    accuracy                           0.99        69
   macro avg       0.99      0.98      0.98        69
weighted avg       0.99      0.99      0.99        69



# **StackingClassifier from sklearn**
Use `StackingClassifier` from scikit-learn to build the same stacking ensemble on the breast cancer dataset.
Bagging and RandomForest

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

# Reload the Breast Cancer data and make a plain 80/20 train/test split.
# No separate holdout set is made here.
data = load_breast_cancer()
X, y = data.data, data.target

data_df = pd.DataFrame(data=data.data, columns=data.feature_names)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=97
)

# Same three base learners as in the manual implementation.
model1 = LogisticRegression(max_iter=100000)
# Seeded so the fitted tree is the same on every run; 97 matches the split seed.
model2 = DecisionTreeClassifier(random_state=97)
model3 = GaussianNB()

print(y_test)

[1 0 1 1 0 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 0 0 1 1 0 1 1 1 1 0 0 0 1 0 0 0 0
 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0
 1 1 1 0 1 0 0 1 1 0 1 1 1 0 1 0 1 1 1 0 1 1 1 0 0 1 0 1 0 0 1 1 0 1 0 1 0
 0 0 0]


In [None]:
# Meta-learner that combines the base learners' outputs.
final_model = LogisticRegression(max_iter=100000)

# (name, estimator) pairs for the level-0 learners.
estimators = [('lr', model1), ('cart', model2), ('bayes', model3)]

# cv=10: 10-fold cross-validation is used when fitting the stack.
sclf = StackingClassifier(
    estimators=estimators,
    final_estimator=final_model,
    cv=10,
)



In [None]:
#Fit the StackingClassifier on the training split of the Cancer Dataset

sclf.fit(X_train, y_train)




StackingClassifier(cv=10,
                   estimators=[('lr', LogisticRegression(max_iter=100000)),
                               ('cart', DecisionTreeClassifier()),
                               ('bayes', GaussianNB())],
                   final_estimator=LogisticRegression(max_iter=100000))

In [None]:
# Predicted class labels for the test split from the fitted StackingClassifier.
prediction = sclf.predict(X=X_test)

In [None]:
# Test-set metrics for the StackingClassifier.
print("Accuracy: ",sclf.score(X_test, y_test))
print("Precision: ",metrics.precision_score( y_test, prediction))
# This line reports recall; it was previously printed under the label "Accuracy: ".
print("Recall: ",metrics.recall_score( y_test, prediction))

# model_Evaluate(sclf, y_test, prediction)

Accuracy:  0.9649122807017544
Precision:  0.9565217391304348
Accuracy:  0.9850746268656716


# **Adaboost**

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

# Reload the Breast Cancer data and make an 80/20 train/test split for AdaBoost.
data = load_breast_cancer()
X, y = data.data, data.target

data_df = pd.DataFrame(data=data.data, columns=data.feature_names)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=97
)

# NOTE(review): the models and the estimators list below are not used by the
# AdaBoost cells visible in this notebook - confirm whether they are still needed.
model1 = LogisticRegression(max_iter=100000)
model2 = DecisionTreeClassifier()
model3 = GaussianNB()

estimators = [('lr', model1), ('cart', model2), ('bayes', model3)]

In [None]:
# AdaBoost classifier with its default base estimator.
# random_state seeds the base estimators created at each boosting iteration,
# so the fitted ensemble and its scores are reproducible; 97 matches the split seed.
abc = AdaBoostClassifier(learning_rate=1, random_state=97)
abc.fit(X_train, y_train)


AdaBoostClassifier(learning_rate=1)

In [None]:
# Predicted class labels for the test split from the fitted AdaBoost classifier.
prediction = abc.predict(X=X_test)

In [None]:
from sklearn import metrics

# Test-set metrics for the AdaBoost classifier.
print("Accuracy: ", abc.score(X=X_test, y=y_test))
for label, scorer in (
    ("Precision: ", metrics.precision_score),
    ("Recall: ", metrics.recall_score),
):
  print(label, scorer(y_test, prediction))

# model_Evaluate(abc, y_test, prediction)

Accuracy:  0.9473684210526315
Precision:  0.9552238805970149
Recall:  0.9552238805970149


# **Adaboost Regression on concrete_data.csv.**

In [None]:
from google.colab import drive

# Mount Google Drive so the CSV below is reachable under /content/drive.
drive.mount('/content/drive')

# NOTE(review): the file name is spelled "Contrete.csv" - confirm it matches the file on Drive.
CONCRETE_CSV_PATH = '/content/drive/MyDrive/CE Sem 6/ML/data/Contrete.csv'
datasets = pd.read_csv(CONCRETE_CSV_PATH)


Mounted at /content/drive


In [None]:
# print(datasets['csMPa'])

In [None]:
# Features: every column except the last one.
X = datasets.iloc[:, :-1].to_numpy()

# Target: the last column only (index -1; -2 would select the second-to-last column).
y = datasets.iloc[:, -1].to_numpy()

for heading, values in (("\n\nInput : \n", X), ("\n\nOutput: \n", y)):
  print(heading, values)



Input : 
 [[ 540.     0.     0.  ... 1040.   676.    28. ]
 [ 540.     0.     0.  ... 1055.   676.    28. ]
 [ 332.5  142.5    0.  ...  932.   594.   270. ]
 ...
 [ 148.5  139.4  108.6 ...  892.4  780.    28. ]
 [ 159.1  186.7    0.  ...  989.6  788.9   28. ]
 [ 260.9  100.5   78.3 ...  864.5  761.5   28. ]]


Output: 
 [79.99 61.89 40.27 ... 23.7  32.77 32.4 ]


In [None]:
from sklearn.model_selection import train_test_split

#split data set into train and test sets
# Split the feature matrix X, not the full `datasets` frame: `datasets` still
# contains the target column, which would leak the answer into the features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 97)

# print(y_test)

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

 
#Choosing Decision Tree with 1 level as the weak learner
DTR=DecisionTreeRegressor(max_depth=1)
# The weak learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# positional form works with both names.
# random_state makes the boosting rounds reproducible; 97 matches the split seed.
RegModel = AdaBoostRegressor(DTR, n_estimators=50, learning_rate=1, random_state=97)

In [None]:
# Fit the boosted regressor on the training split, then predict the test split.
# fit() returns the fitted estimator itself, so AB and RegModel are the same object.
AB = RegModel.fit(X=X_train, y=y_train)
y_pred = AB.predict(X=X_test)

# print(predictions)

In [None]:
#Import scikit-learn metrics module
from sklearn import metrics
# model_Evaluate(AB, y_test, predictions) --> doesn't work for continuous values
# A regressor's score() is the coefficient of determination (R^2), not an accuracy.
print("R^2 score: ",RegModel.score(X_test, y_test))

Accuracy:  0.709418382619627


In [None]:
# AdaBoost regression on the scikit-learn diabetes dataset.
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.metrics import mean_squared_error

X, y = load_diabetes(return_X_y=True)

#split data set into train and test sets (20% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 97)

#Choosing a Decision Tree of depth up to 10 as the base learner
DTR = DecisionTreeRegressor(max_depth=10)
# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# positional form works with both names.
# random_state makes the boosting rounds reproducible; 97 matches the split seed.
RegModel = AdaBoostRegressor(DTR, n_estimators=100, learning_rate=1, random_state=97)

# fit() returns the fitted estimator itself, so AB and RegModel are the same object.
AB = RegModel.fit(X_train, y_train)
y_pred = AB.predict(X_test)

# model_Evaluate(AB, y_test, predictions) --> doesn't work for continuous values
# A regressor's score() is the coefficient of determination (R^2), not an accuracy.
print("R^2 score: ", RegModel.score(X_test, y_test))

print("Mean Square Error: ", mean_squared_error(y_test, y_pred))

Accuracy:  0.4162370689717848
Mean Square Error:  3753.21890164292
