# Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn import tree
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings('ignore')

# Load Data and Preprocess

In [2]:
#reading the dataset
df=pd.read_csv("data_updated.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,0,5,1,1,1,2,1,3,1,1,0
1,1,5,4,4,5,7,10,3,2,1,0
2,2,3,1,1,1,2,2,3,1,1,0
3,3,6,8,8,1,3,4,3,7,1,0
4,4,4,1,1,3,2,1,3,1,1,0


In [3]:
# cols = df.columns

# for i in cols:
#     if df[i].isnull().sum() != 0:
#         print("Column name is:", i)
#         print(df[i].isnull().sum())

# #filling missing values
# print(df['Gender'].value_counts())
# df['Gender'].fillna('Male', inplace=True)

# print(df['Married'].value_counts())
# df['Married'].fillna('Yes', inplace=True)

# print(df['Dependents'].value_counts())
# df['Dependents'].fillna('0', inplace=True)

# print(df['Self_Employed'].value_counts())
# df['Self_Employed'].fillna('No', inplace=True)

# print(df.LoanAmount.describe())
# df['LoanAmount'].fillna(df.LoanAmount.mean(), inplace = True)

# print(df['Loan_Amount_Term'].value_counts())
# df['Loan_Amount_Term'].fillna(512, inplace=True)

# print(df['Credit_History'].value_counts())
# df['Credit_History'].fillna(1.0, inplace=True)

# # Get categorical columns
# cat_cols = []
# for i in cols:
#     if df[i].dtypes == 'object' and i!= 'Loan_ID':
#         print(i)
#         cat_cols.append(i)

# # Do label encoding for categorical columns
# le = LabelEncoder()
# for i in cat_cols:
#     df[i] = le.fit_transform(df[i])

# #split dataset into train and test
train, test = train_test_split(df, test_size=0.3, random_state=0)

x_train=train.drop(['Class'], axis=1)
y_train=train['Class']

x_test=test.drop(['Class'], axis=1)
y_test=test['Class']

In [4]:
model1 = KNeighborsClassifier()
model1.fit(x_train,y_train)
model1.score(x_test,y_test)

0.9708029197080292

In [5]:
model2 = SGDClassifier(random_state=1)
model2.fit(x_train,y_train)
model2.score(x_test,y_test)

0.8759124087591241

In [6]:
model3 = GaussianNB()
model3.fit(x_train,y_train)
model3.score(x_test,y_test)

0.9416058394160584

# MaxVoting 

In [7]:
model1 = KNeighborsClassifier()
model2 = SGDClassifier(random_state=1)
model3 = GaussianNB()
model = VotingClassifier(estimators=[('lr', model1), ('dt', model2), ('NB', model3)])
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.9562043795620438

# Weighted Averaging 

In [10]:
model1 = KNeighborsClassifier()
model2 = SGDClassifier(random_state=1, loss="modified_huber", max_iter=10, penalty="l1")
model3 = GaussianNB()

model1.fit(x_train,y_train)
model2.fit(x_train,y_train)
model3.fit(x_train,y_train)

pred1 = model1.predict_proba(x_test)
pred2 = model2.predict_proba(x_test)
pred3 = model3.predict_proba(x_test)

weighted_prediction = (0.5*pred1)+(0.2*pred2)+(0.3*pred3)
labelprediction = np.argmax(weighted_prediction, axis = 1)

accuracy_score(labelprediction, y_test)

0.9708029197080292

In [11]:
model1 = KNeighborsClassifier()
model2 = SGDClassifier(random_state=1, loss="modified_huber", max_iter=10, penalty="l1")
model3 = GaussianNB()
model = VotingClassifier(estimators=[('dt', model1), ('gnb', model2), ('lr', model3)],voting='soft',weights=[0.5,0.2,0.3])
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.9708029197080292

# Power Averaging

In [None]:
model1 = SGDClassifier(random_state=1, loss="modified_huber", max_iter=10, penalty="l1")
model2 = GaussianNB()
model3= KNeighborsClassifier()

model1.fit(x_train,y_train)
model2.fit(x_train,y_train)
model3.fit(x_train,y_train)

pred1 = model1.predict_proba(x_test)
pred2 = model2.predict_proba(x_test)
pred3 = model3.predict_proba(x_test)

weighted_prediction = ((pred1**2)+(pred2**2)+(pred3**2))/3
labelprediction = np.argmax(weighted_prediction, axis = 1)
from sklearn.metrics import roc_auc_score, r2_score
accuracy_score(labelprediction, y_test)
# weighted_prediction

# Stacking

Consider the following data. The train data has been divided into 4-folds

<img src="https://miro.medium.com/max/660/1*yesnizWjGSNGsUmlkhX18w.png">

We will be building an ensemble of 3 models. Each model will be trained on any 3 folds of the training data, and will make prediction on the 4th fold.

<img src="https://miro.medium.com/max/624/1*yYFpm4Duauymbqmcs7pqTA.png">

This process will be repeated for all possible combinations of 3 folds, which will give a prediction for the entire training data. This is shown in the below image.

<img src="https://miro.medium.com/max/624/1*zpYK59ERadLpks69gxANAw.png">

Similarly, we will do the same thing with 2 more models, and finally we will ahve 3 different sets of predictions on the entire training data, given by the 3 models we choose initially.

<img src="https://miro.medium.com/max/624/1*Cuwvlt9nEh70o9RzFSFrTQ.png">

Finally, a meta-model will be trained with these predictions as features and the original target variable.

<img src="https://miro.medium.com/max/624/1*BhSO1IVsXtbfMN6uIqKydA.png">

With stacking, the predictions of the meta-model on the test set will be better than the predictions of any of the 3 models alone on the test set.

In [12]:
def Stacking(model,train,y,test,n_fold):
    folds=StratifiedKFold(n_splits=n_fold,random_state=1)
    test_pred=np.empty((test.shape[0],1),float)
    train_pred=np.empty((0,1),float)
    
    for train_indices,val_indices in folds.split(train,y.values):
        x_train,x_val=train.iloc[train_indices],train.iloc[val_indices]
        y_train,y_val=y.iloc[train_indices],y.iloc[val_indices]

        model.fit(X=x_train,y=y_train)
        train_pred=np.append(train_pred,model.predict(x_val))
        
    model.fit(train, y)
    test_pred=model.predict(test)
    return test_pred.reshape(-1,1),train_pred

In [13]:
model1 = SGDClassifier(random_state=1, loss="modified_huber", max_iter=10, penalty="l1")

test_pred1 ,train_pred1=Stacking(model=model1,n_fold=10, train=x_train,test=x_test,y=y_train)
accuracy_score(test_pred1, y_test)

0.9197080291970803

In [14]:
train_pred1=pd.DataFrame(train_pred1)
test_pred1=pd.DataFrame(test_pred1)


In [20]:
model2 = GaussianNB()

test_pred2 ,train_pred2=Stacking(model=model2,n_fold=10,train=x_train,test=x_test,y=y_train)
accuracy_score(test_pred2, y_test)

0.9247648902821317

In [16]:
train_pred2=pd.DataFrame(train_pred2)
test_pred2=pd.DataFrame(test_pred2)

In [17]:
df = pd.concat([train_pred1, train_pred2y_train], axis=1)
df_test = pd.concat([test_pred1, test_pred2], axis=1)

model = KNeighborsClassifier()
model.fit(df,y_train)
model.score(df_test, y_test)

0.9708029197080292

In [18]:
stack = StackingClassifier(estimators=[('dt',model1),('lr',model2)], final_estimator=model,cv=10)
stack.fit(x_train,y_train)
stack.score(x_test,y_test)

0.9708029197080292

##Another Way to Stack your models
There is another library, called mlxtend that offers the Stacking Classifier.

Here are the links to these:
1.    [StackingClassifier](http://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/)
2.    [StackingClassifierCV](http://rasbt.github.io/mlxtend/user_guide/classifier/StackingCVClassifier/)

# Blending 

Blending is very similar to stacking, the only difference is that here we do not consider folds of training data, we rather split the training data into train and validation sets. This set is called the holdout set.
<img src="https://miro.medium.com/max/624/1*5RXUF92qwpx-1BkcTejIoQ.png">

Rest of the process is same as that in stacking, we train 3 models on the train data, and each of them makes a prediction on the holdout set. These predictions are then used to train the meta-model.

<img src="https://miro.medium.com/max/624/1*pvGUnMAycqvsYAwVe2LpHw.png">

In [21]:
train, test = train_test_split(train, test_size=0.2, random_state=0)

x_train=train.drop(['Class'], axis=1)
y_train=train['Class']

x_val=test.drop(['Class'], axis=1)
y_val=test['Class']

x_val = x_val.reset_index(drop = True)
x_test = x_test.reset_index(drop = True)

model1 = SGDClassifier(random_state=1, loss="modified_huber", max_iter=10, penalty="l1")
model1.fit(x_train, y_train)

val_pred1=model1.predict(x_val)
test_pred1=model1.predict(x_test)

val_pred1=pd.DataFrame(val_pred1)
test_pred1=pd.DataFrame(test_pred1)

model2 = GaussianNB()
model2.fit(x_train,y_train)

val_pred2=model2.predict(x_val)
test_pred2=model2.predict(x_test)

val_pred2=pd.DataFrame(val_pred2)
test_pred2=pd.DataFrame(test_pred2)

In [22]:
df_val = pd.concat([x_val, val_pred1,val_pred2],axis=1)
df_test = pd.concat([x_test, test_pred1,test_pred2],axis=1)

model = LogisticRegression(random_state=1)
model.fit(df_val,y_val)
model.score(df_test,y_test)

0.9781021897810219

# Stacking vs Blending

The benefit of blending over stacking is that there is absolute no chance of any information leak here. Whereas in stacking, since we are training the base models on the entire train data, this method is not very robust against information leak.

Although both give near similar results, the choice of which approach to take is more of a developer's personal choice. There is no clear demarcation which to use when, and which is better than the other.

#Bagging and Boosting

Prof Raghavan covered these topics extensivel, we will not be going over them again.

These are a few algorithms based on Bagging and Boosting:


1.   Random Forest
2.   XGBoost
3.   AdaBoost
4.   CatBoost
5.   GBM
6.   LightGBM

If you have any queries, we are happy to take them up.




# Resources



1.   [MLWave Article for Ensembling](https://mlwave.com/kaggle-ensembling-guide)
2.   [Medium Article for Stacking and Blending](https://medium.com/@stevenyu530_73989/stacking-and-blending-intuitive-explanation-of-advanced-ensemble-methods-46b295da413c)
3.   [TDS Article for Ensembling](https://towardsdatascience.com/ensemble-methods-in-machine-learning-what-are-they-and-why-use-them-68ec3f9fef5f)
4.   [Analytics Vidhya Article for Ensembling](https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-for-ensemble-models/)
5.   [Scikit-Learn Documentation for Stacking Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingClassifier.html#sklearn.ensemble.StackingClassifier)
6.   [Various Averaging Techniques](https://www.kaggle.com/c/siim-isic-melanoma-classification/discussion/165653)
