# Max-Voting

### Alistando el modelo


In [1]:
import os
import pandas as pd

In [2]:
#os.chdir(".../Chapter 2")
#os.getcwd()

#### Download the dataset Cryotherapy.csv from the github location and copy the same to your working directory. Let's read the dataset.

In [3]:
cryotherapy_data = pd.read_csv("https://raw.githubusercontent.com/PacktPublishing/Ensemble-Machine-Learning-Cookbook/master/Chapter02/Cryotherapy.csv")

#### Let's take a glance at the data with the below code:

In [4]:
cryotherapy_data.head(91)

Unnamed: 0,sex,age,Time,Number_of_Warts,Type,Area,Result_of_Treatment
0,1,35,12.00,5,1,100,0
1,1,29,7.00,5,1,96,1
2,1,50,8.00,1,3,132,0
3,1,32,11.75,7,3,750,0
4,1,67,9.25,1,1,42,0
...,...,...,...,...,...,...,...
85,2,34,12.00,3,3,95,0
86,2,20,3.50,6,1,75,1
87,2,35,8.25,8,3,100,0
88,1,24,10.75,10,1,20,1


In [5]:
cryotherapy_data.shape

(90, 7)

### How to do it...

#### We import the required libraries for building the decision tree, support vector machines and logistic regression models. We also import VotingClassifier for max voting

In [6]:
# Import required libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

#### We move onto building our feature set and creating our train & test dataset

In [7]:
# We create train & Test sample from our dataset
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split

# create feature & response variables
feature_columns = ['sex', 'age', 'Time', 'Number_of_Warts', 'Type', 'Area']
X = cryotherapy_data[feature_columns]
Y = cryotherapy_data['Result_of_Treatment']

In [8]:
# Create train & test sets
X_train, X_test, Y_train, Y_test = \
train_test_split(X, Y, test_size=0.20, random_state=1)

### Hard Voting

#### We build our models with decision tree, support vector machines and logistic regression algorithms

In [9]:
# create the sub models
estimators = []

dt_model = DecisionTreeClassifier(random_state=1)
estimators.append(('DecisionTree', dt_model))

svm_model = SVC(random_state=1)
estimators.append(('SupportVector', svm_model))

logit_model = LogisticRegression(random_state=1)
estimators.append(('Logistic Regression', logit_model))

#dt_model.fit(X_train,Y_train)
#svm_model.fit(X_train,Y_train)
#knn_model.fit(X_train,Y_train)

#### We build individual models with each of the classifiers we have chosen

In [10]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

for each_estimator in (dt_model, svm_model, logit_model):
    each_estimator.fit(X_train, Y_train)
    Y_pred = each_estimator.predict(X_test)
    print(each_estimator.__class__.__name__, accuracy_score(Y_test, Y_pred))
    print(confusion_matrix(Y_test, Y_pred))

DecisionTreeClassifier 0.8333333333333334
[[8 2]
 [1 7]]
SVC 0.4444444444444444
[[ 0 10]
 [ 0  8]]
LogisticRegression 0.9444444444444444
[[9 1]
 [0 8]]


#### We proceed to ensemble our models and use VotingClassifier to score accuracy

In [11]:
# Using VotingClassifier() to build ensemble model with Hard Voting
ensemble_model = VotingClassifier(estimators=estimators, voting='hard')
ensemble_model.fit(X_train,Y_train)
predicted_labels = ensemble_model.predict(X_test)            
print("Classifier Accuracy using Hard Voting: ", accuracy_score(Y_test, predicted_labels))
print(confusion_matrix(Y_test, predicted_labels))

Classifier Accuracy using Hard Voting:  0.8333333333333334
[[7 3]
 [0 8]]


### Soft Voting

#### The below code creates an ensemble using soft voting:

In [12]:
# create the sub models
estimators = []

dt_model = DecisionTreeClassifier(random_state=1)
estimators.append(('DecisionTree', dt_model))

svm_model = SVC(random_state=1, probability=True)
estimators.append(('SupportVector', svm_model))

logit_model = LogisticRegression(random_state=1)
estimators.append(('Logistic Regression', logit_model))

for each_estimator in (dt_model, svm_model, logit_model):
    each_estimator.fit(X_train, Y_train)
    Y_pred = each_estimator.predict(X_test)
    print(each_estimator.__class__.__name__, accuracy_score(Y_test, Y_pred))
    print(confusion_matrix(Y_test, Y_pred))
    
# Using VotingClassifier() to build ensemble model with Soft Voting
ensemble_model = VotingClassifier(estimators=estimators, voting='soft')
ensemble_model.fit(X_train,Y_train)
predicted_labels = ensemble_model.predict(X_test)
print("Classifier Accuracy using Soft Voting: ", accuracy_score(Y_test, predicted_labels))
print(confusion_matrix(Y_test, predicted_labels))

DecisionTreeClassifier 0.8333333333333334
[[8 2]
 [1 7]]
SVC 0.4444444444444444
[[ 0 10]
 [ 0  8]]
LogisticRegression 0.9444444444444444
[[9 1]
 [0 8]]
Classifier Accuracy using Soft Voting:  0.8888888888888888
[[9 1]
 [1 7]]


# Averaging

#### We download the dataset whitewines.csv from the github location and copy the same to your working directory. Let's read the dataset.

In [13]:
wine_data = pd.read_csv("https://raw.githubusercontent.com/PacktPublishing/Ensemble-Machine-Learning-Cookbook/master/Chapter02/whitewines.csv")

#### Let's take a glance at the data with the below code

In [14]:
wine_data.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,6.7,0.62,0.24,1.1,0.039,6.0,62.0,0.9934,3.41,0.32,10.4,5
1,5.7,0.22,0.2,16.0,0.044,41.0,113.0,0.99862,3.22,0.46,8.9,6
2,5.9,0.19,0.26,7.4,0.034,33.0,123.0,0.995,3.49,0.42,10.1,6
3,5.3,0.47,0.1,1.3,0.036,11.0,74.0,0.99082,3.48,0.54,11.2,4
4,6.4,0.29,0.21,9.65,0.041,36.0,119.0,0.99334,2.99,0.34,10.933333,6


#### We import the required libraries

In [15]:
# Import required libraries
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

#### We create the response and the feature set

In [16]:
# Create feature and response variable set
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split

# create feature & response variables
feature_columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',\
                   'chlorides', 'free sulfur dioxide', 'total sulfur dioxide',\
                   'density', 'pH', 'sulphates', 'alcohol']
X = wine_data[feature_columns]
Y = wine_data['quality']

#### We split our data into train & test set

In [17]:
# Create train & test sets
X_train, X_test, Y_train, Y_test = \
train_test_split(X, Y, test_size=0.30, random_state=1)

#### We build our base regression learners with linear regression, SVR & decision tree

In [18]:
# Build base learners
linreg_model = LinearRegression()
svr_model = SVR()
regressiontree_model = DecisionTreeRegressor()


linreg_model.fit(X_train, Y_train)
svr_model.fit(X_train, Y_train)
regressiontree_model.fit(X_train, Y_train)

DecisionTreeRegressor()

#### Use the base learners to predict on the test data

In [19]:
linreg_predictions = linreg_model.predict(X_test)
svr_predictions = svr_model.predict(X_test)
regtree_predictions = regressiontree_model.predict(X_test)

In [20]:
regtree_predictions

array([5., 5., 6., ..., 6., 6., 6.])

#### We add the predictions and divide by the number of base learners

In [21]:
average_predictions=(linreg_predictions + svr_predictions + regtree_predictions)/3

In [22]:
average_predictions

array([5.24185811, 5.33852934, 5.7580648 , ..., 5.78764674, 6.18555224,
       5.70970165])

In [23]:
Y_test

2414    5
1584    4
3248    6
645     6
3163    5
       ..
1523    7
3634    6
432     5
739     6
2908    5
Name: quality, Length: 1470, dtype: int64

# Weighted Averaging


In [27]:
#os.chdir(".../Chapter 2")
#os.getcwd()

#### We download the Diagnostic Wisconsin Breast Cancer database  wisc_bc_data.csv from the github location and copy the same to your working directory. Let's read the dataset.

In [28]:
cancer_data = pd.read_csv("https://raw.githubusercontent.com/PacktPublishing/Ensemble-Machine-Learning-Cookbook/master/Chapter02/wisc_bc_data.csv")

#### Let's take a look at the data with the below code

In [29]:
cancer_data.head(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,symmetry_mean,dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,points_se,symmetry_se,dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
0,87139402,B,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,0.037,0.1959,0.05955,0.236,0.6656,1.67,17.43,0.008045,0.0118,0.01683,0.01241,0.01924,0.002248,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
1,8910251,B,10.6,18.95,69.28,346.4,0.09688,0.1147,0.06387,0.02642,0.1922,0.06491,0.4505,1.197,3.43,27.1,0.00747,0.03581,0.03354,0.01365,0.03504,0.003318,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.294,0.07587
2,905520,B,11.04,16.83,70.92,373.2,0.1077,0.07804,0.03046,0.0248,0.1714,0.0634,0.1967,1.387,1.342,13.54,0.005158,0.009355,0.01056,0.007483,0.01718,0.002198,12.41,26.44,79.93,471.4,0.1369,0.1482,0.1067,0.07431,0.2998,0.07881
3,868871,B,11.28,13.39,73.0,384.8,0.1164,0.1136,0.04635,0.04796,0.1771,0.06072,0.3384,1.343,1.851,26.33,0.01127,0.03498,0.02187,0.01965,0.0158,0.003442,11.92,15.77,76.53,434.0,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784
4,9012568,B,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,0.02657,0.1721,0.05544,0.1783,0.4125,1.338,17.72,0.005012,0.01485,0.01551,0.009155,0.01647,0.001767,16.2,15.73,104.5,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766


In [30]:
cancer_data.shape

(569, 32)

In [31]:
cancer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 569 non-null    int64  
 1   diagnosis          569 non-null    object 
 2   radius_mean        569 non-null    float64
 3   texture_mean       569 non-null    float64
 4   perimeter_mean     569 non-null    float64
 5   area_mean          569 non-null    float64
 6   smoothness_mean    569 non-null    float64
 7   compactness_mean   569 non-null    float64
 8   concavity_mean     569 non-null    float64
 9   points_mean        569 non-null    float64
 10  symmetry_mean      569 non-null    float64
 11  dimension_mean     569 non-null    float64
 12  radius_se          569 non-null    float64
 13  texture_se         569 non-null    float64
 14  perimeter_se       569 non-null    float64
 15  area_se            569 non-null    float64
 16  smoothness_se      569 non

In [32]:
cancer_data.describe()

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,symmetry_mean,dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,points_se,symmetry_se,dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,30371830.0,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,0.405172,1.216853,2.866059,40.337079,0.007041,0.025478,0.031894,0.011796,0.020542,0.003795,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,125020600.0,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,0.277313,0.551648,2.021855,45.491006,0.003003,0.017908,0.030186,0.00617,0.008266,0.002646,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,0.1115,0.3602,0.757,6.802,0.001713,0.002252,0.0,0.0,0.007882,0.000895,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,869218.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,0.2324,0.8339,1.606,17.85,0.005169,0.01308,0.01509,0.007638,0.01516,0.002248,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,906024.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,0.3242,1.108,2.287,24.53,0.00638,0.02045,0.02589,0.01093,0.01873,0.003187,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,8813129.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,0.4789,1.474,3.357,45.19,0.008146,0.03245,0.04205,0.01471,0.02348,0.004558,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,911320500.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,2.873,4.885,21.98,542.2,0.03113,0.1354,0.396,0.05279,0.07895,0.02984,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


#### We import the required libraries

In [33]:
# Import required libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

#### We create the response and the feature set

In [34]:
# Create feature and response variable set
# We create train & Test sample from our dataset
from sklearn.model_selection import train_test_split

# create feature & response variables
X = cancer_data.iloc[:,2:32]
Y = cancer_data['diagnosis']

In [35]:
# Create train & test sets
X_train, X_test, Y_train, Y_test = \
train_test_split(X, Y, test_size=0.30, random_state=1)

#### We build our base classifier models

In [36]:
# create the sub models
estimators = []

dt_model = DecisionTreeClassifier()
estimators.append(('DecisionTree', dt_model))

svm_model = SVC(probability=True)
estimators.append(('SupportVector', svm_model))

logit_model = LogisticRegression()
estimators.append(('Logistic Regression', logit_model))

#### We fit our models on the test data

In [37]:
dt_model.fit(X_train, Y_train)
svm_model.fit(X_train, Y_train)
logit_model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [38]:
#### We use the predict_proba() function to predict the class probabilities

In [39]:
dt_predictions = dt_model.predict_proba(X_test)
svm_predictions = svm_model.predict_proba(X_test)
logit_predictions = logit_model.predict_proba(X_test)

#### We assign different weights to each of the models to get our final predictions

In [40]:
weighted_average_predictions=(dt_predictions * 0.3 + svm_predictions * 0.4 + logit_predictions * 0.3)

In [41]:
weighted_average_predictions

array([[2.55984263e-02, 9.74401574e-01],
       [1.01029450e-02, 9.89897055e-01],
       [2.29324794e-01, 7.70675206e-01],
       [5.50796315e-03, 9.94492037e-01],
       [9.86311091e-01, 1.36889088e-02],
       [9.57127878e-01, 4.28721223e-02],
       [9.94240973e-01, 5.75902747e-03],
       [8.07628521e-01, 1.92371479e-01],
       [9.94313015e-01, 5.68698523e-03],
       [9.87834292e-01, 1.21657085e-02],
       [9.90489064e-01, 9.51093602e-03],
       [9.93608864e-01, 6.39113563e-03],
       [9.52034414e-01, 4.79655856e-02],
       [9.90574363e-01, 9.42563678e-03],
       [9.85543551e-01, 1.44564492e-02],
       [2.05154826e-05, 9.99979485e-01],
       [9.88568406e-01, 1.14315941e-02],
       [9.44916746e-01, 5.50832544e-02],
       [4.28022070e-01, 5.71977930e-01],
       [9.95572768e-01, 4.42723227e-03],
       [9.78194290e-01, 2.18057099e-02],
       [9.86512776e-01, 1.34872240e-02],
       [5.63450961e-01, 4.36549039e-01],
       [9.40775484e-01, 5.92245164e-02],
       [9.843358

In [41]:
# Challenge: crea un modelo ponderado con un dataset de tu gusto obteniendo una clasificación binaria, elige 3 modelos de algoritmo diferente de clasificación
