This file will go through the bagging technique of ensemble learning.  

In [7]:
# imports 
import random
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


Step 1: Generate a random dataset with 20 samples.  Each should have two input labels and 1 output label.

In [8]:
np.random.seed(120)

# generate random data with last column as target
data = np.random.rand(20,2)
target = np.random.randint(0,2,20)

data = np.concatenate((data, target.reshape(-1,1)), axis=1)

print(data)


[[0.67795555 0.5129588  1.        ]
 [0.62370571 0.47714247 1.        ]
 [0.46025981 0.95086487 0.        ]
 [0.9845756  0.85442255 1.        ]
 [0.59109035 0.20319637 0.        ]
 [0.26236016 0.06641525 0.        ]
 [0.64693501 0.71873295 1.        ]
 [0.35104926 0.95254348 0.        ]
 [0.59547743 0.99450621 0.        ]
 [0.54367354 0.07171263 0.        ]
 [0.14068668 0.20727832 0.        ]
 [0.50552741 0.53539293 1.        ]
 [0.53145508 0.23291118 0.        ]
 [0.60550138 0.70289106 0.        ]
 [0.06662988 0.2366768  1.        ]
 [0.15895731 0.97330243 0.        ]
 [0.71025347 0.08285516 1.        ]
 [0.54836989 0.1424483  0.        ]
 [0.10721273 0.61757862 0.        ]
 [0.91187275 0.90530837 0.        ]]


Step 2: Generate 10 training datasets with a size of 20, by sampling with repetition and produce output as a 10 x 20 matrix

In [14]:
np.random.seed(120)

training_data_lst = []

for _ in range(10):
    indices = np.random.choice(20,20, replace=True)
    training_data = np.random.rand(20,2)[indices]
    training_target = np.random.randint(0,2,20)[indices]
    training_data = np.concatenate((training_data, training_target.reshape(-1,1)), axis=1)  
    training_data_lst.append(training_data)
    
# convert into numpy array
training_data_lst = np.array(training_data_lst)
print(training_data_lst)
#

    


[[[0.15895731 0.97330243 0.        ]
  [0.59547743 0.99450621 1.        ]
  [0.10721273 0.61757862 1.        ]
  [0.81672161 0.45674108 0.        ]
  [0.0643363  0.46147051 1.        ]
  [0.10621653 0.33940194 1.        ]
  [0.54367354 0.07171263 0.        ]
  [0.31941841 0.69046799 0.        ]
  [0.54836989 0.1424483  0.        ]
  [0.54836989 0.1424483  0.        ]
  [0.54836989 0.1424483  0.        ]
  [0.06662988 0.2366768  1.        ]
  [0.18047251 0.05859209 1.        ]
  [0.26232429 0.26463542 0.        ]
  [0.54367354 0.07171263 0.        ]
  [0.53145508 0.23291118 1.        ]
  [0.60550138 0.70289106 1.        ]
  [0.15895731 0.97330243 0.        ]
  [0.26232429 0.26463542 0.        ]
  [0.29197057 0.85226147 0.        ]]

 [[0.65102413 0.29737596 0.        ]
  [0.89960275 0.67574196 0.        ]
  [0.42978814 0.40965052 0.        ]
  [0.76086314 0.63246615 1.        ]
  [0.38643837 0.51336481 0.        ]
  [0.33365635 0.00433633 1.        ]
  [0.18771727 0.70312797 0.        ]

Step 3: Train 10 classifiers models on the 10 datasets

In [18]:
# save the log reg models in a list or specifically their accuracy metric
log_reg_models = []


for i in range(10):
    # create a new log reg model 
    log_reg = LogisticRegression(solver='liblinear', C=1.0)
    
    # access first two cols of np array subarray
    sub_np_arr = training_data_lst[i]
    X = sub_np_arr[:, :2]
    y = sub_np_arr[:, -1]
    # train and predict on each dataset from training_data_lst 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # fit the log_reg on the training data and predict with testing
    log_reg.fit(X_train, y_train)
    y_pred = log_reg.predict(X_test)
    
    # predict accuracy to get the score then save to array
    accuracy = accuracy_score(y_test, y_pred)
    log_reg_models.append(accuracy)
    
    
print(log_reg_models)
    
    




[0.5, 0.75, 0.5, 0.25, 0.75, 0.5, 1.0, 0.75, 0.75, 0.75]


Use Majority Voting to get combine results and produce final output.

In [30]:
# perform majority voting using acc scores 
majority_vote = np.mean(log_reg_models) >= 0.5

# make a dictionary with key as the accuracy score and value is the number of times it occurs.

mode = dict()

for i in log_reg_models:
    if i in mode:
        mode[i] += 1
    else:
        mode[i] = 1

max_val = max(mode.values())
max_key = [key for key, value in mode.items() if max_val == value]

print(f"Key: {max_key}, Value: {max_val}")
print(majority_vote)

Key: [0.75], Value: 5
True
