# Task: To implement Bagging technique using 100 decision trees

In [1]:
# importing necessary libraries
import pandas as pd
import numpy as np

In [2]:
# importing data
data = pd.read_csv(r"C:\Users\mdine\Downloads\Module_8_assignment\Exercise\churn_prediction.csv")
data.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,...,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
0,1,3135,66,0,0.0,0,187.0,2,755,224.0,...,1458.71,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0
1,6,2531,42,0,2.0,0,1494.0,3,388,58.0,...,1401.72,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1
2,7,263,42,1,0.0,0,1096.0,2,1666,60.0,...,16059.34,15211.29,13798.82,0.36,0.36,857.5,286.07,15719.44,15349.75,0
3,8,5922,72,0,0.0,1,1020.0,1,1,98.0,...,7714.19,7859.74,11232.37,0.64,0.64,1299.64,439.26,7076.06,7755.98,0
4,9,1145,46,0,0.0,0,623.0,2,317,172.0,...,8519.53,6511.82,16314.17,0.27,0.27,443.13,5688.44,8563.84,5317.04,0


In [3]:
predictors = data.drop(columns=['churn','customer_id'])
target = data['churn']

## Steps to implement bagging technique with "n" number of trees
1. Determine the (n_trees) number of trees
2. Make (n_trees) number of bootstrap samples
3. For each bootstrap samples build a decision tree model and generate predictions
4. For every observation in test set, calculate the model of predictions made by n_trees
5. Calculate F1 score for the final predictions

In [4]:
#splitting data
train = data.sample(frac=0.75, replace = False)
test = data.append(train)
test = test.drop_duplicates(keep=False)

#check
train.shape, test.shape

  test = data.append(train)


((16550, 21), (5517, 21))

In [5]:
# step1 : Setting n_trees_trees
n_trees = 100

In [7]:
# Step 2: Defining the BootStrap function
def BootStrap(data, n_trees, fraction=1):
    '''
    Function to generate bootstrap samples.
    data : data to generate bootstrap samples from
    n_samples : number of samples to create
    fraction : what fraction of data should be each sample (default=1)

    Return: a list of bootstrap samples (each sample is a DataFrame)
    '''
    bootstrap_samples = []
    
    for _ in range(n_trees):
        # Randomly sample with replacement from the data
        sample = data.sample(frac=fraction, replace=True)
        bootstrap_samples.append(sample)
    
    return bootstrap_samples

# Generate bootstrap samples
bootstrap_samples = BootStrap(train, n_trees)

# Check the number of bootstrap samples
print("Number of bootstrap samples:", len(bootstrap_samples))

# Access an individual bootstrap sample (for example, the first sample)
first_bootstrap_sample = bootstrap_samples[0]
print("First bootstrap sample:")
print(first_bootstrap_sample)

Number of bootstrap samples: 100
First bootstrap sample:
       customer_id  vintage  age  gender  dependents  occupation    city  \
1017          1401     6117   80       1         0.0           0  1020.0   
15871        21755     1314   23       1         0.0           0   848.0   
7844         10718     2686   42       0         0.0           2  1271.0   
8789         12047     4651   38       1         0.0           0  1494.0   
8871         12175     2124    9       1         0.0           3   601.0   
...            ...      ...  ...     ...         ...         ...     ...   
19388        26563     1658   58       1         0.0           2  1020.0   
21809        29933     4094   56       1         1.0           2  1020.0   
7930         10858      871   55       0         0.0           0   180.0   
21221        29104     2733   74       0         0.0           0   487.0   
11969        16397     3114   45       1         0.0           0   363.0   

       customer_nw_category  b

In [11]:
def test_BootStrap(train, n_trees):
    for fraction in [1, 0.75, 0.5]:
        bootstrap_samples = BootStrap(train, n_trees, fraction=fraction)
        num_samples = len(bootstrap_samples)
        sample_shape = bootstrap_samples[0].shape if num_samples > 0 else (0, 0)
        print(f'{sample_shape[0]}, {sample_shape[1]}, {num_samples}')


In [13]:

# Call the testing function
test_BootStrap(train, n_trees)

16550, 21, 100
12412, 21, 100
8275, 21, 100


### Expected Outcome

<img src="images/image1.png">

In [14]:
from sklearn.tree import DecisionTreeClassifier

def generate_predictions(train_x, train_y, test_x):
    '''
    train_x: independent variables of sample
    train_y: target variable of sample
    test_x: independent variables of test data

    Return: predictions for the test_x
    '''
    # Create a Decision Tree classifier
    model = DecisionTreeClassifier(random_state=1)
    
    # Fit the model on the bootstrap sample
    model.fit(train_x, train_y)
    
    # Make predictions on the test data
    predictions = model.predict(test_x)
    
    return predictions


In [15]:
# Test the generate_predictions function
expected_length = test.shape[0]
predictions = generate_predictions(train.drop(columns=['churn']), train['churn'], test.drop(columns=['churn']))

print('Expected length of predictions:', expected_length)
print('Generated predictions shape:', len(predictions))

# Check if the lengths match
if len(predictions) == expected_length:
    print('Lengths match!')
else:
    print('Lengths do not match!')


Expected length of predictions: 5517
Generated predictions shape: 5517
Lengths match!


In [16]:
from scipy.stats import mode
import numpy as np

def Bagging(n_trees, train_data, test_data, sample_fraction):
    '''
    n_trees: number of trees
    train_data: the training dataset (including independent variables and target variable)
    test_data: the test dataset (including independent variables and target variable)
    sample_fraction: fraction of data to create each bootstrap sample
    
    Return: final predictions of the bagging technique
    '''
    final_predictions = []

    for _ in range(n_trees):
        # Generate a bootstrap sample
        bootstrap_sample = BootStrap(train_data, 1, sample_fraction)[0]

        # Separate the independent variables and target variable
        bootstrap_train_x = bootstrap_sample.drop(columns=['churn'])
        bootstrap_train_y = bootstrap_sample['churn']

        # Generate predictions for the bootstrap sample
        bootstrap_predictions = generate_predictions(bootstrap_train_x, bootstrap_train_y, test_data.drop(columns=['churn']))

        # Append the predictions to the final_predictions list
        final_predictions.append(bootstrap_predictions)

    # Calculate the mode of predictions for each sample
    mode_predictions = [mode(sample_predictions, axis=0).mode[0] for sample_predictions in np.array(final_predictions).T]

    return mode_predictions


In [17]:
bagging_predictions = Bagging(n_trees, train, test, 1)


  mode_predictions = [mode(sample_predictions, axis=0).mode[0] for sample_predictions in np.array(final_predictions).T]


In [18]:
bagging_predictions

[0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
