In [10]:
import os

import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Location of the accounts multiclass CSV. Set the ACC_DATA_PATH environment
# variable to load a copy stored elsewhere; when it is unset, the original
# hard-coded local path is used so existing runs behave exactly as before.
file_path = os.environ.get(
    'ACC_DATA_PATH',
    '/home/darshan/Documents/Accounts_Multiclass_dataset.csv',
)
df_acc = pd.read_csv(file_path)

# Summarise the loaded frame: column names, non-null counts and dtypes.
df_acc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4382 entries, 0 to 4381
Data columns (total 36 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   Unnamed: 0                                           4382 non-null   int64  
 1   madison_logic_engagement_score                       4382 non-null   float64
 2   six_sence_engagement                                 4382 non-null   int64  
 3   outreach_calls_total_calls                           4382 non-null   float64
 4   outreach_calls_answered_calls                        4382 non-null   float64
 5   outreach_calls_inbound_calls                         4382 non-null   float64
 6   outreach_calls_outbound_calls                        4382 non-null   float64
 7   outreach_calls_total_answered_duration               4382 non-null   float64
 8   outreach_emails_frequency                            4382 non-null  

In [3]:
# List the distinct labels of the target column (displayed as the cell output).
pd.unique(df_acc["Furthest_Stage_Reached"])

array(['0-40', '40-60', '70-100'], dtype=object)

In [4]:
# Build the feature matrix by removing the target column and the
# "Unnamed: 0" column (presumably a saved row index - verify against the CSV).
X = df_acc.drop(["Furthest_Stage_Reached", "Unnamed: 0"], axis=1)
# The target is the stage label column on its own.
y = df_acc.loc[:, "Furthest_Stage_Reached"]

In [5]:
# Map the string stage labels to integer class ids. The fitted encoder is
# kept so predictions can be mapped back with label_encoder.inverse_transform.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

In [6]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each partition, one line per array.
print(
    *(
        f"Shape of {name}: {part.shape}"
        for name, part in (
            ("X_train", X_train),
            ("X_test", X_test),
            ("y_train", y_train),
            ("y_test", y_test),
        )
    ),
    sep="\n",
)

Shape of X_train: (3505, 34)
Shape of X_test: (877, 34)
Shape of y_train: (3505,)
Shape of y_test: (877,)


In [7]:
# Hyper-parameter grid: 3 values for each of 4 parameters = 81 candidates,
# each evaluated with 5-fold cross-validation below.
param_grid = dict(
    n_estimators=[50, 100, 150],          # how many base trees are bagged
    max_samples=[0.5, 0.7, 1.0],          # fraction of rows drawn per tree
    max_features=[0.5, 0.7, 1.0],         # fraction of columns drawn per tree
    estimator__max_depth=[None, 5, 10],   # depth limit of each tree (None = unlimited)
)

# Bagging ensemble over decision trees; both the ensemble and the base
# tree are seeded.
bagging_classifier = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    random_state=42,
)

# Exhaustive search over the grid, scored by macro-averaged F1 and run on
# all available cores.
grid_search = GridSearchCV(
    estimator=bagging_classifier,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
)

# Run the search on the training partition only.
grid_search.fit(X_train, y_train)

In [8]:
# Show the winning hyper-parameter combination and its mean
# cross-validated macro-F1.
print(f"Best Parameters: {grid_search.best_params_!s}")
print(f"Best Cross-Validation F1 Score: {grid_search.best_score_!s}")

Best Parameters: {'estimator__max_depth': 10, 'max_features': 0.7, 'max_samples': 0.7, 'n_estimators': 100}
Best Cross-Validation F1 Score: 0.6076436728604089


In [11]:
# Take the best model found by the grid search. GridSearchCV was created
# with its default refit=True, so best_estimator_ has already been refitted
# on the full training partition; fitting it again here would only repeat
# the same seeded training run.
best_model = grid_search.best_estimator_

# Evaluate on the held-out test set with macro-averaged F1, the same
# metric the grid search optimised ('f1_macro').
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred, average='macro')
print("Test F1 Score:", test_f1)

Test F1 Score: 0.6209534035832865
