### Hard voting

In [1]:
!pip install tqdm




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [27]:
import pandas as pd
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree, DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import *
from sklearn.preprocessing import *
from sklearn.compose import *
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.ensemble import VotingClassifier, VotingRegressor, RandomForestClassifier
from tqdm import tqdm

In [3]:
df = pd.read_csv(r"D:\Cases\Sonar\Sonar.csv")
df.head()
df.isnull().sum()

V1       0
V2       0
V3       0
V4       0
V5       0
        ..
V57      0
V58      0
V59      0
V60      0
Class    0
Length: 61, dtype: int64

In [4]:
x, y = df.drop('Class', axis=1), df['Class']

In [5]:
le = LabelEncoder()
y = le.fit_transform(y)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42, stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((145, 60), (63, 60), (145,), (63,))

In [7]:
knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=7)
dtc1 = DecisionTreeClassifier(random_state=42)
dtc2 = DecisionTreeClassifier(random_state=42, max_depth=3)
nb = GaussianNB()

In [8]:
voting = VotingClassifier(estimators=[
    ('Tree1', dtc1),
    ('Tree2', dtc2),
    ("Naive_Bayes", nb),
    ('Knn1', knn1),
    ('Knn2', knn2)
])

In [9]:
voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)

In [10]:
print(classification_report(y_test, y_pred))
print()
for i in range(len(voting.estimators_)):
    print('Estimators:', voting.estimators_[i])
    print('Accuracy_score:', accuracy_score(y_test, voting.estimators_[i].predict(X_test)))



              precision    recall  f1-score   support

           0       0.72      0.76      0.74        34
           1       0.70      0.66      0.68        29

    accuracy                           0.71        63
   macro avg       0.71      0.71      0.71        63
weighted avg       0.71      0.71      0.71        63


Estimators: DecisionTreeClassifier(random_state=42)
Accuracy_score: 0.6190476190476191
Estimators: DecisionTreeClassifier(max_depth=3, random_state=42)
Accuracy_score: 0.7142857142857143
Estimators: GaussianNB()
Accuracy_score: 0.6190476190476191
Estimators: KNeighborsClassifier()
Accuracy_score: 0.7301587301587301
Estimators: KNeighborsClassifier(n_neighbors=7)
Accuracy_score: 0.7142857142857143


### soft voting

In [11]:
voting_soft = VotingClassifier(estimators=[
    ('Tree1', dtc1),
    ('Tree2', dtc2),
    ("Naive_Bayes", nb),
    ('Knn1', knn1),
    ('Knn2', knn2)
],voting='soft')

In [12]:
voting_soft.fit(X_train, y_train)

0,1,2
,estimators,"[('Tree1', ...), ('Tree2', ...), ...]"
,voting,'soft'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,priors,
,var_smoothing,1e-09

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,

0,1,2
,n_neighbors,7
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [13]:
print(classification_report(y_test, y_pred))
print()
for i in range(len(voting_soft.estimators_)):
    print('Estimators:', voting_soft.estimators_[i])
    print('Accuracy_score:', accuracy_score(y_test, voting_soft.estimators_[i].predict(X_test)))

              precision    recall  f1-score   support

           0       0.72      0.76      0.74        34
           1       0.70      0.66      0.68        29

    accuracy                           0.71        63
   macro avg       0.71      0.71      0.71        63
weighted avg       0.71      0.71      0.71        63


Estimators: DecisionTreeClassifier(random_state=42)
Accuracy_score: 0.6190476190476191
Estimators: DecisionTreeClassifier(max_depth=3, random_state=42)
Accuracy_score: 0.7142857142857143
Estimators: GaussianNB()
Accuracy_score: 0.6190476190476191
Estimators: KNeighborsClassifier()
Accuracy_score: 0.7301587301587301
Estimators: KNeighborsClassifier(n_neighbors=7)
Accuracy_score: 0.7142857142857143


### weighting

In [14]:
voting_weigh = VotingClassifier(estimators=[
    ('Tree1', dtc1),
    ('Tree2', dtc2),
    ('Knn1', knn1),
    ('Knn2', knn2),
    ("Naive_Bayes", nb)
],weights=[7,6,7.5,8.1,6.3])

In [15]:
voting_weigh.fit(X_train, y_train)

0,1,2
,estimators,"[('Tree1', ...), ('Tree2', ...), ...]"
,voting,'hard'
,weights,"[7, 6, ...]"
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,

0,1,2
,n_neighbors,7
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,

0,1,2
,priors,
,var_smoothing,1e-09


In [16]:
print(classification_report(y_test, y_pred))
print()
for i in range(len(voting_weigh.estimators_)):
    print('Estimators:', voting_weigh.estimators_[i])
    print('Accuracy_score:', accuracy_score(y_test, voting_weigh.estimators_[i].predict(X_test)))

              precision    recall  f1-score   support

           0       0.72      0.76      0.74        34
           1       0.70      0.66      0.68        29

    accuracy                           0.71        63
   macro avg       0.71      0.71      0.71        63
weighted avg       0.71      0.71      0.71        63


Estimators: DecisionTreeClassifier(random_state=42)
Accuracy_score: 0.6190476190476191
Estimators: DecisionTreeClassifier(max_depth=3, random_state=42)
Accuracy_score: 0.7142857142857143
Estimators: KNeighborsClassifier()
Accuracy_score: 0.7301587301587301
Estimators: KNeighborsClassifier(n_neighbors=7)
Accuracy_score: 0.7142857142857143
Estimators: GaussianNB()
Accuracy_score: 0.6190476190476191


In [17]:
from category_encoders import TargetEncoder
import pandas as pd

# Sample data with high cardinality
data = pd.DataFrame({
    'category': [f'cat_{i%20}' for i in range(1000)],  # 20 categories
    'target': [i % 2 for i in range(1000)]  # Binary target
})

encoder = TargetEncoder()
encoded = encoder.fit_transform(data['category'], data['target'])
print(f"Original categories: {data['category'].nunique()}")
print(f"Encoded features: 1 column")
encoded

Original categories: 20
Encoded features: 1 column


Unnamed: 0,category
0,0.023713
1,0.976287
2,0.023713
3,0.976287
4,0.023713
...,...
995,0.976287
996,0.023713
997,0.976287
998,0.023713


In [18]:
import pandas as pd

# High cardinality data
data = pd.DataFrame({
    'category': [f'cat_{i%50}' for i in range(1000)]  # 50 categories
})

# Frequency encoding
freq_map = data['category'].value_counts().to_dict()

data['category_encoded'] = data['category'].map(freq_map)

print(f"Original: {data['category'].nunique()} categories")
print(f"Encoded: 1 numerical feature")
data['category_encoded']
freq_map

Original: 50 categories
Encoded: 1 numerical feature


{'cat_0': 20,
 'cat_1': 20,
 'cat_2': 20,
 'cat_3': 20,
 'cat_4': 20,
 'cat_5': 20,
 'cat_6': 20,
 'cat_7': 20,
 'cat_8': 20,
 'cat_9': 20,
 'cat_10': 20,
 'cat_11': 20,
 'cat_12': 20,
 'cat_13': 20,
 'cat_14': 20,
 'cat_15': 20,
 'cat_16': 20,
 'cat_17': 20,
 'cat_18': 20,
 'cat_19': 20,
 'cat_20': 20,
 'cat_21': 20,
 'cat_22': 20,
 'cat_23': 20,
 'cat_24': 20,
 'cat_25': 20,
 'cat_26': 20,
 'cat_27': 20,
 'cat_28': 20,
 'cat_29': 20,
 'cat_30': 20,
 'cat_31': 20,
 'cat_32': 20,
 'cat_33': 20,
 'cat_34': 20,
 'cat_35': 20,
 'cat_36': 20,
 'cat_37': 20,
 'cat_38': 20,
 'cat_39': 20,
 'cat_40': 20,
 'cat_41': 20,
 'cat_42': 20,
 'cat_43': 20,
 'cat_44': 20,
 'cat_45': 20,
 'cat_46': 20,
 'cat_47': 20,
 'cat_48': 20,
 'cat_49': 20}

### Decision Tree Regressor

In [19]:
from sklearn.linear_model import LinearRegression

conk = pd.read_csv(r"D:\Cases\Concrete_Strength\Concrete_Data.csv")
X = conk.drop("Strength", axis=1)
y = conk["Strength"]
X.shape, y.shape
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=25)
dtc = DecisionTreeRegressor(random_state=25)
lr = LinearRegression()
el = ElasticNet()
voting = VotingRegressor([("TREE", dtc), ("LR", lr), ("EL", el)], weights=[100, 0, 50])
voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)
print(r2_score(y_test, y_pred))


0.8544426265430402


### Bagging- bootstrap sampling

In [20]:
df = pd.read_csv(r"D:\Cases\Sonar\Sonar.csv")
df.head()
df.isnull().sum()

V1       0
V2       0
V3       0
V4       0
V5       0
        ..
V57      0
V58      0
V59      0
V60      0
Class    0
Length: 61, dtype: int64

In [21]:
X,y = df.drop('Class',axis = 1),df['Class']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=25)

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

dtc = DecisionTreeClassifier(random_state = 25)
knn = KNeighborsClassifier()
nb = GaussianNB()
lr = LogisticRegression()

In [24]:
bagg = BaggingClassifier(random_state = 25,estimator = nb, n_estimators = 10)
bagg.fit(X_train,y_train)
y_pred = bagg.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           M       0.80      0.50      0.62        32
           R       0.63      0.87      0.73        31

    accuracy                           0.68        63
   macro avg       0.71      0.69      0.67        63
weighted avg       0.72      0.68      0.67        63



In [25]:
est_lst = [nb,dtc,knn,lr]
n_est = [10,15,25,50]
scores = []




for e in tqdm(est_lst):
    for n in n_est:
        bagg = BaggingClassifier(random_state = 25,n_estimators = n,estimator = e)
        bagg.fit(X_train,y_train)
        y_pred_prob = bagg.predict_proba(X_test)
        scores.append([e,n,log_loss(y_test,y_pred_prob)])
df_scores = pd.DataFrame(scores,columns = ['Estimator','B-sample','score'])
df_scores.sort_values('score')

100%|██████████| 4/4 [00:03<00:00,  1.02it/s]


Unnamed: 0,Estimator,B-sample,score
5,DecisionTreeClassifier(random_state=25),15,0.398599
6,DecisionTreeClassifier(random_state=25),25,0.406923
7,DecisionTreeClassifier(random_state=25),50,0.41524
4,DecisionTreeClassifier(random_state=25),10,0.422659
13,LogisticRegression(),15,0.488241
12,LogisticRegression(),10,0.489063
14,LogisticRegression(),25,0.48918
15,LogisticRegression(),50,0.491791
9,KNeighborsClassifier(),15,0.518894
8,KNeighborsClassifier(),10,0.528152


### Ramndom Forest Classifier

In [28]:
features = [2,3,4,5,6,7,8,9,10]
scores = []
for f  in features:
    rf = RandomForestClassifier(random_state = 25,max_features = f)
    rf.fit(X_train,y_train)
    pred = rf.predict(X_test)
    scores.append([f,accuracy_score(y_test,y_pred)])
    df_scores = pd.DataFrame(scores,columns = ['features','score'])
    df_scores.sort_values('score',ascending = False)

In [29]:
df_scores

Unnamed: 0,features,score
0,2,0.68254
1,3,0.68254
2,4,0.68254
3,5,0.68254
4,6,0.68254
5,7,0.68254
6,8,0.68254
7,9,0.68254
8,10,0.68254
