# Imports

In [21]:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import metrics


# Load and split the data

## Classification - breast cancer

In [2]:
# Load the breast-cancer dataset once and derive every view from the same Bunch
# (previously the loader was called three separate times).
bunch = load_breast_cancer()
feature_names = bunch.feature_names
dataset = (bunch.data, bunch.target)  # same (X, y) pair that return_X_y=True yields

# Wrap the raw arrays in pandas objects so the columns carry the feature names
Xc = pd.DataFrame(bunch.data, columns=feature_names)
yc = pd.Series(bunch.target)

# Split into train/test (80/20); stratify on the label so both parts keep the class ratio
Xc_train, Xc_test, yc_train, yc_test = train_test_split(Xc, yc, test_size=0.2, random_state=42, stratify=yc)

## Regression - car prices

In [3]:
# Location of the car-details CSV used for the regression examples
data = "https://storage.googleapis.com/edulabs-public-datasets/CAR%20DETAILS%20FROM%20CAR%20DEKHO.csv"

# Read the table and add a 'manufacturer' column holding the first word of 'name'
df = pd.read_csv(data)
df = df.assign(manufacturer=df['name'].str.split(' ').str[0])

# Target is the selling price; features are everything else except the free-text name,
# with categorical columns one-hot encoded (first level dropped)
yr = df['selling_price']
Xr = pd.get_dummies(
    df.drop(columns=['name', 'selling_price']),
    drop_first=True,
)

# 80/20 train/test split with a fixed seed
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    Xr, yr, test_size=0.2, random_state=42
)

# Voting - regression

In [11]:
# Three different base regressors combined by a VotingRegressor
reg1 = KNeighborsRegressor(n_neighbors=1)
reg2 = DecisionTreeRegressor(max_depth=4, random_state=42)  # seeded so re-runs are repeatable
reg3 = LinearRegression()
# Labels now match the estimators (the linear model was previously labelled 'rf')
ereg = VotingRegressor(estimators=[('knn', reg1), ('cart', reg2), ('lr', reg3)])
ereg.fit(Xr_train, yr_train)

# Predict once per split and reuse the predictions for both metrics;
# r2_score on these predictions is the same quantity that ereg.score() reports.
yr_pred_test = ereg.predict(Xr_test)
yr_pred_train = ereg.predict(Xr_train)
print(f"R2 score (test): {metrics.r2_score(yr_test, yr_pred_test)}")
print(f"R2 score (train): {metrics.r2_score(yr_train, yr_pred_train)}")
print(f"MAPE (test): {metrics.mean_absolute_percentage_error(yr_test, yr_pred_test)}")
print(f"MAPE (train): {metrics.mean_absolute_percentage_error(yr_train, yr_pred_train)}")

R2 score (test): 0.6067640250796862
R2 score (train): 0.8931566513292928
MAPE (test): 0.4274312762695505
MAPE (train): 0.3077222976170646


# Voting - hard classifier

In [19]:
# Base learners for hard voting.
# NOTE(review): LogisticRegression is fit on unscaled features with max_iter=100, which
# yields the ConvergenceWarning recorded in this cell's output; presumably this is the
# problem the Exercise section asks the reader to fix, so it is left as-is here.
cls1 = LogisticRegression(max_iter=100, random_state=433)
cls2 = RandomForestClassifier(random_state=433, n_estimators=100)
cls3 = KNeighborsClassifier()

base_learners = [('lr', cls1), ('rf', cls2), ('knn', cls3)]
voting_clf = VotingClassifier(estimators=base_learners, voting='hard')
voting_clf.fit(Xc_train, yc_train)

# Predict once on the held-out split and reuse the result for both reports
yc_pred = voting_clf.predict(Xc_test)
print("\nClassification Report:\n", classification_report(yc_test, yc_pred))
print("\nConfusion Matrix:\n", confusion_matrix(yc_test, yc_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.90      0.92        42
           1       0.95      0.96      0.95        72

    accuracy                           0.94       114
   macro avg       0.94      0.93      0.93       114
weighted avg       0.94      0.94      0.94       114


Confusion Matrix:
 [[38  4]
 [ 3 69]]


# Voting - soft classifier

In [20]:
# Same three base learners as the hard-voting cell, combined with voting='soft'.
# NOTE(review): LogisticRegression is again fit on unscaled features with max_iter=100,
# which yields the ConvergenceWarning recorded in this cell's output.
cls1 = LogisticRegression(max_iter=100, random_state=433)
cls2 = RandomForestClassifier(random_state=433, n_estimators=100)
cls3 = KNeighborsClassifier()

base_learners = [('lr', cls1), ('rf', cls2), ('knn', cls3)]
voting_clf = VotingClassifier(estimators=base_learners, voting='soft')
voting_clf.fit(Xc_train, yc_train)

# Predict once on the held-out split and reuse the result for both reports
yc_pred = voting_clf.predict(Xc_test)
print("\nClassification Report:\n", classification_report(yc_test, yc_pred))
print("\nConfusion Matrix:\n", confusion_matrix(yc_test, yc_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.90      0.93        42
           1       0.95      0.97      0.96        72

    accuracy                           0.95       114
   macro avg       0.95      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114


Confusion Matrix:
 [[38  4]
 [ 2 70]]


# Exercise

**Fix the problem shown in the cells above, then re-run the voting classifier and the voting regressor. Also tune the hyperparameters and try different base models to get better results.**

# Stacking

In [24]:
# Base learners whose predictions feed the final estimator
cls1 = RandomForestClassifier(n_estimators=100, random_state=433)
cls2 = DecisionTreeClassifier(max_depth=4, random_state=433)  # seeded so re-runs are repeatable
cls3 = KNeighborsClassifier()

# Final estimator trained on the base learners' outputs
lr = LogisticRegression()
sclf = StackingClassifier(
    # The decision tree is labelled 'dt' (it was previously mislabelled 'lr',
    # which clashed with the logistic-regression final estimator).
    estimators=[('rf', cls1), ('dt', cls2), ('knn', cls3)],
    final_estimator=lr,
    cv=None,            # None -> use the default 5-fold cross validation
    passthrough=False,  # False: final_estimator is trained only on the base estimators' predictions.
                        # True: it is trained on those predictions plus the original training data.
)

# Train the stacking model
sclf.fit(Xc_train, yc_train)


In [25]:
# Evaluate the fitted stacking model on the held-out split (single predict call, reused)
yc_pred_stack = sclf.predict(Xc_test)
print("\nClassification Report:\n", classification_report(yc_test, yc_pred_stack))
print("\nConfusion Matrix:\n", confusion_matrix(yc_test, yc_pred_stack))


Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94        42
           1       0.96      0.97      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114


Confusion Matrix:
 [[39  3]
 [ 2 70]]
