# Import libs

In [2]:
# sklearn libs
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Lasso

# implemented structure
from src.handleData import handleData
from src.handleClassifier import handleClassifier

# Load data

In [3]:
# Values passed to the data helper, named once so they are easy to spot and tune.
dataPath = "./data/data.csv"
idColumn = "id"  # handed to loadData(); its exact role is defined in src.handleData
labelColumn = "species"  # column the labels are encoded from and the split is keyed on
splitRatio = 0.7  # presumably the training share of the split -- confirm in src.handleData

# Build the data helper and read the CSV.
hD = handleData(dataPath)
df = hD.loadData(idColumn)

# Encode the labels, then derive the training and testing sets.
labels = hD.encodeLabels(df, labelColumn)
dfTrain, dfTest = hD.splitData(df, splitRatio, labelColumn)

# Without any feature trimming

In [4]:
# Configuration of the selected classifiers.
#
# Every entry carries a "name", a "preprocess" flag and a "fitStrategy"; the two
# ensemble entries additionally carry a "config" holding their base estimator.
# The small factories below return a *fresh* dict on every call, because the
# following cells modify each classifier's strategy dict in place.


def _commonFitOptions():
    """Return a new dict with the options shared by every fit strategy."""
    return {"n_jobs": -1, "scoring": "accuracy"}


def _crossValidation():
    """Return a new "CV" fit strategy that only uses the shared options."""
    return {"option": "CV", "config": _commonFitOptions()}


def _gridSearch(paramGrid):
    """Return a new "GridSearch" fit strategy searching over ``paramGrid``."""
    return {
        "option": "GridSearch",
        "config": {"param_grid": paramGrid, **_commonFitOptions()},
    }


# NOTE(review): the "clf__" prefix presumably targets a pipeline step named "clf"
# built inside src.handleClassifier -- confirm there.
classifierList = [
    {
        "name": "LinearDiscriminantAnalysis",
        "preprocess": False,
        "fitStrategy": _crossValidation(),
    },
    {
        "name": "QuadraticDiscriminantAnalysis",
        "preprocess": False,
        "fitStrategy": _crossValidation(),
    },
    {
        "name": "DecisionTreeClassifier",
        "preprocess": False,
        "fitStrategy": _gridSearch(
            {
                "clf__min_samples_leaf": [1, 2, 3, 4, 5],
                "clf__min_samples_split": [2, 4, 6, 8, 10],
                "clf__max_depth": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            }
        ),
    },
    {
        "name": "RandomForestClassifier",
        "preprocess": False,
        "fitStrategy": _gridSearch(
            {
                "clf__min_samples_leaf": [1, 2, 3, 4, 5],
                "clf__min_samples_split": [2, 4, 6, 8, 10],
                "clf__n_estimators": [50, 75, 100],
                "clf__max_depth": [1, 2, 3, 4, 5],
            }
        ),
    },
    {
        "name": "AdaBoostClassifier",
        "preprocess": False,
        "config": {"base_estimator": DecisionTreeClassifier()},
        "fitStrategy": _gridSearch(
            {
                "clf__base_estimator__max_depth": [1, 5, 10],
                "clf__n_estimators": [50, 75, 100],
                "clf__learning_rate": [0.01, 0.1, 1.0],
            }
        ),
    },
    {
        "name": "BaggingClassifier",
        "preprocess": False,
        "config": {"base_estimator": DecisionTreeClassifier()},
        "fitStrategy": _gridSearch(
            {
                "clf__base_estimator__max_depth": [1, 5, 10],
                "clf__n_estimators": [50, 75, 100],
                "clf__max_samples": [0.25, 0.5, 0.75, 1.0],
                "clf__max_features": [0.25, 0.5, 0.75, 1.0],
            }
        ),
    },
]

# Build the classifier helper and fit every configured classifier on the training set.
hC = handleClassifier()
clf = hC.fitClassifiers(dfTrain, classifierList)

AdaBoostClassifier, GridSearchCV best score = 0.8888854134084037
BaggingClassifier, GridSearchCV best score = 0.864320717339172
DecisionTreeClassifier, GridSearchCV best score = 0.17901157335001563
LinearDiscriminantAnalysis, CV score = 0.9610363882806798
QuadraticDiscriminantAnalysis, CV score = 0.02452299030340945
RandomForestClassifier, GridSearchCV best score = 0.7214888958398499


# With feature reduction (PCA)

In [5]:
# Reconfigure every classifier (in place) for a run with feature reduction:
# preprocessing on, "reduction" as the feature option, and a grid search that
# also covers the reduction step's number of components.
for classifierDict in classifierList:
    classifierDict["preprocess"] = True
    classifierDict["feature"] = {"option": "reduction"}

    fitStrategy = classifierDict["fitStrategy"]
    fitStrategy["option"] = "GridSearch"

    # Classifiers that were fitted with plain CV have no grid yet; create an empty one.
    paramGrid = fitStrategy["config"].setdefault("param_grid", dict())
    paramGrid["ftr__n_components"] = [0.5, 0.75, 0.85]

# Fit all classifiers again with the updated configuration.
clfReduction = hC.fitClassifiers(dfTrain, classifierList)

AdaBoostClassifier, GridSearchCV best score = 0.8932436659368157
BaggingClassifier, GridSearchCV best score = 0.9335731414868105
DecisionTreeClassifier, GridSearchCV best score = 0.4561046814722135
LinearDiscriminantAnalysis, GridSearchCV best score = 0.9696799082473152
QuadraticDiscriminantAnalysis, GridSearchCV best score = 0.23372953810864355
RandomForestClassifier, GridSearchCV best score = 0.7431237618600772


# With feature selection (Lasso)

In [6]:
# Reconfigure every classifier (in place) for a run with feature selection.
# This cell edits the dicts prepared by the feature-reduction cell, so the
# "feature" entry and the "param_grid" are expected to exist already.
for classifierDict in classifierList:
    classifierDict["feature"]["option"] = "selection"
    # A new Lasso instance per classifier, so no estimator object is shared.
    classifierDict["feature"]["config"] = {"estimator": Lasso()}

    paramGrid = classifierDict["fitStrategy"]["config"]["param_grid"]
    # pop() with a default rather than `del`: when this cell is re-run the key
    # has already been removed, and `del` would raise KeyError.
    paramGrid.pop("ftr__n_components", None)
    paramGrid["ftr__estimator__alpha"] = [1.0, 2.5, 5.0]

# Fit all classifiers again with the updated configuration.
clfSelection = hC.fitClassifiers(dfTrain, classifierList)

AdaBoostClassifier, GridSearchCV best score = 0.8384005838807216
BaggingClassifier, GridSearchCV best score = 0.89617349598582
DecisionTreeClassifier, GridSearchCV best score = 0.23806693775414453
LinearDiscriminantAnalysis, GridSearchCV best score = 0.9004379105411322
QuadraticDiscriminantAnalysis, GridSearchCV best score = 0.030288812428318213
RandomForestClassifier, GridSearchCV best score = 0.7257845897195286
