# Import libraries

In [1]:
# sklearn libs
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Lasso

# project-local helpers (data handling and classifier fitting)
from src.handleData import handleData
from src.handleClassifier import handleClassifier

# Load data

In [2]:
# Inputs for loading and splitting the dataset, named once so the two
# "species" uses below cannot drift apart.
DATA_PATH = "./data/data.csv"
ID_COLUMN = "id"  # presumably the identifier column of the CSV - confirm in handleData.loadData
TARGET_COLUMN = "species"
SPLIT_RATIO = 0.7  # presumably the training share of the split - confirm in handleData.splitData

hD = handleData(DATA_PATH)  # data helper built from the CSV path
df = hD.loadData(ID_COLUMN)  # full dataset
labels = hD.encodeLabels(df, TARGET_COLUMN)  # labels derived from the "species" column

# Training and testing frames, split with the same "species" column.
dfTrain, dfTest = hD.splitData(df, SPLIT_RATIO, TARGET_COLUMN)

# Without any feature trimming

In [3]:
# The selected classifiers and their config.
#
# Each entry is a plain dict handed to handleClassifier.fitClassifiers:
#   "name"        - estimator name, as a string
#   "preprocess"  - boolean flag (presumably toggles a preprocessing/scaling
#                   step - confirm in handleClassifier)
#   "config"      - optional; presumably keyword arguments for the estimator
#                   itself (confirm in handleClassifier)
#   "fitStrategy" - "option" selects "CV" or "GridSearch"; "config" carries
#                   the settings for that strategy
#
# Every entry owns its own nested dicts on purpose: later cells edit
# classifierList in place, so sharing one dict between entries would alias
# their settings.
#
# NOTE(review): scikit-learn 1.2 renamed `base_estimator` to `estimator` and
# 1.4 removed the old name. When upgrading, update both the "config" key and
# the `clf__base_estimator__*` grid keys of the two ensemble entries.
classifierList = [
    {
        "name": "LinearDiscriminantAnalysis",
        "preprocess": False,
        "fitStrategy": {
            "option": "CV",
            "config": {"n_jobs": -1, "scoring": "accuracy"},
        },
    },
    {
        "name": "QuadraticDiscriminantAnalysis",
        "preprocess": False,
        "fitStrategy": {
            "option": "CV",
            "config": {"n_jobs": -1, "scoring": "accuracy"},
        },
    },
    {
        "name": "LogisticRegression",
        "preprocess": True,
        "fitStrategy": {
            "option": "CV",
            "config": {"scoring": "accuracy"},
        },
    },
    {
        "name": "SGDClassifier",
        "preprocess": True,
        # "loss" is an SGDClassifier argument, so it belongs under "config"
        # like the ensemble entries' "base_estimator", not at the top level
        # of the entry next to "fitStrategy".
        "config": {"loss": "log_loss"},
        "fitStrategy": {
            "option": "CV",
            "config": {"scoring": "accuracy"},
        },
    },
    {
        "name": "AdaBoostClassifier",
        "preprocess": False,
        "config": {"base_estimator": DecisionTreeClassifier()},
        "fitStrategy": {
            "option": "GridSearch",
            "config": {
                "param_grid": {
                    "clf__base_estimator__max_depth": [1, 5, 10],
                    "clf__n_estimators": [50, 75, 100],
                    "clf__learning_rate": [0.01, 0.1, 1.0],
                },
                "n_jobs": -1,
                "scoring": "accuracy",
            },
        },
    },
    {
        "name": "BaggingClassifier",
        "preprocess": False,
        "config": {"base_estimator": DecisionTreeClassifier()},
        "fitStrategy": {
            "option": "GridSearch",
            "config": {
                "param_grid": {
                    "clf__base_estimator__max_depth": [1, 5, 10],
                    "clf__n_estimators": [50, 75, 100],
                    "clf__max_samples": [0.25, 0.5, 0.75, 1.0],
                    "clf__max_features": [0.25, 0.5, 0.75, 1.0],
                },
                "n_jobs": -1,
                "scoring": "accuracy",
            },
        },
    },
]

hC = handleClassifier()  # construct handleClassifier object
# NOTE(review): the output recorded under this cell lists only four of the six
# classifiers above - re-run the cell to refresh it.
clf = hC.fitClassifiers(dfTrain, classifierList)  # fit

AdaBoostClassifier, GridSearchCV best score = 0.8932332394953603
BaggingClassifier, GridSearchCV best score = 0.86290272130122
LinearDiscriminantAnalysis, CV score = 0.9595766864769054
QuadraticDiscriminantAnalysis, CV score = 0.030267959545407154


# With feature reduction (PCA)

In [4]:
# Change the config for a feature reduction.
# NOTE: this edits the dicts inside classifierList in place; the settings
# used by the previous fit are overwritten.
for classifierDict in classifierList:
    classifierDict["preprocess"] = True
    classifierDict["feature"] = {"option": "reduction"}

    fitStrategy = classifierDict["fitStrategy"]
    fitStrategy["option"] = "GridSearch"

    # Entries that started with the "CV" strategy have no grid yet;
    # setdefault creates one and leaves existing grids untouched.
    paramGrid = fitStrategy["config"].setdefault("param_grid", {})

    # If the feature-selection cell has already run, its grid entry is still
    # present; drop it so this cell can be re-run in any order.
    paramGrid.pop("ftr__estimator__alpha", None)

    # Fractional values - presumably explained-variance targets for the PCA
    # step named in the section heading; confirm in handleClassifier.
    paramGrid["ftr__n_components"] = [0.5, 0.75, 0.85]

clfReduction = hC.fitClassifiers(dfTrain, classifierList)  # fit

AdaBoostClassifier, GridSearchCV best score = 0.9192263580439995
BaggingClassifier, GridSearchCV best score = 0.9292670211656763
LinearDiscriminantAnalysis, GridSearchCV best score = 0.9769158586174539
QuadraticDiscriminantAnalysis, GridSearchCV best score = 0.22656657282869358


# With feature selection (Lasso)

In [5]:
# Change the config for a feature selection.
# NOTE: this edits the dicts inside classifierList in place.
for classifierDict in classifierList:
    # Same preprocessing / search settings as the feature-reduction cell,
    # restated here so this cell does not depend on that one having run.
    classifierDict["preprocess"] = True
    fitStrategy = classifierDict["fitStrategy"]
    fitStrategy["option"] = "GridSearch"

    # Assign the whole "feature" dict instead of indexing into it, so a
    # missing "feature" key cannot raise KeyError.
    classifierDict["feature"] = {
        "option": "selection",
        "config": {"estimator": Lasso()},
    }

    paramGrid = fitStrategy["config"].setdefault("param_grid", {})

    # pop() with a default instead of `del`: the key is already gone when
    # this cell is executed a second time.
    paramGrid.pop("ftr__n_components", None)
    paramGrid["ftr__estimator__alpha"] = [1.0, 2.5, 5.0]

clfSelection = hC.fitClassifiers(dfTrain, classifierList)  # fit

AdaBoostClassifier, GridSearchCV best score = 0.85858617453863
BaggingClassifier, GridSearchCV best score = 0.8961422166614537
LinearDiscriminantAnalysis, GridSearchCV best score = 0.9307267229694505
QuadraticDiscriminantAnalysis, GridSearchCV best score = 0.03463663851527474
