# Miscellaneous

In [None]:
# imports
%load_ext autoreload
%autoreload 2
import os
import sys

import matplotlib as mpl
from sklearn.model_selection import train_test_split
from sklearn.tree import export_text, DecisionTreeClassifier, DecisionTreeRegressor

import numpy as np
import pandas as pd

import imodels
from imodels.util.data_util import get_clean_dataset # this was used to get the datasets

from copy import deepcopy

# cross-validation of models
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# timing algorithm execution
import time

: 

In [None]:
# Repositories used for tests (location: paper_autors_repo/config/shrinkage/models.py) - copied here only because they are easier to find this way

# Each entry is a 3-tuple: (key used for the task, dataset identifier passed
# to the loader, data source passed to the loader).

# Classification datasets from the original random forests paper
# (page 9: https://www.stat.berkeley.edu/~breiman/randomforest2001.pdf).
DATASETS_CLASSIFICATION = [
    ("heart", "heart", "imodels"),
    # NOTE: this is the wrong breast-cancer dataset
    # (https://new.openml.org/search?type=data&sort=runs&id=13&status=active)
    ("breast-cancer", "breast_cancer", "imodels"),
    ("haberman", "haberman", "imodels"),
    ("ionosphere", "ionosphere", "pmlb"),
    ("diabetes", "diabetes", "pmlb"),
    ("german-credit", "german", "pmlb"),
    ("juvenile", "juvenile_clean", "imodels"),
    ("recidivism", "compas_two_year_clean", "imodels"),
]

# Regression datasets. The red-wine and geographical-music datasets are not
# listed here; they are added to this list later.
DATASETS_REGRESSION = [
    ("friedman1", "friedman1", "synthetic"),
    ("friedman3", "friedman3", "synthetic"),
    ("diabetes-regr", "diabetes", "sklearn"),
    ("abalone", "183", "openml"),
    ("satellite-image", "294_satellite_image", "pmlb"),
    ("california-housing", "california_housing", "sklearn"),
]

# Hyperparameter grids used in the paper.

# Number of leaves.
num_of_leaves = [
    2, 4, 8, 12, 15,
    20, 24, 28, 30, 32,
]

# Regularization parameter.
reg_hs = [
    0.1, 1.0, 10.0,
    25.0, 50.0, 100.0,
]

# Load every listed dataset into a DataFrame whose target column is "label".
# Classification and regression use identical loading steps, so a single
# nested loop fills both registries: classification first, then regression.
tasks_classification = {}
tasks_regression = {}

for registry, dataset_specs in (
    (tasks_classification, DATASETS_CLASSIFICATION),
    (tasks_regression, DATASETS_REGRESSION),
):
    for task in dataset_specs:
        # task = (registry key, dataset identifier, data source)
        X, y, feature_names = get_clean_dataset(task[1], data_source=task[2])
        df = pd.DataFrame(X, columns=feature_names)
        df["label"] = y
        registry[task[0]] = df
    
# Add the datasets that are missing from the lists above (not present in code).
# Red-wine: every column, the target included, has its mean subtracted and is
# divided by its standard deviation; "quality" is then renamed to "label".
wine = pd.read_csv("data/missing_data/winequality-red.csv", sep=";")
wine = wine.sub(wine.mean()).div(wine.std())
wine = wine.rename(columns={"quality": "label"})

tasks_regression["red-wine"] = wine
# The data-source field here is the literal string "None", not the None object.
DATASETS_REGRESSION.append(("red-wine", "red-wine", "None"))

# Geographical-music dataset. The file has no header row, so columns are
# numbered; columns 116 and 117 are both part of the label (a 2D output:
# RF should and can predict latitude/longitude).
# NOTE(review): the earlier note said this dataset is "omitted for now", yet
# it is registered below and placed at the front of DATASETS_REGRESSION -
# confirm which is intended.
music = pd.read_csv("data/missing_data/geo-music-big.txt", header=None).rename(
    columns={116: "label1", 117: "label2"}
)
tasks_regression["music"] = music
DATASETS_REGRESSION.insert(0, ("music", "music", "None"))