In [10]:
# Standard library
from pprint import pprint

# Library for loading dataset
import pandas as pd

# Library for preprocessing
from sklearn.preprocessing import LabelEncoder

# Library for selection
from sklearn.model_selection import train_test_split

# Library for AutoML
import autosklearn.regression as auto_r

In [11]:
# Read the cleaned diamonds table from the working directory.
# NOTE(review): the preview of this frame lists an 'Unnamed: 0' column, which
# suggests the CSV was written together with its index -- confirm whether
# that column is actually needed.
DATA_FILE = 'clean_diamonds.csv'
df = pd.read_csv(DATA_FILE)

In [12]:
# Show the loaded frame as this cell's output (bare last expression).
df

Unnamed: 0,index,carat,cut,color,clarity,depth,table,price,x,y,z
0,0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...,...
53938,53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74
53939,53939,0.75,Ideal,D,SI2,62.2,55.0,2757,5.83,5.87,3.64
53940,53940,0.71,Premium,E,SI1,60.5,55.0,2756,5.79,5.74,3.49
53941,53941,0.71,Premium,F,SI1,59.8,62.0,2756,5.74,5.73,3.43


In [13]:
# Drop the index, table and depth columns.
# `df` is rebound (no inplace=True) and errors='ignore' is passed so the cell
# can be re-run safely: a second execution no longer raises a KeyError for
# columns that were already removed by the first one.
df = df.drop(columns=['index', 'table', 'depth'], errors='ignore')

In [14]:
# Encode the categorical (object) columns as integers.
# Every column gets its own LabelEncoder, stored in `encoders`, so each
# label <-> integer mapping stays available afterwards (encoders[col].classes_,
# encoders[col].inverse_transform). Refitting one shared encoder for all three
# columns would only retain the mapping of the last column fitted.
# Note: LabelEncoder assigns codes by sorted label order, not by quality grade.
categorical_cols = ['cut', 'color', 'clarity']
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Keep `le` bound to the most recently fitted encoder (clarity) so any code
# that still refers to `le` sees the same object state as before.
le = encoders['clarity']

# Predictor columns fed to the model
features = ['carat','cut','color','clarity','x','y','z']

# Declare X and Y variables
x = df[features]
y = df['price']

# Train set and Test set: hold out 20% of the rows, with a fixed random_state
# so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)

In [15]:
# Configure the auto-sklearn search: a budget of 180 for the whole task and
# a limit of 30 for any single run (both passed straight to the constructor).
search_settings = {
    'time_left_for_this_task': 180,
    'per_run_time_limit': 30,
}
automl = auto_r.AutoSklearnRegressor(**search_settings)

# Run the search on the training split only; the test split stays untouched.
automl.fit(x_train, y_train, dataset_name='clean_diamonds.csv')

  self.metafeatures = self.metafeatures.append(metafeatures)
  self.algorithm_runs[metric].append(runs)


In [16]:
# Summary of the finished search (metric, best validation score, and how many
# runs succeeded, crashed or hit a limit).
run_summary = automl.sprint_statistics()
print(run_summary)

auto-sklearn results:
  Dataset name: clean_diamonds.csv
  Metric: r2
  Best validation score: 0.979692
  Number of target algorithm runs: 16
  Number of successful target algorithm runs: 8
  Number of crashed target algorithm runs: 2
  Number of target algorithms that exceeded the time limit: 2
  Number of target algorithms that exceeded the memory limit: 4



In [17]:
# Executed models
# The leaderboard is left as the cell's last expression so the notebook
# renders it with its rich table display instead of the plain-text form that
# print() produces.
automl.leaderboard()

          rank  ensemble_weight                 type      cost   duration
model_id                                                                 
7            1             0.34    gradient_boosting  0.020308   4.394661
15           2             0.18    gradient_boosting  0.021032   5.075893
9            3             0.28    gradient_boosting  0.021092  25.020621
14           4             0.10             adaboost  0.023788   7.349903
8            5             0.10  k_nearest_neighbors  0.029353   0.705055


In [18]:
# Hyperparams obtained in the models
# (`pprint` is imported in the notebook's top import cell.)
pprint(automl.show_models(), indent=0)

{7: {'cost': 0.020308099948580405,
   'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7fc8d50d3760>,
   'ensemble_weight': 0.34,
   'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7fc889acdcc0>,
   'model_id': 7,
   'rank': 1,
   'regressor': <autosklearn.pipeline.components.regression.RegressorChoice object at 0x7fc889acfd30>,
   'sklearn_regressor': HistGradientBoostingRegressor(l2_regularization=2.208787572338781e-05,
                              learning_rate=0.036087332404571744,
                              loss='least_squares', max_iter=512,
                              max_leaf_nodes=64, min_samples_leaf=3,
                              n_iter_no_change=18, random_state=1,
                              validation_fraction=None, warm_start=True)},
8: {'cost': 0.029353492711143847,
   'data_preprocessor': <autosklearn.pipeline.components.data_preproc