In [13]:
import pandas as pd
from sklearn import preprocessing
import math
from sklearn.metrics import mean_squared_error
from autogluon.tabular import TabularPredictor
import tpot
from tpot import TPOTRegressor

In [14]:
df_train = pd.read_csv('diabetes_data.csv')

X = df_train.drop(['Diabetes'], axis = 1)
y = df_train['Diabetes']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2)
X_train.head(5)

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP
34718,8.0,0.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
23902,11.0,0.0,1.0,1.0,30.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0
24430,6.0,0.0,0.0,1.0,34.0,0.0,1.0,0.0,1.0,1.0,0.0,3.0,0.0,4.0,0.0,0.0,1.0
42615,11.0,0.0,1.0,1.0,31.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,20.0,0.0,0.0,0.0,1.0
25820,2.0,1.0,0.0,1.0,23.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0


In [15]:
GENERATIONS = 3
POP_SIZE = 100
CV = 3

tpot = TPOTRegressor(
    generations=GENERATIONS,
    population_size=POP_SIZE,
    scoring='neg_mean_squared_error',
    n_jobs= -1,
    cv=CV,
    verbosity=2,
)

tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

Optimization Progress:   0%|          | 0/400 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -0.16871760143211947

Generation 2 - Current best internal CV score: -0.16871760143211947

Generation 3 - Current best internal CV score: -0.16845676801648235

Best pipeline: ElasticNetCV(RandomForestRegressor(input_matrix, bootstrap=True, max_features=0.2, min_samples_leaf=16, min_samples_split=13, n_estimators=100), l1_ratio=0.25, tol=0.1)
-0.16779928282118886




In [16]:
print('\nBest pipeline steps:', end='\n')
for idx, (name, transform) in enumerate(tpot.fitted_pipeline_.steps, start=1):
    print(f'{idx}. {transform}')


Best pipeline steps:
1. StackingEstimator(estimator=RandomForestRegressor(max_features=0.2,
                                                  min_samples_leaf=16,
                                                  min_samples_split=13,
                                                  random_state=42))
2. ElasticNetCV(l1_ratio=0.25, random_state=42, tol=0.1)


In [17]:
y_pred = tpot.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
print('RMSE: %f' % rmse)



RMSE: 0.409633


In [22]:
label = 'Diabetes' 
eval_metric = 'root_mean_squared_error' 


In [20]:
predictor = TabularPredictor(label=label,problem_type = 'regression', eval_metric=eval_metric, verbosity=2).fit(df_train, presets='best_quality',auto_stack = True)
model_to_use = predictor.get_model_best()

No path specified. Models will be saved in: "AutogluonModels\ag-20240918_160231"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.11.8
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          16
Memory Avail:       41.28 GB / 63.42 GB (65.1%)
Disk Space Avail:   102.08 GB / 930.43 GB (11.0%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfit

In [21]:
predictor.leaderboard(df_train, silent=True).head(20)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForest_r195_BAG_L1,-0.198272,-0.437143,root_mean_squared_error,1.007042,2.362923,4.445999,1.007042,2.362923,4.445999,1,True,24
1,RandomForestMSE_BAG_L1,-0.198711,-0.438641,root_mean_squared_error,0.920044,2.328519,5.997137,0.920044,2.328519,5.997137,1,True,5
2,ExtraTrees_r42_BAG_L1,-0.200784,-0.435892,root_mean_squared_error,1.247041,2.489548,3.882206,1.247041,2.489548,3.882206,1,True,20
3,ExtraTreesMSE_BAG_L1,-0.211379,-0.43315,root_mean_squared_error,1.034298,2.203579,3.727095,1.034298,2.203579,3.727095,1,True,7
4,ExtraTrees_r172_BAG_L1,-0.343639,-0.416214,root_mean_squared_error,0.8404,2.070194,3.857585,0.8404,2.070194,3.857585,1,True,34
5,XGBoost_r33_BAG_L1,-0.375759,-0.410545,root_mean_squared_error,3.276903,2.578988,8.610646,3.276903,2.578988,8.610646,1,True,19
6,LightGBM_r161_BAG_L1,-0.388519,-0.409641,root_mean_squared_error,4.296053,2.702515,10.04057,4.296053,2.702515,10.04057,1,True,38
7,LightGBMLarge_BAG_L1,-0.394257,-0.409507,root_mean_squared_error,1.120996,0.502991,3.437095,1.120996,0.502991,3.437095,1,True,11
8,LightGBM_r188_BAG_L1,-0.395497,-0.409414,root_mean_squared_error,1.417999,0.868523,4.452109,1.417999,0.868523,4.452109,1,True,25
9,XGBoost_r194_BAG_L1,-0.398145,-0.409515,root_mean_squared_error,0.502996,0.127994,2.23308,0.502996,0.127994,2.23308,1,True,33


#### Got a RMSE score of **2.088** using TPOT (Gen = 3, Pop Size = 100, CV = 5)
#### Increase the Generations to 10 or 20 maybe and CV to 10 for better score.