<a href="https://colab.research.google.com/github/datametal/gretel-synthetic-data/blob/main/bike_dataset_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --pre pycaret

In [None]:
# Show the installed PyCaret version so the rendered notebook records which
# release produced the results that follow.
import pycaret
pycaret.__version__

'3.0.0.rc8'

In [None]:
import pandas as pd

# Source CSV for the original (non-synthetic) bike-buyer customer table.
BIKE_BUYING_CSV_URL = 'https://raw.githubusercontent.com/gretelai/gdpr-helpers/main/data/adventure-works-bike-buying.csv'

# Download and parse the CSV into the frame used by every cell in the first half.
data = pd.read_csv(BIKE_BUYING_CSV_URL)

In [None]:
# (rows, columns) of the original customer table.
data.shape

(16519, 24)

In [None]:
# Column dtypes as inferred by read_csv; BirthDate is still an object column
# here and is converted to datetime in a later cell.
data.dtypes

CustomerID               int64
Title                   object
FirstName               object
MiddleName              object
LastName                object
Suffix                  object
AddressLine1            object
AddressLine2            object
City                    object
StateProvinceName       object
CountryRegionName       object
PostalCode              object
PhoneNumber             object
BirthDate               object
Education               object
Occupation              object
Gender                  object
MaritalStatus           object
HomeOwnerFlag            int64
NumberCarsOwned          int64
NumberChildrenAtHome     int64
TotalChildren            int64
YearlyIncome             int64
BikeBuyer                int64
dtype: object

In [None]:
# Preview the first five rows.
# NOTE(review): the rendered preview includes name and address columns — confirm
# that is acceptable before sharing the notebook with its outputs.
data.head()

Unnamed: 0,CustomerID,Title,FirstName,MiddleName,LastName,Suffix,AddressLine1,AddressLine2,City,StateProvinceName,...,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,BikeBuyer
0,11000,,Jon,V,Yang,,3761 N. 14th St,,Rockhampton,Queensland,...,Bachelors,Professional,M,M,1,0,0,2,137947,0
1,11001,,Eugene,L,Huang,,2243 W St.,,Seaford,Victoria,...,Bachelors,Professional,M,S,0,1,3,3,101141,1
2,11002,,Ruben,,Torres,,5844 Linden Land,,Hobart,Tasmania,...,Bachelors,Professional,M,M,1,1,3,3,91945,0
3,11003,,Christy,,Zhu,,1825 Village Pl.,,North Ryde,New South Wales,...,Bachelors,Professional,F,S,0,1,0,0,86688,0
4,11004,,Elizabeth,,Johnson,,7553 Harness Circle,,Wollongong,New South Wales,...,Bachelors,Professional,F,S,1,4,5,5,92771,1


In [None]:
# Missing-value count per column (Title, MiddleName, Suffix and AddressLine2
# are the only columns with gaps in the saved output).
data.isna().sum()

CustomerID                  0
Title                   16431
FirstName                   0
MiddleName               6985
LastName                    0
Suffix                  16517
AddressLine1                0
AddressLine2            16243
City                        0
StateProvinceName           0
CountryRegionName           0
PostalCode                  0
PhoneNumber                 0
BirthDate                   0
Education                   0
Occupation                  0
Gender                      0
MaritalStatus               0
HomeOwnerFlag               0
NumberCarsOwned             0
NumberChildrenAtHome        0
TotalChildren               0
YearlyIncome                0
BikeBuyer                   0
dtype: int64

In [None]:
# Full column list; these names are referenced by the feature lists passed to setup().
data.columns

Index(['CustomerID', 'Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix',
       'AddressLine1', 'AddressLine2', 'City', 'StateProvinceName',
       'CountryRegionName', 'PostalCode', 'PhoneNumber', 'BirthDate',
       'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag',
       'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren',
       'YearlyIncome', 'BikeBuyer'],
      dtype='object')

In [None]:
# Convert BirthDate from an object column to datetime so it can be declared
# under `date_features` in the setup() call.
data['BirthDate'] = pd.to_datetime(data['BirthDate'])

In [None]:
# Import only the PyCaret entry points this notebook calls rather than `import *`,
# so each name's origin is explicit and nothing else in the namespace is shadowed.
# Extend this list if further pycaret.classification functions are needed.
from pycaret.classification import compare_models, predict_model, setup, tune_model

# Configure the classification experiment on the original data.
s = setup(
    data,
    target='BikeBuyer',
    session_id=123,  # fixed session id (echoed in the setup summary) for repeatable runs
    # Identifier and personal name/address/phone columns are excluded from modelling.
    ignore_features=['CustomerID', 'Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix', 'AddressLine1', 'AddressLine2', 'PhoneNumber'],
    # Integer columns that should be treated as numeric rather than inferred.
    numeric_features=['TotalChildren', 'NumberChildrenAtHome', 'NumberCarsOwned', 'HomeOwnerFlag', 'YearlyIncome'],
    categorical_features=['City', 'StateProvinceName', 'CountryRegionName', 'Education', 'Occupation', 'Gender', 'MaritalStatus', 'PostalCode'],
    date_features=['BirthDate'],
    # NOTE(review): presumably caps how many distinct levels a categorical may have
    # and still be one-hot encoded — confirm against the PyCaret setup() docs.
    max_encoding_ohe=20,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,BikeBuyer
2,Target type,Binary
3,Original data shape,"(16519, 24)"
4,Transformed data shape,"(16519, 30)"
5,Transformed train set shape,"(11563, 30)"
6,Transformed test set shape,"(4956, 30)"
7,Ignore features,9
8,Ordinal features,2
9,Numeric features,5


In [None]:
%%time
# Train and score the candidate classifiers shown in the results table and keep
# the result as `best` (Linear Discriminant Analysis in the saved output).
# The cell magic above must remain the first line of the cell.
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.7838,0.8341,0.5378,0.7412,0.6231,0.4769,0.4891,0.799
ridge,Ridge Classifier,0.7828,0.0,0.5272,0.7454,0.6173,0.472,0.4859,0.67
et,Extra Trees Classifier,0.7788,0.828,0.5581,0.7146,0.6263,0.4726,0.4802,1.761
nb,Naive Bayes,0.7671,0.8075,0.4549,0.7451,0.5644,0.4182,0.442,0.768
rf,Random Forest Classifier,0.7575,0.8102,0.3763,0.781,0.5076,0.372,0.4157,1.872
qda,Quadratic Discriminant Analysis,0.757,0.8085,0.6127,0.6415,0.6262,0.4465,0.4472,0.773
ada,Ada Boost Classifier,0.7433,0.7801,0.3763,0.7209,0.4939,0.3437,0.3768,1.299
lr,Logistic Regression,0.6995,0.6743,0.2379,0.6214,0.342,0.197,0.2349,3.697
dt,Decision Tree Classifier,0.699,0.6021,0.3127,0.5886,0.4057,0.2306,0.2517,0.805
gbc,Gradient Boosting Classifier,0.6721,0.7492,0.1862,0.5543,0.2728,0.1207,0.156,2.308


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

CPU times: user 7.83 s, sys: 514 ms, total: 8.35 s
Wall time: 3min 13s


In [None]:
# Score `best` and show its metric row; the trailing semicolon suppresses the
# display of the returned object.
# NOTE(review): with no `data` argument this presumably scores the hold-out split
# created by setup() — confirm against the PyCaret docs.
predict_model(best);

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.7839,0.8378,0.551,0.7328,0.629,0.4809,0.4906


# Anonymized (Synthetic) Data

In [None]:
import pandas as pd
# NOTE(review): relative path — this CSV must already exist in the working directory
# (e.g. uploaded to the Colab session). No visible cell downloads or generates it,
# so a fresh "Run All" will fail here unless the file is provided first.
data_anonymous = pd.read_csv('adventure-works-bike-buying-synthetic_data.csv')

In [None]:
# (rows, columns) of the synthetic table; the saved output matches the original's shape.
data_anonymous.shape

(16519, 24)

In [None]:
# Preview the first five synthetic rows for a visual comparison with the original preview.
data_anonymous.head()

Unnamed: 0,CustomerID,Title,FirstName,MiddleName,LastName,Suffix,AddressLine1,AddressLine2,City,StateProvinceName,...,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,BikeBuyer
0,0178b12f,,Cathy,Ashley,Gordon,,0521 Tammy Ramp Suite 678,8935 Burke Streets,Bobigny,55caead9,...,High School,Manual,M,M,0,2,0,2,14755,0
1,d7decf4a,,Barbara,,Watson,,3838 Catherine Field Suite 456,8935 Burke Streets,San Gabriel,2e735720,...,Partial College,Professional,M,M,1,2,4,2,91955,0
2,49e6dabd,,Sarah,Elizabeth,Hanson,,1489 Grimes Lodge Apt. 391,8935 Burke Streets,National City,f6ecfa6e,...,Bachelors,Manual,F,S,0,2,0,0,13273,0
3,2776fb67,,Russell,Caitlyn,Hanson,,068 Knox Squares,8935 Burke Streets,Gold Coast,b48517a3,...,Bachelors,Manual,M,M,0,1,3,2,134081,0
4,1d3fb317,,Cory,,Whitaker,,17475 Laura Court,8935 Burke Streets,Coffs Harbour,22cf0446,...,Graduate Degree,Management,F,M,1,0,0,3,121666,0


In [None]:
# Column list of the synthetic table; it must contain every column named in the setup() feature lists.
data_anonymous.columns

Index(['CustomerID', 'Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix',
       'AddressLine1', 'AddressLine2', 'City', 'StateProvinceName',
       'CountryRegionName', 'PostalCode', 'PhoneNumber', 'BirthDate',
       'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag',
       'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren',
       'YearlyIncome', 'BikeBuyer'],
      dtype='object')

In [None]:
# Column dtypes of the synthetic table. In the saved output CustomerID is an object
# column here (it is int64 in the original); it is ignored by setup() either way.
data_anonymous.dtypes

CustomerID              object
Title                   object
FirstName               object
MiddleName              object
LastName                object
Suffix                  object
AddressLine1            object
AddressLine2            object
City                    object
StateProvinceName       object
CountryRegionName       object
PostalCode              object
PhoneNumber             object
BirthDate               object
Education               object
Occupation              object
Gender                  object
MaritalStatus           object
HomeOwnerFlag            int64
NumberCarsOwned          int64
NumberChildrenAtHome     int64
TotalChildren            int64
YearlyIncome             int64
BikeBuyer                int64
dtype: object

In [None]:
# Missing-value count per column of the synthetic table. In the saved output
# AddressLine2 has no gaps here, unlike in the original data.
data_anonymous.isna().sum()

CustomerID                  0
Title                   13575
FirstName                   0
MiddleName               5773
LastName                    0
Suffix                  14995
AddressLine1                0
AddressLine2                0
City                        0
StateProvinceName           0
CountryRegionName           0
PostalCode                  0
PhoneNumber                 0
BirthDate                   0
Education                   0
Occupation                  0
Gender                      0
MaritalStatus               0
HomeOwnerFlag               0
NumberCarsOwned             0
NumberChildrenAtHome        0
TotalChildren               0
YearlyIncome                0
BikeBuyer                   0
dtype: int64

In [None]:
# Convert BirthDate from an object column to datetime so it can be declared
# under `date_features` in the setup() call for the synthetic data.
data_anonymous['BirthDate'] = pd.to_datetime(data_anonymous['BirthDate'])

In [None]:
# Import only the PyCaret entry points this notebook calls rather than `import *`,
# so each name's origin is explicit and nothing else in the namespace is shadowed.
# Extend this list if further pycaret.classification functions are needed.
from pycaret.classification import compare_models, predict_model, setup, tune_model

# Configure the same experiment on the synthetic data. Every argument except the
# frame matches the run on the original data so the two result tables are comparable.
# Note that `s` (and later `best`) are rebound to the synthetic-data experiment from here on.
s = setup(
    data_anonymous,
    target='BikeBuyer',
    session_id=123,  # fixed session id (echoed in the setup summary) for repeatable runs
    # Identifier and personal name/address/phone columns are excluded from modelling.
    ignore_features=['CustomerID', 'Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix', 'AddressLine1', 'AddressLine2', 'PhoneNumber'],
    # Integer columns that should be treated as numeric rather than inferred.
    numeric_features=['TotalChildren', 'NumberChildrenAtHome', 'NumberCarsOwned', 'HomeOwnerFlag', 'YearlyIncome'],
    categorical_features=['City', 'StateProvinceName', 'CountryRegionName', 'Education', 'Occupation', 'Gender', 'MaritalStatus', 'PostalCode'],
    date_features=['BirthDate'],
    # NOTE(review): presumably caps how many distinct levels a categorical may have
    # and still be one-hot encoded — confirm against the PyCaret setup() docs.
    max_encoding_ohe=20,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,BikeBuyer
2,Target type,Binary
3,Original data shape,"(16519, 24)"
4,Transformed data shape,"(16519, 30)"
5,Transformed train set shape,"(11563, 30)"
6,Transformed test set shape,"(4956, 30)"
7,Ignore features,9
8,Ordinal features,2
9,Numeric features,5


In [None]:
%%time
# Train and score the candidate classifiers on the synthetic data. This rebinds
# `best`, so the model selected on the original data is no longer reachable by that name.
# The cell magic above must remain the first line of the cell.
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.7702,0.7717,0.3546,0.6242,0.4518,0.3202,0.3409,0.8
ridge,Ridge Classifier,0.7696,0.0,0.3096,0.6446,0.4178,0.2957,0.3268,0.683
et,Extra Trees Classifier,0.7591,0.7451,0.3161,0.5928,0.412,0.2778,0.2997,1.868
nb,Naive Bayes,0.7537,0.7136,0.3057,0.5748,0.3982,0.2614,0.2824,0.781
qda,Quadratic Discriminant Analysis,0.7528,0.7411,0.4296,0.5485,0.4815,0.3224,0.3268,0.761
rf,Random Forest Classifier,0.741,0.7284,0.2317,0.5846,0.3222,0.1958,0.2345,1.938
lr,Logistic Regression,0.7329,0.6171,0.0113,0.5361,0.0221,0.0108,0.044,2.583
dummy,Dummy Classifier,0.7327,0.5,0.0,0.0,0.0,0.0,0.0,0.762
ada,Ada Boost Classifier,0.7299,0.6683,0.1993,0.5403,0.278,0.1529,0.1894,1.217
gbc,Gradient Boosting Classifier,0.6986,0.6574,0.0835,0.3296,0.1113,0.0095,0.0193,2.268


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

CPU times: user 8.16 s, sys: 448 ms, total: 8.6 s
Wall time: 3min 7s


In [None]:
# Score the synthetic-data `best` model and show its metric row; the trailing
# semicolon suppresses the display of the returned object.
# NOTE(review): with no `data` argument this presumably scores the hold-out split
# created by setup() — confirm against the PyCaret docs.
predict_model(best);

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.7746,0.7816,0.3728,0.6333,0.4694,0.3382,0.3574


In [None]:
# NOTE(review): looks like a leftover progress marker with no analytical purpose —
# consider deleting this cell.
print('done')

done


In [None]:
# Tune the hyperparameters of `best` (at this point the model selected on the
# synthetic data) for AUC. The result is bound to a name so it can be reused by
# later cells; previously it was only displayed and then lost.
tuned_best = tune_model(best, optimize = 'AUC', n_iter = 50)
# Bare last expression keeps the estimator repr as the cell's output, as before.
tuned_best

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7701,0.7706,0.3333,0.6319,0.4364,0.309,0.3339
1,0.7701,0.7666,0.3204,0.6387,0.4267,0.3022,0.3304
2,0.7701,0.7742,0.3548,0.625,0.4527,0.3209,0.3415
3,0.769,0.775,0.3754,0.6105,0.4649,0.3282,0.344
4,0.7734,0.7839,0.356,0.6358,0.4564,0.3274,0.3494
5,0.7716,0.7573,0.3398,0.6364,0.443,0.3157,0.3403
6,0.7647,0.7656,0.3657,0.5979,0.4538,0.3148,0.3302
7,0.7785,0.7826,0.3754,0.648,0.4754,0.3474,0.3683
8,0.7708,0.7809,0.3786,0.6158,0.4689,0.3332,0.3492
9,0.7621,0.7611,0.343,0.5955,0.4353,0.2982,0.3164


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 36 candidates, totalling 360 fits


LinearDiscriminantAnalysis(covariance_estimator=None, n_components=None,
                           priors=None, shrinkage='auto', solver='lsqr',
                           store_covariance=False, tol=0.0001)