# Import modules

In [1]:
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from sklearn import svm
from scipy.stats import loguniform
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.linear_model import ElasticNetCV, LassoCV, LinearRegression, RidgeCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Import the pickle

In [2]:
df_arbres = pd.read_pickle('./data/arbres.pkl')

In [3]:
df_arbres.columns

Index(['sous_categorie', 'code_parent', 'adr_secteur', 'genre_bota', 'espece',
       'stadededeveloppement', 'anneedeplantation', 'collectivite',
       'hauteurarbre', 'portarbre', 'latitude', 'longitude'],
      dtype='object')

In [4]:
# col = ['sous_categorie', 'genre_bota', 'stadededeveloppement', 'anneedeplantation', 'latitude', 'longitude', 'espece', 'hauteurarbre']
# df_arbres = df_arbres[col]

In [5]:
df_arbres = df_arbres.drop(['code_parent', 'collectivite', 'portarbre', 'adr_secteur'], axis=1)
#df_arbres = df_arbres.drop(['latitude', 'longitude'], axis=1)


In [6]:
df_arbres.head()

Unnamed: 0,sous_categorie,genre_bota,espece,stadededeveloppement,anneedeplantation,hauteurarbre,latitude,longitude
0,ESP151,Prunus,serrulata,Arbre jeune,2015.0,,45.167098,5.740132
1,ESP151,Prunus,serrulata,Arbre jeune,2015.0,,45.167107,5.7402
2,ESP151,Prunus,serrulata,Arbre jeune,2015.0,,45.167115,5.740266
3,ESP151,Prunus,serrulata,Arbre jeune,2015.0,,45.167127,5.740349
4,ESP151,Prunus,serrulata,Arbre jeune,2015.0,,45.167144,5.740471


# Split the data frame into testing and training sets

In [7]:
pred_col = 'anneedeplantation'

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df_arbres.drop(pred_col, axis=1),
                                                    df_arbres[pred_col], 
                                                    test_size=0.2, 
                                                    random_state=42,
                                                    stratify=df_arbres[pred_col])

# Explore the training data further

In [9]:
# create copy of the training data set to explore
df_explore = X_train.copy()

In [10]:
df_explore.isna().sum()

sous_categorie             0
genre_bota                32
espece                   986
stadededeveloppement    2006
hauteurarbre            4652
latitude                   0
longitude                  0
dtype: int64

In [11]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

col_num = df_explore.select_dtypes(include=numerics).columns
col_cat = df_explore.select_dtypes(exclude=numerics).columns

In [12]:
col_cat

Index(['sous_categorie', 'genre_bota', 'espece', 'stadededeveloppement',
       'hauteurarbre'],
      dtype='object')

In [13]:
col_num

Index(['latitude', 'longitude'], dtype='object')

## Fill empty values

In [14]:
imp = SimpleImputer(strategy='most_frequent')
df_explore[col_cat] = imp.fit_transform(df_explore[col_cat])
X_test[col_cat] = imp.transform(X_test[col_cat])
df_explore.isna().sum()

sous_categorie          0
genre_bota              0
espece                  0
stadededeveloppement    0
hauteurarbre            0
latitude                0
longitude               0
dtype: int64

## Standardize the quantative data

In [15]:
# create a StandardScaler object
scaler = StandardScaler().set_output()

# fit and transform the data
df_explore[col_num] = scaler.fit_transform(df_explore[col_num])
X_test[col_num] = scaler.transform(X_test[col_num])

## Encode the qualitative data

In [16]:
df_explore[col_cat].nunique().sort_values(ascending=False)

espece                  276
genre_bota              115
sous_categorie            4
stadededeveloppement      3
hauteurarbre              3
dtype: int64

In [17]:
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
test = enc.fit(df_explore[col_cat])

# Create an encoder 
encoder= OneHotEncoder(sparse_output=False, handle_unknown='ignore') 
 
# Train encoder using multiple columns
encoder.fit(df_explore[col_cat]) 
 
# Get encoded column names 
encoded_columns=encoder.get_feature_names_out() 
 
# Transform data using the trained encoder 
encoded_data = encoder.transform(df_explore[col_cat])
encoded_data2 = encoder.transform(X_test[col_cat])
 
# Create a dataframe using the encoded data 
encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns)
encoded_df2 = pd.DataFrame(encoded_data2, columns=encoded_columns) 

df_explore = df_explore.reset_index().drop('index', axis=1)
df_explore = df_explore.drop(col_cat, axis=1)
X_test = X_test.reset_index().drop('index', axis=1)
X_test = X_test.drop(col_cat, axis=1)

df_explore = df_explore.join(encoded_df)
X_test = X_test.join(encoded_df2)

In [18]:
# oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=280)
# df_explore[col_cat] = oe.fit_transform(df_explore[col_cat])
# X_train_pip = oe.transform(X_train)
# X_test_pip = oe.transform(X_test)

In [19]:
X_train = df_explore.copy()

In [20]:
X_train

Unnamed: 0,latitude,longitude,sous_categorie_ESP065,sous_categorie_ESP151,sous_categorie_ESP174,sous_categorie_ESP187,genre_bota_Abies,genre_bota_Acacia,genre_bota_Acer,genre_bota_Aesculus,...,espece_wallichiana,espece_wislizeni,espece_yedoensis,espece_zoeschense,stadededeveloppement_Arbre adulte,stadededeveloppement_Arbre jeune,stadededeveloppement_Arbre vieillissant,hauteurarbre_Moins de 10 m,hauteurarbre_Plus de 20 m,hauteurarbre_de 10 m à 20 m
0,-0.088278,-0.422605,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,-0.872776,0.340067,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1.171091,-0.458612,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,-0.722987,1.046268,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,-0.781879,0.471514,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23615,-0.262789,-0.078327,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
23616,-1.069951,0.665322,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
23617,-1.152973,-0.098248,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
23618,-0.919970,-1.585162,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


# Linear regression

In [21]:
regressor = LinearRegression().fit(X_train, y_train)

In [22]:
y_pred_train = regressor.predict(X_train)
y_pred_test = regressor.predict(X_test)

print(f"Mean squared error: {mean_squared_error(y_test, y_pred_test):.2f}")
print(f"Coefficient of determination: {r2_score(y_test, y_pred_test):.2f}")

Mean squared error: 164.35
Coefficient of determination: 0.50


In [23]:
print('-'*35)
print('Prediction for the training data')
print('MSE:', mean_squared_error(y_train, y_pred_train))
print('RMSE:',np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('-'*35)
print('Prediction for the test data')
print('MSE:', mean_squared_error(y_test, y_pred_test))
print('RMSE:',np.sqrt(mean_squared_error(y_test, y_pred_test)))
print('-'*35)

-----------------------------------
Prediction for the training data
MSE: 154.31829990304612
RMSE: 12.422491694625766
-----------------------------------
Prediction for the test data
MSE: 164.353143633788
RMSE: 12.82002900284504
-----------------------------------


## With cross_validation

In [24]:
lin_reg = LinearRegression()

cv_scores = cross_val_score(lin_reg, X_train, y_train, cv=5, scoring='neg_mean_squared_error') 

print('-'*35)
print('Prediction for the training data')
print('mean MSE:',np.mean(-cv_scores))
print('std MSE:',np.std(-cv_scores))
print('mean RMSE:',np.sqrt(np.mean(-cv_scores)))

lin_reg.fit(X_train, y_train)

# predict y values for testing data set
y_pred_test = lin_reg.predict(X_test)

# evaluate MSE
print('-'*35)
print('Prediction for the test data')
mse_test = mean_squared_error(y_test, y_pred_test)
print('test MSE: {0}'.format(mse_test))
print('test RMSE: {0}'.format(np.sqrt(mse_test)))
print('-'*35)

-----------------------------------
Prediction for the training data
mean MSE: 161.57361154422205
std MSE: 2.4815273631180323
mean RMSE: 12.711160904662565
-----------------------------------
Prediction for the test data
test MSE: 164.353143633788
test RMSE: 12.82002900284504
-----------------------------------


# Lasso

In [25]:
model_l = Lasso()

model_l.fit(X_train, y_train)
y_pred_test = model_l.predict(X_test)
y_pred_train = model_l.predict(X_train)

print('-'*35)
print('Prediction for the training data')
print('MSE:', mean_squared_error(y_train, y_pred_train))
print('RMSE:',np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('-'*35)
print('Prediction for the test data')
print('MSE:', mean_squared_error(y_test, y_pred_test))
print('RMSE:',np.sqrt(mean_squared_error(y_test, y_pred_test)))
print('-'*35)

-----------------------------------
Prediction for the training data
MSE: 250.2658497981459
RMSE: 15.819792975830811
-----------------------------------
Prediction for the test data
MSE: 249.7994837260371
RMSE: 15.805046147545317
-----------------------------------


In [26]:
model_lcv = LassoCV(alphas=100)

model_lcv.fit(X_train, y_train)
y_pred_test = model_lcv.predict(X_test)
y_pred_train = model_lcv.predict(X_train)

print('-'*35)
print('Prediction for the training data')
print('MSE:', mean_squared_error(y_train, y_pred_train))
print('RMSE:',np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('-'*35)
print('Prediction for the test data')
print('MSE:', mean_squared_error(y_test, y_pred_test))
print('RMSE:',np.sqrt(mean_squared_error(y_test, y_pred_test)))
print('-'*35)

-----------------------------------
Prediction for the training data
MSE: 159.62786903202374
RMSE: 12.63439230956613
-----------------------------------
Prediction for the test data
MSE: 163.42525219878544
RMSE: 12.783788648080249
-----------------------------------


# Ridge

In [27]:
model_r = Ridge(alpha=0.1)

model_r.fit(X_train, y_train)
y_pred_test = model_r.predict(X_test)
y_pred_train = model_r.predict(X_train)

print('-'*35)
print('Prediction for the training data')
print('MSE:', mean_squared_error(y_train, y_pred_train))
print('RMSE:',np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('-'*35)
print('Prediction for the test data')
print('MSE:', mean_squared_error(y_test, y_pred_test))
print('RMSE:',np.sqrt(mean_squared_error(y_test, y_pred_test)))
print('-'*35)

-----------------------------------
Prediction for the training data
MSE: 154.3748418335047
RMSE: 12.424767274822683
-----------------------------------
Prediction for the test data
MSE: 162.17504740787226
RMSE: 12.734796716393719
-----------------------------------


# Elastic net

In [28]:
model_en = ElasticNet(alpha=0.1)

model_en.fit(X_train, y_train)
y_pred_test = model_en.predict(X_test)
y_pred_train = model_en.predict(X_train)

print('-'*35)
print('Prediction for the training data')
print('MSE:', mean_squared_error(y_train, y_pred_train))
print('RMSE:',np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('-'*35)
print('Prediction for the test data')
print('MSE:', mean_squared_error(y_test, y_pred_test))
print('RMSE:',np.sqrt(mean_squared_error(y_test, y_pred_test)))
print('-'*35)

-----------------------------------
Prediction for the training data
MSE: 210.99720275324552
RMSE: 14.525742760810736
-----------------------------------
Prediction for the test data
MSE: 211.652811925058
RMSE: 14.548292405813749
-----------------------------------


# Decision tree

In [29]:
model_dt = DecisionTreeRegressor(min_samples_split = 100, splitter='best', random_state=42)

model_dt.fit(X_train, y_train)
y_pred_test = model_dt.predict(X_test)
y_pred_train = model_dt.predict(X_train)

print('-'*35)
print('Prediction for the training data')
print('MSE:', mean_squared_error(y_train, y_pred_train))
print('RMSE:',np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('-'*35)
print('Prediction for the test data')
print('MSE:', mean_squared_error(y_test, y_pred_test))
print('RMSE:',np.sqrt(mean_squared_error(y_test, y_pred_test)))
print('-'*35)

-----------------------------------
Prediction for the training data
MSE: 76.96121570280717
RMSE: 8.772754168606753
-----------------------------------
Prediction for the test data
MSE: 107.41926951964172
RMSE: 10.364326775996679
-----------------------------------


## With cross_validation

In [30]:
model_dt = DecisionTreeRegressor(min_samples_split = 100, splitter='best', random_state=42)

cv_scores = cross_val_score(model_dt, X_train, y_train, cv=5, scoring='neg_mean_squared_error') 

print('-'*35)
print('Prediction for the training data')
print('mean MSE:',np.mean(-cv_scores))
print('std MSE:',np.std(-cv_scores))
print('mean RMSE:',np.sqrt(np.mean(-cv_scores)))

model_dt.fit(X_train, y_train)

# predict y values for testing data set
y_pred_test = model_dt.predict(X_test)

# evaluate MSE
print('-'*35)
print('Prediction for the test data')
mse_test = mean_squared_error(y_test, y_pred_test)
print('test MSE: {0}'.format(mse_test))
print('test RMSE: {0}'.format(np.sqrt(mse_test)))
print('-'*35)

-----------------------------------
Prediction for the training data
mean MSE: 118.21063687034105
std MSE: 3.7648175679096956
mean RMSE: 10.872471516188996
-----------------------------------
Prediction for the test data
test MSE: 107.41926951964172
test RMSE: 10.364326775996679
-----------------------------------


# Random Forest

In [31]:
model_rf = RandomForestRegressor(n_estimators=10, bootstrap=False, min_samples_split = 100, random_state=42)

model_rf.fit(X_train, y_train)
y_pred_test = model_rf.predict(X_test)
y_pred_train = model_rf.predict(X_train)

print('-'*35)
print('Prediction for the training data')
print('MSE:', mean_squared_error(y_train, y_pred_train))
print('RMSE:',np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('-'*35)
print('Prediction for the test data')
print('MSE:', mean_squared_error(y_test, y_pred_test))
print('RMSE:',np.sqrt(mean_squared_error(y_test, y_pred_test)))
print('-'*35)


-----------------------------------
Prediction for the training data
MSE: 76.95971024287866
RMSE: 8.772668365034589
-----------------------------------
Prediction for the test data
MSE: 106.76413382957656
RMSE: 10.332673121200369
-----------------------------------


## With cross_validation

In [32]:
model_rf = RandomForestRegressor(n_estimators=10, bootstrap=False, min_samples_split = 100, random_state=42)

cv_scores = cross_val_score(model_rf, X_train, y_train, cv=5, scoring='neg_mean_squared_error') 

print('-'*35)
print('Prediction for the training data')
print('mean MSE:',np.mean(-cv_scores))
print('std MSE:',np.std(-cv_scores))
print('mean RMSE:',np.sqrt(np.mean(-cv_scores)))

model_rf.fit(X_train, y_train)

# predict y values for testing data set
y_pred_test = model_rf.predict(X_test)

# evaluate MSE
print('-'*35)
print('Prediction for the test data')
mse_test = mean_squared_error(y_test, y_pred_test)
print('test MSE: {0}'.format(mse_test))
print('test RMSE: {0}'.format(np.sqrt(mse_test)))
print('-'*35)

-----------------------------------
Prediction for the training data
mean MSE: 117.53076159290109
std MSE: 3.9847980204559685
mean RMSE: 10.841160527955532
-----------------------------------
Prediction for the test data
test MSE: 106.76413382957656
test RMSE: 10.332673121200369
-----------------------------------


# Support Vector Machine

In [33]:
# model_svm = svm.SVR()

# model_svm.fit(X_train, y_train)
# y_pred_test = model_svm.predict(X_test)
# y_pred_train = model_svm.predict(X_train)

# print('-'*35)
# print('Prediction for the training data')
# print('MSE:', mean_squared_error(y_train, y_pred_train))
# print('RMSE:',np.sqrt(mean_squared_error(y_train, y_pred_train)))
# print('-'*35)
# print('Prediction for the test data')
# print('MSE:', mean_squared_error(y_test, y_pred_test))
# print('RMSE:',np.sqrt(mean_squared_error(y_test, y_pred_test)))
# print('-'*35)

# Bayesian ridge regression

In [34]:
from sklearn import linear_model

model_br = linear_model.BayesianRidge()

model_br.fit(X_train, y_train)
y_pred_test = model_br.predict(X_test)
y_pred_train = model_br.predict(X_train)

print('-'*35)
print('Prediction for the training data')
print('MSE:', mean_squared_error(y_train, y_pred_train))
print('RMSE:',np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('-'*35)
print('Prediction for the test data')
print('MSE:', mean_squared_error(y_test, y_pred_test))
print('RMSE:',np.sqrt(mean_squared_error(y_test, y_pred_test)))
print('-'*35)

-----------------------------------
Prediction for the training data
MSE: 155.2090561706318
RMSE: 12.45829266675943
-----------------------------------
Prediction for the test data
MSE: 160.8622580578977
RMSE: 12.683148586131825
-----------------------------------


# Gradient boosting regression

In [35]:
model_gb = GradientBoostingRegressor(n_estimators=250, learning_rate=0.1, random_state=42)

model_gb.fit(X_train, y_train)
y_pred_test = model_gb.predict(X_test)
y_pred_train = model_gb.predict(X_train)

print('-'*35)
print('Prediction for the training data')
print('MSE:', mean_squared_error(y_train, y_pred_train))
print('RMSE:',np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('-'*35)
print('Prediction for the test data')
print('MSE:', mean_squared_error(y_test, y_pred_test))
print('RMSE:',np.sqrt(mean_squared_error(y_test, y_pred_test)))
print('-'*35)

-----------------------------------
Prediction for the training data
MSE: 132.6200835054261
RMSE: 11.516079346089366
-----------------------------------
Prediction for the test data
MSE: 138.26230799692624
RMSE: 11.758499393924644
-----------------------------------


# Hist gradient boosting regression

In [36]:
model_hgb= HistGradientBoostingRegressor(learning_rate=0.1, random_state=42)

model_hgb.fit(X_train, y_train)
y_pred_test = model_hgb.predict(X_test)
y_pred_train = model_hgb.predict(X_train)

print('-'*35)
print('Prediction for the training data')
print('MSE:', mean_squared_error(y_train, y_pred_train))
print('RMSE:',np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('-'*35)
print('Prediction for the test data')
print('MSE:', mean_squared_error(y_test, y_pred_test))
print('RMSE:',np.sqrt(mean_squared_error(y_test, y_pred_test)))
print('-'*35)

-----------------------------------
Prediction for the training data
MSE: 92.71560821972642
RMSE: 9.628894444313243
-----------------------------------
Prediction for the test data
MSE: 99.27076665600632
RMSE: 9.963471616660847
-----------------------------------


## With cross_validation

In [37]:
model_hgb= HistGradientBoostingRegressor(learning_rate=0.1, random_state=42)

cv_scores = cross_val_score(model_hgb, X_train, y_train, cv=5, scoring='neg_mean_squared_error') 

print('-'*35)
print('Prediction for the training data')
print('mean MSE:',np.mean(-cv_scores))
print('std MSE:',np.std(-cv_scores))
print('mean RMSE:',np.sqrt(np.mean(-cv_scores)))

model_hgb.fit(X_train, y_train)

# predict y values for testing data set
y_pred_test = model_hgb.predict(X_test)

# evaluate MSE
print('-'*35)
print('Prediction for the test data')
mse_test = mean_squared_error(y_test, y_pred_test)
print('test MSE: {0}'.format(mse_test))
print('test RMSE: {0}'.format(np.sqrt(mse_test)))
print('-'*35)

-----------------------------------
Prediction for the training data
mean MSE: 104.12262727565106
std MSE: 1.7679667347746886
mean RMSE: 10.204049552783006
-----------------------------------
Prediction for the test data
test MSE: 99.27076665600632
test RMSE: 9.963471616660847
-----------------------------------


# Dummy model

In [38]:
model_d = DummyClassifier(strategy='most_frequent')

model_d.fit(X_train, y_train)
y_pred_test = model_d.predict(X_test)
y_pred_train = model_d.predict(X_train)

# Calculate and print the model's accuracy on the test data
print(f"Accuracy: {accuracy_score(y_test, y_pred_test)*100:.4f}%")
print('-'*35)
print('Prediction for the training data')
print('MSE:', mean_squared_error(y_train, y_pred_train))
print('RMSE:',np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('-'*35)
print('Prediction for the test data')
print('MSE:', mean_squared_error(y_test, y_pred_test))
print('RMSE:',np.sqrt(mean_squared_error(y_test, y_pred_test)))
print('-'*35)

Accuracy: 11.4121%
-----------------------------------
Prediction for the training data
MSE: 568.8447078746825
RMSE: 23.85046556934859
-----------------------------------
Prediction for the test data
MSE: 568.856078564172
RMSE: 23.85070394273871
-----------------------------------


Scratch work

In [39]:
# le = LabelEncoder()
# for col in col_cat:
#     df_explore[col] = le.fit_transform(df_explore[col])

In [40]:
# enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# test = enc.fit(df_explore[col_num])

# # Create an encoder 
# encoder= OneHotEncoder(sparse_output=False, handle_unknown='ignore') 
 
# # Train encoder using multiple columns
# encoder.fit(df_explore[col_num]) 
 
# # Get encoded column names 
# encoded_columns=encoder.get_feature_names_out() 
 
# # Transform data using the trained encoder 
# encoded_data = encoder.transform(df_explore[col_num]) 
 
# # Create a dataframe using the encoded data 
# encoded_df=pd.DataFrame(encoded_data, columns=encoded_columns) 
# encoded_df