# Introduction to Scikit-Learn (SKLearn)

This notebook demonstrates some of the most useful functions of the beautiful Sci-Kit Learn Library. 

What we're going to cover:

0. An end-to-end Scikit-Learn Workflow
1. Getting the data ready
2. Choose the right estimator (model, algorithm)
3. Fit the model/estimator/algorithm and use it to make predictions on the data
4. Evaluating a model
5. Improve a model
6. Save and load a trained model
7. Put it all together


In [90]:
# Listify the Contents of what we're learning... 
what_we_are_covering = [
    "0. An end-to-end Scikit-Learn Workflow", 
    "1. Getting the data ready",
    "2. Choose the right estimator (model, algorithm)",
    "3. Fit the model/estimator/algorithm and use it to make predictions on the data",
    "4. Evaluating a model",
    "5. Improve a model",
    "6. Save and load a trained model",
    "7. Put it all together",
];

# 0. An End-To-End Scikit-Learn Workflow


In [91]:
#1. Get the data ready

# Standar Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


heart_disease = pd.read_csv("data/heart-disease.csv")
heart_disease, heart_disease[:5]

(     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
 0     63    1   3       145   233    1        0      150      0      2.3   
 1     37    1   2       130   250    0        1      187      0      3.5   
 2     41    0   1       130   204    0        0      172      0      1.4   
 3     56    1   1       120   236    0        1      178      0      0.8   
 4     57    0   0       120   354    0        1      163      1      0.6   
 ..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
 298   57    0   0       140   241    0        1      123      1      0.2   
 299   45    1   3       110   264    0        1      132      0      1.2   
 300   68    1   0       144   193    1        1      141      0      3.4   
 301   57    1   0       130   131    0        1      115      1      1.2   
 302   57    0   1       130   236    0        0      174      0      0.0   
 
      slope  ca  thal  target  
 0        0   0     1       1  
 1        

In [92]:
# Create X, which is known as the Features Matrix / Data / Features Variable
X = heart_disease.drop("target", axis = 1)

# Create Y, which is known as the Labels Matrix, Labels
Y = heart_disease["target"]

In [93]:
# 2. Choose the Right Model / Estimator / Algorithm  and Hyperparmaters (the dials on the Model to make it better/worse)
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

# We'll use the default Hyperparameters
clf.get_params()



{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [94]:
# 3. Fit the model to the Training Data

from sklearn.model_selection import train_test_split

X_train, X_test, Y_Train, Y_test = train_test_split(X,Y, test_size= 0.2)

In [95]:
len(X_test), len(X_train), len(Y_Train), len(Y_test)

(61, 242, 242, 61)

In [96]:
clf.fit(X_train, Y_Train)

RandomForestClassifier()

# Make a prediction

y_label = clf.predict(np.array([0,2,3,4]))

In [97]:
y_preds = clf.predict(X_test)
y_preds

array([0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1], dtype=int64)

In [98]:
Y_test

170    0
201    0
80     1
141    1
51     1
      ..
166    0
202    0
172    0
84     1
68     1
Name: target, Length: 61, dtype: int64

In [99]:
# 4. Evaluate the model on the training data and test data
clf.score(X_train, Y_Train)

1.0

In [100]:
clf.score(X_test, Y_test)

0.8360655737704918

In [101]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


print(classification_report(Y_test, y_preds))

              precision    recall  f1-score   support

           0       0.73      0.80      0.76        20
           1       0.90      0.85      0.88        41

    accuracy                           0.84        61
   macro avg       0.81      0.83      0.82        61
weighted avg       0.84      0.84      0.84        61



In [102]:
confusion_matrix(Y_test, y_preds)

array([[16,  4],
       [ 6, 35]], dtype=int64)

In [103]:
accuracy_score(Y_test, y_preds)

0.8360655737704918

In [104]:
# 5. Improve the model
# Try different amount of n_estimators

np.random.seed(42)

for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    clf = RandomForestClassifier(n_estimators=i)
    clf.fit(X_train, Y_Train)
    print(f"Model accuracy on test set: {clf.score(X_test, Y_test) * 100:.2f}%")
    print("")


Trying model with 10 estimators...
Model accuracy on test set: 70.49%

Trying model with 20 estimators...
Model accuracy on test set: 80.33%

Trying model with 30 estimators...
Model accuracy on test set: 78.69%

Trying model with 40 estimators...
Model accuracy on test set: 80.33%

Trying model with 50 estimators...
Model accuracy on test set: 80.33%

Trying model with 60 estimators...
Model accuracy on test set: 80.33%

Trying model with 70 estimators...
Model accuracy on test set: 78.69%

Trying model with 80 estimators...
Model accuracy on test set: 80.33%

Trying model with 90 estimators...
Model accuracy on test set: 78.69%



In [105]:
#6. Save a model and load it
import pickle

pickle.dump(clf, open("random_forest_model_1.pkl", "wb"))

In [106]:
loaded_clf = pickle.load((open("random_forest_model_1.pkl", "rb")))
loaded_clf.score(X_test, Y_test)

0.7868852459016393

In [107]:
adjusted_model = RandomForestClassifier(n_estimators=60)
adjusted_model.fit(X_train, Y_Train)

adjusted_model.score(X_train, Y_Train)

1.0

In [108]:
adjusted_model.score(X_test, Y_test)

0.8032786885245902

In [109]:
import sklearn
sklearn.show_versions()


System:
    python: 3.8.11 (default, Aug  6 2021, 09:57:55) [MSC v.1916 64 bit (AMD64)]
executable: C:\Projects\first-machine-learning-project\env\python.exe
   machine: Windows-10-10.0.19043-SP0

Python dependencies:
          pip: 21.2.2
   setuptools: 52.0.0.post20210125
      sklearn: 0.24.2
        numpy: 1.19.2
        scipy: 1.6.2
       Cython: None
       pandas: 1.2.3
   matplotlib: 3.3.4
       joblib: 1.0.1
threadpoolctl: 2.2.0

Built with OpenMP: True


In [110]:
sklearn.show_versions()


System:
    python: 3.8.11 (default, Aug  6 2021, 09:57:55) [MSC v.1916 64 bit (AMD64)]
executable: C:\Projects\first-machine-learning-project\env\python.exe
   machine: Windows-10-10.0.19043-SP0

Python dependencies:
          pip: 21.2.2
   setuptools: 52.0.0.post20210125
      sklearn: 0.24.2
        numpy: 1.19.2
        scipy: 1.6.2
       Cython: None
       pandas: 1.2.3
   matplotlib: 3.3.4
       joblib: 1.0.1
threadpoolctl: 2.2.0

Built with OpenMP: True


In [111]:
what_we_are_covering

['0. An end-to-end Scikit-Learn Workflow',
 '1. Getting the data ready',
 '2. Choose the right estimator (model, algorithm)',
 '3. Fit the model/estimator/algorithm and use it to make predictions on the data',
 '4. Evaluating a model',
 '5. Improve a model',
 '6. Save and load a trained model',
 '7. Put it all together']

# 1. Getting the Data Ready to be used with machine learning


Three main things we have to do to get the data ready are:

    1. Split the data into features and lables (usually known as `X` & `y`)
    2. Filling (also called imputing) or disregarding missing values
    3. Converting non-numeric values to numeric values (also called Feature Encoding)

In [112]:
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [113]:
X = heart_disease.drop("target", axis = 1);
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [114]:
y = heart_disease["target"];
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [115]:
len(X), len(y)

(303, 303)

In [116]:
# Split the Data into Training and Test Sets... 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

In [117]:
X_train.shape, X_test.shape, y_test.shape, y_test.shape

((151, 13), (152, 13), (152,), (152,))

In [118]:
X.shape, heart_disease.shape

((303, 13), (303, 14))

In [119]:
car_sales = pd.read_csv("data/car-sales.csv")

In [120]:
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [121]:
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"


In [122]:
X = car_sales.drop("Price", axis = 1)

In [123]:
y = car_sales["Price"]

In [124]:
X, y

(     Make Colour  Odometer (KM)  Doors
 0  Toyota  White         150043      4
 1   Honda    Red          87899      4
 2  Toyota   Blue          32549      3
 3     BMW  Black          11179      5
 4  Nissan  White         213095      4
 5  Toyota  Green          99213      4
 6   Honda   Blue          45698      4
 7   Honda   Blue          54738      4
 8  Toyota  White          60000      4
 9  Nissan  White          31600      4,
 0     $4,000.00
 1     $5,000.00
 2     $7,000.00
 3    $22,000.00
 4     $3,500.00
 5     $4,500.00
 6     $7,500.00
 7     $7,000.00
 8     $6,250.00
 9     $9,700.00
 Name: Price, dtype: object)

In [125]:
X.shape, y.shape, car_sales.shape

((10, 4), (10,), (10, 5))

In [126]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

In [127]:
X_train, X_test

(     Make Colour  Odometer (KM)  Doors
 1   Honda    Red          87899      4
 6   Honda   Blue          45698      4
 9  Nissan  White          31600      4
 2  Toyota   Blue          32549      3
 7   Honda   Blue          54738      4
 5  Toyota  Green          99213      4
 4  Nissan  White         213095      4
 0  Toyota  White         150043      4,
      Make Colour  Odometer (KM)  Doors
 8  Toyota  White          60000      4
 3     BMW  Black          11179      5)

In [128]:
X_train.shape, X_test.shape

((8, 4), (2, 4))

In [129]:
#clf.fit(X_train, y_train)

In [130]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder();
print(label_encoder.fit_transform(car_sales["Make"]))

[3 1 3 0 2 3 1 1 3 2]


In [131]:
print(label_encoder.fit_transform(car_sales["Colour"]))

[4 3 1 0 4 2 1 1 4 4]


# About to learn how to convert data to numbers

# 1.1 Make sure all the Data is all numerical

In [132]:
car_sales = pd.read_csv("data/car-sales-extended.csv")
car_sales.head(), len(car_sales)

(     Make Colour  Odometer (KM)  Doors  Price
 0   Honda  White          35431      4  15323
 1     BMW   Blue         192714      5  19943
 2   Honda  White          84714      4  28343
 3  Toyota  White         154365      4  13434
 4  Nissan   Blue         181577      3  14043,
 1000)

In [133]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [134]:
# Split the data in X & y (Features & Labels)
X = car_sales.drop("Price", axis = 1)
y = car_sales["Price"]

In [135]:
X.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [136]:
# Split into traing and test data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [137]:
len(X_train), len(X_test)

(800, 200)

In [138]:
# Build a Machine Learning Model
#from sklearn.ensemble import RandomForestRegressor

#model = RandomForestRegressor()
#model.fit(X_train, y_train)
#model.score(X_test, y_test)

In [139]:
car_sales["Doors"].value_counts()

4    856
5     79
3     65
Name: Doors, dtype: int64

In [140]:
# Turn the Features/Categories into Numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]

#Init the Transformers & Encoders
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

transformed_X = transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [141]:
df = pd.DataFrame(transformed_X)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [142]:
X

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3
...,...,...,...,...
995,Toyota,Black,35820,4
996,Nissan,White,155144,3
997,Nissan,Blue,66604,4
998,Honda,White,215883,4


In [143]:
dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]])
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
995,4,0,0,0,1,1,0,0,0,0
996,3,0,0,1,0,0,0,0,0,1
997,4,0,0,1,0,0,1,0,0,0
998,4,0,1,0,0,0,0,0,0,1


In [144]:
# Let's refit the model

np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

model.fit(X_train, y_train)


RandomForestRegressor()

In [145]:
model.score(X_test, y_test)

0.3235867221569877

# 1.2 -> What if there were missing values 


1. Fill them with some values (also known as imputation)
2. Remove the Samples with missing data

In [146]:
# Import Missing Data

car_sales_missing = pd.read_csv("data/car-sales-extended-missing-data.csv");
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [147]:
# Find the Number of missing values in a Dataframe
car_sales_missing.isna().sum(), car_sales.isna().sum()

(Make             49
 Colour           50
 Odometer (KM)    50
 Doors            50
 Price            50
 dtype: int64,
 Make             0
 Colour           0
 Odometer (KM)    0
 Doors            0
 Price            0
 dtype: int64)

In [148]:
# Transfrom the data into X and y
X_missing = car_sales_missing.drop("Price", axis = 1)
y_missing = car_sales_missing["Price"]

X_missing.isna().sum()


Make             49
Colour           50
Odometer (KM)    50
Doors            50
dtype: int64

In [149]:
# Let's try to convert the missing data to numbers

# Turn the Features/Categories into Numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]

#Init the Transformers & Encoders
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

transformed_X = transformer.fit_transform(X_missing)
transformed_X

<1000x16 sparse matrix of type '<class 'numpy.float64'>'
	with 4000 stored elements in Compressed Sparse Row format>

In [150]:
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [153]:
transformed_X.dtype, X_missing.dtypes

(dtype('float64'),
 Make              object
 Colour            object
 Odometer (KM)    float64
 Doors            float64
 dtype: object)

# Fill the Missing Data with pandas

In [154]:
# Fill the "Make" column
car_sales_missing["Make"].fillna("missing", inplace= True)

# Fill the "Colour Column"
car_sales_missing["Colour"].fillna("missing", inplace=True)

# Fill the "Odometer" Column
car_sales_missing["Odometer (KM)"].fillna(car_sales_missing["Odometer (KM)"].mean(), inplace= True)

# Fill the "Doors" column
car_sales_missing["Doors"].fillna(4, inplace= True)

In [155]:
car_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [156]:
car_sales_missing["Doors"].value_counts()

4.0    861
5.0     75
3.0     64
Name: Doors, dtype: int64

In [157]:
car_sales_missing["Make"].value_counts()

Toyota     379
Honda      292
Nissan     183
BMW         97
missing     49
Name: Make, dtype: int64

In [158]:
# Remove rows with missing data in the Labels
car_sales_missing.dropna(inplace = True)

In [159]:
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [160]:
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,missing,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [161]:
len(car_sales_missing)

950

In [162]:
# Get the Features & Labels of the Filled Missing Data frame

X = car_sales_missing.drop("Price", axis = 1)
y = car_sales_missing["Price"]

In [164]:
len(X), len(y)

(950, 950)

In [167]:
# Convert the missing data to numbers

# Turn the Features/Categories into Numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]

#Init the Transformers & Encoders
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

transformed_X = transformer.fit_transform(car_sales_missing)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.54310e+04, 1.53230e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.92714e+05, 1.99430e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        8.47140e+04, 2.83430e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.66040e+04, 3.15700e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.15883e+05, 4.00100e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.48360e+05, 1.27320e+04]])

In [168]:
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.54310e+04, 1.53230e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.92714e+05, 1.99430e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        8.47140e+04, 2.83430e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.66040e+04, 3.15700e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.15883e+05, 4.00100e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.48360e+05, 1.27320e+04]])

In [None]:
    # 2. Fill the Missing Data with the Scik-Kit Learn library