# Intrudoction to Scikit-Learn
0. An end to end scikit learn workflow
1. Getting the data ready
2. Choose the right estimator/algorithm for our problem

In [6]:
# 1. Get the data ready

import pandas as pd
import numpy as np
heart_disease=pd.read_csv("heart-disease.csv")

In [7]:
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [8]:
# Create X (Features matrix)
X=heart_disease.drop("target",axis=1)

# Create y (Label matrix)
Y=heart_disease["target"]


In [9]:
#2. Choose the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(n_estimators=100)

# We will keep the default hyperparameters
clf.get_params()


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [10]:
#3. Fit the model to the training  data
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test=train_test_split(X,Y, test_size=0.2)


In [11]:
clf.fit(X_train,Y_train);

In [12]:
# Make a prediction
y_preds=clf.predict(X_test)
y_preds



array([1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

In [45]:
Y_test

134    1
65     1
19     1
46     1
167    0
      ..
281    0
245    0
24     1
177    0
32     1
Name: target, Length: 61, dtype: int64

In [46]:
# 4. Evaluate the model on the training data and test data
clf.score(X_train,Y_train)

1.0

In [47]:
clf.score(X_test,Y_test)

0.8852459016393442

In [48]:
from sklearn.metrics import classification_report , confusion_matrix, accuracy_score
print(classification_report(Y_test,y_preds))


              precision    recall  f1-score   support

           0       0.89      0.83      0.86        29
           1       0.85      0.91      0.88        32

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61



In [49]:
confusion_matrix(Y_test,y_preds)

array([[24,  5],
       [ 3, 29]], dtype=int64)

In [50]:
accuracy_score(Y_test,y_preds)

0.8688524590163934

In [51]:
# 5. Improve model
# try different amount of n_estimators
np.random.seed(42)
for i in range (10,100,10):
    print(f"Trying model with {i} estimators...")
    clf=RandomForestClassifier(n_estimators=i).fit(X_train,Y_train)
    print(f"Model accuracy on test {clf.score(X_test,Y_test)}")
    print("")

Trying model with 10 estimators...
Model accuracy on test 0.7868852459016393

Trying model with 20 estimators...
Model accuracy on test 0.8852459016393442

Trying model with 30 estimators...
Model accuracy on test 0.7868852459016393

Trying model with 40 estimators...
Model accuracy on test 0.8852459016393442

Trying model with 50 estimators...
Model accuracy on test 0.8688524590163934

Trying model with 60 estimators...
Model accuracy on test 0.8524590163934426

Trying model with 70 estimators...
Model accuracy on test 0.9016393442622951

Trying model with 80 estimators...
Model accuracy on test 0.8852459016393442

Trying model with 90 estimators...
Model accuracy on test 0.8852459016393442



In [52]:
#6. Save the model and load it
import pickle
pickle.dump(clf,open("random_forest_model_1.pkl","wb"))

In [53]:
loaded_model=pickle.load(open("random_forest_model_1.pkl","rb"))

In [54]:
loaded_model.score(X_test,Y_test)

0.8852459016393442

# getting data ready to be used with machine learning

 Three main things we have to do:

1. split the data into feature and labels (usually 'X' and 'Y')
2. Filling (also called imputing) or disregarding missing values
3. converting nun numeric values to numeric values (feature encoding)
    

In [55]:
heart_disease.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [56]:
X=heart_disease.drop("target",axis=1)
Y=heart_disease["target"]

In [57]:
# split the data to training and test sets
from sklearn.model_selection import train_test_split
xx_train,xx_test,yy_train,yy_test=train_test_split(X,Y,test_size=0.2)


In [58]:
xx_train.shape,xx_test.shape, yy_train.shape,yy_test.shape

((242, 13), (61, 13), (242,), (61,))

# Make sure all data are numeric

In [59]:
car_sales=pd.read_csv("car-sales-extended.csv")
car_sales.head()


Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [60]:
len(car_sales)

1000

In [61]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [62]:
# Split into x/y
X3=car_sales.drop("Price", axis=1)
Y3=car_sales["Price"]

x3_train,x3_test,y3_train,y3_test=train_test_split(X3,Y3,test_size=0.2)

In [63]:
# build machine learning model
from sklearn.ensemble import RandomForestRegressor
model=RandomForestRegressor()
model.fit(x3_train,y3_train)
model.score(x3_train,y3_train)

ValueError: could not convert string to float: 'Toyota'

In [64]:
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [65]:
# Turn the sections like name of the car and the color to numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features=["Make","Colour","Doors"]
one_hot=OneHotEncoder()
transformer=ColumnTransformer([("one_hot", one_hot, categorical_features)],remainder="passthrough")
transformed_x=transformer.fit_transform(X)
transformed_x




ValueError: A given column is not a column of the dataframe

In [66]:
pd.DataFrame(transformed_x)

NameError: name 'transformed_x' is not defined

In [67]:
dummies=pd.get_dummies(car_sales[["Make","Colour","Doors"]])
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
995,4,0,0,0,1,1,0,0,0,0
996,3,0,0,1,0,0,0,0,0,1
997,4,0,0,1,0,0,1,0,0,0
998,4,0,1,0,0,0,0,0,0,1


In [68]:
# Refit the model
np.random.seed(42)
x3_train,x3_test,y3_train,y3_test = train_test_split(transformed_x,Y3, test_size=0.2)
model.fit(x3_train,y3_train)

NameError: name 'transformed_x' is not defined

In [69]:
model.score(x3_test,y3_test)

ValueError: could not convert string to float: 'Honda'

# What if there is missing value
1. fill them with some value (also known as imputation).
2. Remove the samples with missing fata

In [70]:
car_sales_missing_extended=pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing_extended

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [71]:
car_sales_missing_extended.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [72]:
X4=car_sales_missing_extended.drop("Price", axis=1)
Y4=car_sales_missing_extended["Price"]


# convert our data to numbers
# Turn the sections like name of the car and the color to numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features=["Make","Colour","Doors"]
one_hot=OneHotEncoder()
transformer=ColumnTransformer([("one_hot", one_hot, categorical_features)],remainder="passthrough")
transformed_x4=transformer.fit_transform(car_sales_missing_extended)
transformed_x4

<1000x17 sparse matrix of type '<class 'numpy.float64'>'
	with 5000 stored elements in Compressed Sparse Row format>

In [73]:
# Option1: fill missing data with Pandas
car_sales_missing_extended["Make"].fillna("missing",inplace=True)
car_sales_missing_extended["Colour"].fillna("missing",inplace=True)
car_sales_missing_extended["Odometer (KM)"].fillna(car_sales_missing_extended["Odometer (KM)"].mean(),inplace=True)
car_sales_missing_extended["Doors"].fillna(4,inplace=True) # majority of cars have 4 doors

In [74]:
car_sales_missing_extended.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [75]:
# Remove rows with missing price values
car_sales_missing_extended.dropna(inplace=True)

In [76]:
car_sales_missing_extended.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [77]:
X5=car_sales_missing_extended.drop("Price", axis=1)
Y5=car_sales_missing_extended["Price"]


# convert our data to numbers
# Turn the sections like name of the car and the color to numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features=["Make","Colour","Doors"]
one_hot=OneHotEncoder()
transformer=ColumnTransformer([("one_hot", one_hot, categorical_features)],remainder="passthrough")
transformed_x5=transformer.fit_transform(car_sales_missing_extended)
transformed_x5

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.54310e+04, 1.53230e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.92714e+05, 1.99430e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        8.47140e+04, 2.83430e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.66040e+04, 3.15700e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.15883e+05, 4.00100e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.48360e+05, 1.27320e+04]])

In [78]:
pd.DataFrame(transformed_x5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,35431.0,15323.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0,19943.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,84714.0,28343.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,154365.0,13434.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0,14043.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
945,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0,32042.0
946,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,155144.0,5716.0
947,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0,31570.0
948,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,215883.0,4001.0


# Fill missing data with Scikit Learn

In [79]:
car_sales_missing_extended=pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing_extended


Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [80]:

car_sales_missing_extended.dropna(subset=["Price"],inplace=True)
car_sales_missing_extended.isna().sum()


Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [81]:
#split into x and y
x=car_sales_missing_extended.drop("Price",axis=1)
y=car_sales_missing_extended["Price"]

In [82]:
#Fill missing values with Scikit-Learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

#fill categorical values with "missing and numerical values with mean"
# cat===cathegorical
cat_imputer=SimpleImputer(strategy="constant",fill_value="missing")
door_imputer=SimpleImputer(strategy="constant",fill_value=4)
num_imputer=SimpleImputer(strategy="mean")

#Define columns
cat_features=["Make","Colour"]
door_feature=["Doors"]
num_features=["Odometer (KM)"]

# Create an imputer (something that fills missing data)
imputer=ColumnTransformer([
    ("cat_imputerrr",cat_imputer,cat_features),
    ("door_imputerrr",door_imputer,door_feature),
    ("num_imputerrr",num_imputer,num_features)
])

#Transform data

filled_x=imputer.fit_transform(x)
filled_x


array([['Honda', 'White', 4.0, 35431.0],
       ['BMW', 'Blue', 5.0, 192714.0],
       ['Honda', 'White', 4.0, 84714.0],
       ...,
       ['Nissan', 'Blue', 4.0, 66604.0],
       ['Honda', 'White', 4.0, 215883.0],
       ['Toyota', 'Blue', 4.0, 248360.0]], dtype=object)

In [83]:
car_sales_filled_values=pd.DataFrame(filled_x,columns=["Make","Colour","Doors","Odometer (KM)"])
car_sales_filled_values.head()

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,4.0,35431.0
1,BMW,Blue,5.0,192714.0
2,Honda,White,4.0,84714.0
3,Toyota,White,4.0,154365.0
4,Nissan,Blue,3.0,181577.0


In [84]:
car_sales_filled_values.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [85]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features=["Make","Colour","Doors"]
one_hot=OneHotEncoder()
transformer=ColumnTransformer([("one_hot", one_hot, categorical_features)],remainder="passthrough")
transformed_x=transformer.fit_transform(car_sales_filled_values)
transformed_x

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [87]:
# Now we have got our data as number and filled and lets fit the model

np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test=train_test_split(transformed_x,y,test_size=0.2)
model=RandomForestRegressor()
model.fit(x_train,y_train)
model.score(x_test,y_test)

# Prediction 
model.predict(x_test)

array([17256.07      , 20654.13      , 12458.45      ,  9437.53      ,
       11143.27      , 11170.52      , 15647.07      , 10337.8       ,
       17105.        , 15316.12995349,  8415.59      , 14374.12      ,
        8551.6       ,  9911.55      , 13487.02      , 19700.26      ,
       15229.49      ,  7452.8       , 11179.56      , 14656.62      ,
       11037.16      , 17882.96916667, 19921.62      , 27286.56      ,
        8906.46      , 21191.73      , 12903.89      ,  7937.72      ,
       20344.09      , 17089.77      , 11372.4       , 17132.11      ,
       10720.56      , 11315.72119048, 27704.28      , 15184.59      ,
       12212.17      , 13520.17      , 21698.51      ,  9472.67      ,
       15669.16      , 20707.49      , 25363.61      , 15490.39      ,
       14009.52857143, 11602.37      , 14853.96      ,  8408.09      ,
       15397.05      , 13027.37      , 11249.86      , 21260.71      ,
       14986.42      ,  5816.66      , 12175.37      ,  9617.15      ,
      