# Hurray !! Scikit-Learn In VS Code!! This Dark Mode, these features, extensions... I just love it :))

In [3]:
# Importing all necessary libraries at one go
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
heart_disease = pd.read_csv('data/heart-disease.csv')
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


## Getting our data ready to be used with Machine Learning Model

### 1. Three main things we have to do:

    1. Split the data into features and labels (usually `X` & `y`).    
    2. Filing (also called imputing) or disregarding missing values
    3. Converting non-numerical values into numerical values (also called feature encoding)

In [3]:
# Split the data into features & labels (say 'X' & 'y')

X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

In [4]:
# split the data into training and test data
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestClassifier()

model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8524590163934426

### What if we have non-numerical value ?
#### Machine Learning Model does not work on Non-Numerical Values. So we need to make them all Numeric. 
Scikit-Learn comes with a module called `OneHotEncoder` that handles non-numerical values..
Let's have a look.
* Note: Here we will import another dataset for this as our previous dataset i.e. heart_disease dataset doesn't have non-numerical values to try our hands.. So Let's Dive In

In [5]:
car_sales = pd.read_csv('data/car-sales-extended.csv')
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [6]:
X = car_sales.drop("Price", axis=1)
y = car_sales["Price"]

In [7]:
# Let's turn categories (Make, Colour & Doors) to numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()

transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")
transformed_X = transformer.fit_transform(X)
# pd.DataFrame(transformed_X)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

model = RandomForestRegressor()

model.fit(X_train, y_train)
model.score(X_test, y_test)

0.3235867221569877

We got so low score because predicting price of cars by their colour, or number of doors does not makes sense.

### What if we have missing values
We can do two things. Either we can fill with some values or completely remove them from dataset.

In [4]:
# import dataset with missing values
car_sales_missing = pd.read_csv('data/car-sales-extended-missing-data.csv')
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [5]:
# Let's see how many missing values are there
car_sales_missing.isnull().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [11]:
# At first, drop labels with missing values i.e. 'Price'
car_sales_missing.dropna(subset=["Price"], inplace=True)
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [12]:
from sklearn.model_selection import train_test_split    
# Split into X and y
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

# Split into train and test data
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [13]:
# Fill missing values with Scikit-Learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with 'missing' & numerical values with mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# Define columns
cat_features = ["Make", "Colour"]
door_feature = ["Doors"]
num_feature = ["Odometer (KM)"]

# Create an Imputer (An Imputer is something that fills up data)
imputer = ColumnTransformer([
    ('cat_imputer', cat_imputer, cat_features),
    ('door_imputer', door_imputer, door_feature),
    ('num_imputer', num_imputer, num_feature)
])

# Fill train and test values SEPARATELY
filled_X_train = imputer.fit_transform(X_train)
filled_X_test = imputer.fit_transform(X_test)

In [14]:
# Now get this newly edited data into Dataframe
car_sales_filled_train = pd.DataFrame(filled_X_train, columns=["Make", "Colour", "Doors", "Odometer (KM)"])

car_sales_filled_test = pd.DataFrame(filled_X_test, columns=["Make", "Colour", "Doors", "Odometer (KM)"])


In [15]:
# Now we have to do OneHot Encoding (copy the same code as above)

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()

transformer = ColumnTransformer([('one_hot', one_hot, categorical_features)])

transformed_X_train = transformer.fit_transform(car_sales_filled_train)
transformed_X_test = transformer.fit_transform(car_sales_filled_test)

In [16]:
# Let's finally fit the data
from sklearn.ensemble import RandomForestRegressor
np.random.seed(42)

model = RandomForestRegressor()

model.fit(transformed_X_train, y_train)
model.score(transformed_X_test, y_test)

0.08787238417429777

## Choosing the right estimator/algorithm for your problem
Key Points:
* Sklearn refers to machine learning models, algorithms as `estimators`.
* `Classification` - predicting a category (whether this or that,   whether patient has heart disease or not).
* `Regression` - predicting a number (selling price of car).
 
* Sklearn Model Map - https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

### Let's learn how to pick a machine learning model for a Regression Problem
Here we will use California Housing Dataset - https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html

In [17]:
# Get California Housing Dataset in your notebook
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [21]:
# Add this weird looking data in pandas dataframe, so that we can visualise it properly

# Adding data
housing_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])

# Adding target column
housing_df["target"] = housing["target"]
housing_df.head()

housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [22]:
# Import algorithm/estimator
# we found this algorithm in model map. There are several other model, so we'll try different models & check which model is providing us better results.

from sklearn.linear_model import Ridge

np.random.seed(42)

# create the data
X = housing_df.drop("target", axis=1)
y = housing_df["target"]

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate and fit the model on training set
model = Ridge()
model.fit(X_train, y_train)

# let's check the score of this model
model.score(X_test, y_test)

0.5758549611440128

So, `Ridge` model give us quite bad score i.e. 57%. As I said earlier, lets try out different model from that map.
Here I am trying `ensemble` model. Ensemble is combination of smaller models to try and make better predictions than just a single model. 
Let's see how this model performs

In [24]:
# import the model and its RandomForestRegressor module
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = housing_df.drop("target", axis=1)
y = housing_df["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestRegressor()
model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8066196804802649

Wohooo! Its 80.6%. Much higher than just 57%. You can't predict which model is best before applying them. So keep applying models and check what is best for you.

### Picking Machine Learning Model for a Classification Problem

In [26]:
# Get heart disease dataset
heart_disease = pd.read_csv('data/heart-disease.csv')
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


After going through sklearn model map, we came through `LinearSVC` model. Let's jump into it

In [32]:
#  import the LinearSVC class
from sklearn.svm import LinearSVC

np.random.seed(42)

X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = LinearSVC(max_iter=10000)
clf.fit(X_train, y_train)

clf.score(X_test, y_test)



0.8688524590163934