# Getting data ready to be used with machine learning steps

### 1. Split the data into features and labels ('X' & 'y')
### 2. Filling (also called imputing) or disregarding missing values
### 3. Converting non-numerical values to numerical values (also called feature encoding)



In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [13]:
file_path = r'B:\jupyter\Projects\Cheat Sheets\Sci Kit Learn Data\heart-disease.csv'
heart_disease = pd.read_csv(file_path)
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [14]:
X = heart_disease.drop("target",axis=1)
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [15]:
y = heart_disease["target"]
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

# Split data into training and testing sets (Never test your models on data that it has learned from)

In [16]:
from sklearn.model_selection import train_test_split
# Once we use the function train_test_split it will return the 4 values below once given the parameters of X,y and test_size
#Test size will be 20% of the overall data
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.2)

In [17]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((242, 13), (61, 13), (242,), (61,))

In [18]:
# 80% is the training data , 20% is the testing data
X.shape[0]*0.8 , X.shape[0]*0.2

(242.4, 60.6)

# Converting Data to Numbers

In [19]:
file_path = r'B:\jupyter\Projects\Cheat Sheets\Sci Kit Learn Data\car-sales-extended.csv'
car_sales = pd.read_csv(file_path)
car_sales.head(10)


Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043
5,Honda,Red,42652,4,23883
6,Toyota,Blue,163453,4,8473
7,Honda,White,43120,4,20306
8,Nissan,White,130538,4,9374
9,Honda,Blue,51029,4,26683


In [20]:
car_sales.info

<bound method DataFrame.info of        Make Colour  Odometer (KM)  Doors  Price
0     Honda  White          35431      4  15323
1       BMW   Blue         192714      5  19943
2     Honda  White          84714      4  28343
3    Toyota  White         154365      4  13434
4    Nissan   Blue         181577      3  14043
..      ...    ...            ...    ...    ...
995  Toyota  Black          35820      4  32042
996  Nissan  White         155144      3   5716
997  Nissan   Blue          66604      4  31570
998   Honda  White         215883      4   4001
999  Toyota   Blue         248360      4  12732

[1000 rows x 5 columns]>

In [21]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

## Trying to make machine learning with categorical data

In [24]:
# Split into X/y

X = car_sales.drop("Price",axis=1)
y = car_sales["Price"]

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3
...,...,...,...,...
995,Toyota,Black,35820,4
996,Nissan,White,155144,3
997,Nissan,Blue,66604,4
998,Honda,White,215883,4


In [26]:
X,y

(       Make Colour  Odometer (KM)  Doors
 0     Honda  White          35431      4
 1       BMW   Blue         192714      5
 2     Honda  White          84714      4
 3    Toyota  White         154365      4
 4    Nissan   Blue         181577      3
 ..      ...    ...            ...    ...
 995  Toyota  Black          35820      4
 996  Nissan  White         155144      3
 997  Nissan   Blue          66604      4
 998   Honda  White         215883      4
 999  Toyota   Blue         248360      4
 
 [1000 rows x 4 columns],
 0      15323
 1      19943
 2      28343
 3      13434
 4      14043
        ...  
 995    32042
 996     5716
 997    31570
 998     4001
 999    12732
 Name: Price, Length: 1000, dtype: int64)

In [27]:
# Split into training and test
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [31]:
#Doors are still categorical even though it is a number
car_sales["Doors"].value_counts()

Doors
4    856
5     79
3     65
Name: count, dtype: int64

In [33]:
X.head(5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [47]:
# Turn the categories into numbers with the imported classes
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Define the column names of the features
categorical_features = ["Make","Colour","Doors"]

# Cache the imported class and create a transformer object
one_hot = OneHotEncoder()

# Define a list of tuples specifying the transformer name, transformer object, and columns to be transformed
transformers = [("one_hot", one_hot, categorical_features)]

# Create the ColumnTransformer with the list of transformer specifications
transformer = ColumnTransformer(transformers, remainder="passthrough")

# Pass the dataframe to be transformed after the transformer has been set up
transformed_X = transformer.fit_transform(X)

print(f"Returned a {type(transformed_x)}")

# Showcase new transformed data
transformed_X[:5]


float64


array([[0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 8.47140e+04],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 1.54365e+05],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 1.81577e+05]])

In [None]:
# Transform the returned transformed_x (np.)

In [28]:
# Build machine learning model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
# Feed in the training data so it would learn the patterns & relationships
model.fit(X_train,y_train)
model.score(X_test,y_test)

ValueError: could not convert string to float: 'Toyota'