In [1]:
import pandas as pd

uri_dados = 'https://gist.githubusercontent.com/guilhermesilveira/e99a526b2e7ccc6c3b70f53db43a87d2/raw/1605fc74aa778066bf2e6695e24d53cf65f2f447/machine-learning-carros-simulacao.csv'
dados = pd.read_csv(uri_dados)
dados

Unnamed: 0.1,Unnamed: 0,preco,vendido,idade_do_modelo,km_por_ano
0,0,30941.02,1,18,35085.22134
1,1,40557.96,1,20,12622.05362
2,2,89627.50,0,12,11440.79806
3,3,95276.14,0,3,43167.32682
4,4,117384.68,1,4,12770.11290
...,...,...,...,...,...
9995,9995,97112.86,0,12,25060.64248
9996,9996,107424.63,1,16,21317.31764
9997,9997,93856.99,0,4,20950.38812
9998,9998,51250.57,1,7,16840.13376


In [2]:
# remover coluna inutil
dados = dados.drop(columns=['Unnamed: 0'])
dados

Unnamed: 0,preco,vendido,idade_do_modelo,km_por_ano
0,30941.02,1,18,35085.22134
1,40557.96,1,20,12622.05362
2,89627.50,0,12,11440.79806
3,95276.14,0,3,43167.32682
4,117384.68,1,4,12770.11290
...,...,...,...,...
9995,97112.86,0,12,25060.64248
9996,107424.63,1,16,21317.31764
9997,93856.99,0,4,20950.38812
9998,51250.57,1,7,16840.13376


In [3]:
# Realizar separação em dados de treino e teste
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

# features na variável x e target na variavel y
x = dados[['preco', 'idade_do_modelo', 'km_por_ano']]
y = dados['vendido']

# set SEED
np.random.seed(158020)

# split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)
print(f"Treinaremos com {len(x_train)} elementos e testaremos com {len(x_test)} elementos.")

Treinaremos com 7500 elementos e testaremos com 2500 elementos.


In [4]:
# Visualizar valores de predição baseline com o DummyClassifier
# By default DummyClassifier is stratified
from sklearn.dummy import DummyClassifier

# instantiate the class
dummy = DummyClassifier()

# fit the data
dummy.fit(x_train, y_train)

# calculate the accuracy
accuracy = dummy.score(x_test, y_test)*100
print(f"Dummy classifier accuracy: {accuracy:.2f}%")

Dummy classifier accuracy: 58.00%


Fit a DecisionTreeClassifier model

In [5]:
from sklearn.tree import DecisionTreeClassifier

# set the random seed
np.random.seed(158020)

# instantiate the model
tree_model = DecisionTreeClassifier(max_depth=2)

# fit the model
tree_model.fit(x_train, y_train)

# make predictions
y_predictions = tree_model.predict(x_test)

# calculate accuracy
accuracy = accuracy_score(y_test, y_predictions)*100
print(f"The accuracy was: {accuracy:.2f}%")

The accuracy was: 71.92%


The random seed can influence the final accuracy.

Let's run the DecisionTreeClassifier again with a different seed, to see if the accuracy changes.

In [50]:
# features na variável x e target na variavel y
x = dados[['preco', 'idade_do_modelo', 'km_por_ano']]
y = dados['vendido']
# set SEED
np.random.seed(5)
# split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)
print(f"Treinaremos com {len(x_train)} elementos e testaremos com {len(x_test)} elementos.")

# instantiate the model
tree_model = DecisionTreeClassifier(max_depth=2)
# fit the model
tree_model.fit(x_train, y_train)
# make predictions
y_predictions = tree_model.predict(x_test)

# calculate accuracy
accuracy = accuracy_score(y_test, y_predictions)*100
print(f"The accuracy was: {accuracy:.2f}%")

Treinaremos com 7500 elementos e testaremos com 2500 elementos.
The accuracy was: 76.84%


As we can see, with a different seed the accuracy was also different.

To get around this, we can calculate an interval of accuracy.

In [51]:
x, y

(          preco  idade_do_modelo   km_por_ano
 0      30941.02               18  35085.22134
 1      40557.96               20  12622.05362
 2      89627.50               12  11440.79806
 3      95276.14                3  43167.32682
 4     117384.68                4  12770.11290
 ...         ...              ...          ...
 9995   97112.86               12  25060.64248
 9996  107424.63               16  21317.31764
 9997   93856.99                4  20950.38812
 9998   51250.57                7  16840.13376
 9999   87945.73               19  19894.66108
 
 [10000 rows x 3 columns],
 0       1
 1       1
 2       0
 3       0
 4       1
        ..
 9995    0
 9996    1
 9997    0
 9998    1
 9999    0
 Name: vendido, Length: 10000, dtype: int64)

# Cross validation with K-fold

In [52]:
# Let's use the cross-validation
from sklearn.model_selection import cross_validate

# set seed
np.random.seed(42)

# instantiate the model
tree_model = DecisionTreeClassifier(max_depth=2)

# make cross-validation
# cv: how many divisions to cross-validate
cv_results = cross_validate(tree_model, x, y, cv=3)
# scores for each of the tests
print(cv_results['test_score'])
# the separation of the dataset has occured 3 times
# mean of the test scores
mean = cv_results['test_score'].mean()
print(mean)

# 95% of density of occurences will be between the mean +/- 2 times the standard deviations
standard_deviation = cv_results['test_score'].std()
print(standard_deviation)
print(f"Accuracy with cross-validation of 3 parts: [{(mean-2*standard_deviation)*100:.2f}, {mean*100:.2f}, {(mean+2*standard_deviation)*100:.2f}]")

[0.75704859 0.7629763  0.75337534]
0.7578000751484867
0.003955431356145979
Accuracy with cross-validation of 3 parts: [74.99, 75.78, 76.57]


In [53]:
# now let's try a cv of 10
np.random.seed(42)
tree_model = DecisionTreeClassifier(max_depth=2)
cv_results = cross_validate(tree_model, x, y, cv=10)
mean = cv_results['test_score'].mean()
standard_deviation = cv_results['test_score'].std()
print(f"Accuracy with cross-validation of 10 parts: [{(mean-2*standard_deviation)*100:.2f}, {mean*100:.2f}, {(mean+2*standard_deviation)*100:.2f}]")
# with a different cv value the confidence interval changed
# by convention a cv value between 5 and 10 shoul be a good choice

Accuracy with cross-validation of 10 parts: [74.24, 75.78, 77.32]


In [54]:
# Let's make a function to print the mean and the confidence interval
def print_results(results):
    mean = results['test_score'].mean()
    standard_deviation = results['test_score'].std()
    print(f"Accuracy mean and +- 2 std: [{(mean-2*standard_deviation)*100:.2f}, {mean*100:.2f}, {(mean+2*standard_deviation)*100:.2f}]")

Kfold with aleatorization

In [55]:
# cross validation doen't shuffle the data
# but the kFold object gives the possibility to shuffle
from sklearn.model_selection import KFold

cv = KFold(n_splits=10)
np.random.seed(42)
tree_model = DecisionTreeClassifier(max_depth=2)
cv_results = cross_validate(tree_model, x, y, cv=cv)
print_results(cv_results)
# this execution is without shuffle

Accuracy mean and +- 2 std: [74.37, 75.78, 77.19]


In [56]:
# this execution is with suffle
# it is good to use shuffle when the data doesn't have a sequencial structure or dependency, like time-series
cv = KFold(n_splits=10, shuffle=True)
np.random.seed(42)
tree_model = DecisionTreeClassifier(max_depth=2)
cv_results = cross_validate(tree_model, x, y, cv=cv)
print_results(cv_results)
# we can see that the confidence interval has expanded

Accuracy mean and +- 2 std: [72.76, 75.76, 78.76]


# Stratification with cross-validation

In [57]:
# we don't have a stratify parameter in KFold
# without stratification we can have bad luck situations for the data separation
# that is an inbalance in the classes propostion
# let's simulate an inbalance cenario
bad_luck_data = dados.sort_values('vendido', ascending=True)
x_bl  = bad_luck_data[['preco', 'idade_do_modelo', 'km_por_ano']]  # lots of zeros here
y_bl = bad_luck_data['vendido']  # lots of ones here

In [58]:
# let's run without shuffle
from sklearn.model_selection import KFold

np.random.seed(301)
cv = KFold(n_splits=10)
tree_model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(tree_model, x_bl, y_bl, cv=cv)
print_results(results)
# it results in a pretty bad mean an confidence interval

Accuracy mean and +- 2 std: [34.29, 57.84, 81.39]


In [59]:
# now with shuffle
np.random.seed(301)
cv = KFold(n_splits=10, shuffle=True)
tree_model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(tree_model, x_bl, y_bl, cv=cv)
print_results(results)
# with shuffle over the data we have, again, a good result

Accuracy mean and +- 2 std: [72.30, 75.78, 79.26]


In [60]:
# now an even better way, a kFold with stratification
# this way the folds are made by preserving the percentage of samples for each class
from sklearn.model_selection import StratifiedKFold
np.random.seed(301)
cv = StratifiedKFold(n_splits=10, shuffle=True)
tree_model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(tree_model, x_bl, y_bl, cv=cv)
print_results(results)
# we had a slightly better confidence interval

Accuracy mean and +- 2 std: [73.55, 75.78, 78.01]


In [61]:
# let's create a new column of data for the vehicle model, based on the actual data of the dataset
# this new column will be based on the age of the vehicle
print(dados.idade_do_modelo) # column that we will use
print(f"lenght of data: {len(dados)}")
np.random.seed(301)
# generate a random value for each row of the dataset
dados['model'] = dados.idade_do_modelo + np.random.randint(-2, 3, len(dados))

0       18
1       20
2       12
3        3
4        4
        ..
9995    12
9996    16
9997     4
9998     7
9999    19
Name: idade_do_modelo, Length: 10000, dtype: int64
lenght of data: 10000


In [62]:
dados

Unnamed: 0,preco,vendido,idade_do_modelo,km_por_ano,model
0,30941.02,1,18,35085.22134,16
1,40557.96,1,20,12622.05362,22
2,89627.50,0,12,11440.79806,12
3,95276.14,0,3,43167.32682,4
4,117384.68,1,4,12770.11290,3
...,...,...,...,...,...
9995,97112.86,0,12,25060.64248,12
9996,107424.63,1,16,21317.31764,15
9997,93856.99,0,4,20950.38812,4
9998,51250.57,1,7,16840.13376,9


In [63]:
# let's see how many unique models do we have
dados.model.unique()

array([16, 22, 12,  4,  3, 11, 18, 17, 13,  0, 15, 10,  9, 14,  1,  5, 19,
       21,  8,  7, 20,  6,  2, -1], dtype=int64)

In [64]:
abs(dados.model.min())

1

In [65]:
# now let's eliminate negative and zero values on the entire column
dados.model = dados.model + abs(dados.model.min()) + 1
dados.model

0       18
1       24
2       14
3        6
4        5
        ..
9995    14
9996    17
9997     6
9998    11
9999    23
Name: model, Length: 10000, dtype: int64

In [66]:
dados.model.min()
# now our minimum value for the model is one

1

In [67]:
dados

Unnamed: 0,preco,vendido,idade_do_modelo,km_por_ano,model
0,30941.02,1,18,35085.22134,18
1,40557.96,1,20,12622.05362,24
2,89627.50,0,12,11440.79806,14
3,95276.14,0,3,43167.32682,6
4,117384.68,1,4,12770.11290,5
...,...,...,...,...,...
9995,97112.86,0,12,25060.64248,14
9996,107424.63,1,16,21317.31764,17
9997,93856.99,0,4,20950.38812,6
9998,51250.57,1,7,16840.13376,11


In [68]:
dados.model.value_counts()

model
20    901
19    798
18    771
21    723
17    709
16    668
14    621
22    575
15    573
13    557
12    511
11    401
10    371
23    370
9     336
8     278
7     206
24    199
6     181
5     108
4      76
3      44
2      17
1       6
Name: count, dtype: int64

In [69]:
# if we try to cross-validade our tree model now, the results won't be considering the event of new car models
# that is, all the models will be used in the training as well as in the testing
np.random.seed(301)
cv = StratifiedKFold(n_splits=10, shuffle=True)
tree_model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(tree_model, x_bl, y_bl, cv=cv)
print_results(results)
# so it wolud be good to separate some model to use only in the test, to see how the model will perform when it sees new models

Accuracy mean and +- 2 std: [73.55, 75.78, 78.01]


In [71]:
# let's use the group k-fold to reserve some car models for the test
from sklearn.model_selection import GroupKFold

np.random.seed(301)
# here we use GroupKFold rather then StratifiedKFold
cv = GroupKFold(n_splits=10)
tree_model = DecisionTreeClassifier(max_depth=2)
# and here in the crossvalidate we indicate witch column contains wour groups
results = cross_validate(tree_model, x_bl, y_bl, cv=cv, groups=dados.model)
print_results(results)

Accuracy mean and +- 2 std: [73.67, 75.78, 77.90]


# Pipelines

## Cross validation with standard scaler

In [77]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

np.random.seed(301)

scaler = StandardScaler()
scaler.fit(x_train)
# scalling the features
x_train_scl = scaler.transform(x_train)
x_test_scl = scaler.transform(x_test)

svc_model = SVC()
svc_model.fit(x_train_scl, y_train)
predictions = svc_model.predict(x_test_scl)

accuracy = accuracy_score(y_test, predictions) * 100
print(f"The accuracy was: {accuracy}")

The accuracy was: 77.48


In [78]:
from sklearn.model_selection import GroupKFold

np.random.seed(301)

cv = GroupKFold(n_splits=10)
svc_model = SVC()
results = cross_validate(svc_model, x_bl, y_bl, cv=cv, groups=dados.model)
print_results(results)
# here we have a result without scaling

Accuracy mean and +- 2 std: [74.35, 77.27, 80.20]


In [80]:
# let's scaler X bad luck
scaler = StandardScaler()
scaler.fit(x_bl)
x_bl_scl = scaler.transform(x_bl)
# here we scalled evething, but we shold scale only the x_train

In [81]:
# now, running SVC with scalling
np.random.seed(301)

cv = GroupKFold(n_splits=10)
svc_model = SVC()  # Support Vector Classification
results = cross_validate(svc_model, x_bl_scl, y_bl, cv=cv, groups=dados.model)
# to improve: for each fold we should do a scalling
print_results(results)

Accuracy mean and +- 2 std: [74.30, 76.70, 79.10]


In [84]:
# let's create a pipeline with a scalling step after each fold
from sklearn.pipeline import Pipeline

np.random.seed(301)

scaler = StandardScaler()
svc_model = SVC()  # Suport Vector Classification

pipeline = Pipeline([('transformation', scaler), ('estimator', svc_model)])
pipeline
# the pipeline works as if it was an estimator, it has a fit a predict methods

In [85]:
cv = GroupKFold(n_splits=10)
# we do the cross-validation process with the entire pipeline as if it was a common model
results = cross_validate(pipeline, x_bl, y_bl, cv=cv, groups=dados.model)
# for each fold in the cross-validation we do the steps of the pipeline, the data transformation, and after, the estimation
print_results(results)

Accuracy mean and +- 2 std: [74.28, 76.68, 79.08]


In [86]:
# final model fitting
svc_model.fit(x, y)

In [87]:
dados

Unnamed: 0,preco,vendido,idade_do_modelo,km_por_ano,model
0,30941.02,1,18,35085.22134,18
1,40557.96,1,20,12622.05362,24
2,89627.50,0,12,11440.79806,14
3,95276.14,0,3,43167.32682,6
4,117384.68,1,4,12770.11290,5
...,...,...,...,...,...
9995,97112.86,0,12,25060.64248,14
9996,107424.63,1,16,21317.31764,17
9997,93856.99,0,4,20950.38812,6
9998,51250.57,1,7,16840.13376,11


In [94]:
# making prediction
data = [[30000.00, 19, 35000.00]]
columns = ['preco', 'idade_do_modelo', 'km_por_ano']
predict_test = pd.DataFrame(data, columns=columns)
# I expect that to predict as a car that will sell
svc_model.predict(predict_test)

array([1], dtype=int64)

In [102]:
data = [[60000.00, 40, 40000.00]]
columns = ['preco', 'idade_do_modelo', 'km_por_ano']
predict_test = pd.DataFrame(data, columns=columns)
# now I expect that it own't sell
svc_model.predict(predict_test)
# I don't liked the results very much, I had to put really big values on price, age and km per year for the model to predict as a not selling car

array([0], dtype=int64)