In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
from pandasql import sqldf

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [23]:
# Read the car dataset from the CSV file located next to the notebook.
df_cars = pd.read_csv(filepath_or_buffer="./vw.csv")
# Preview the first three rows to confirm the file parsed as expected.
df_cars.head(n=3)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,T-Roc,2019,25000,Automatic,13904,Diesel,145,49.6,2.0
1,T-Roc,2019,26883,Automatic,4562,Diesel,145,49.6,2.0
2,T-Roc,2019,20000,Manual,7414,Diesel,145,50.4,2.0


In [24]:
# Define the helper used to run the SQL "queries".
def pysqldf(q):
    """Run the SQL query string ``q`` through ``sqldf``.

    The notebook's global namespace is handed to ``sqldf`` as its
    environment, so callers only need to supply the query text.

    Parameters
    ----------
    q : str
        SQL statement to execute.

    Returns
    -------
    Whatever ``sqldf`` returns for the query.
    """
    return sqldf(q, globals())

In [25]:
# Keep only the three categorical columns: model, transmission and fuel type.
query = "SELECT model, transmission, fuelType FROM df_cars"
dfCars = pysqldf(query)
# Show the first three rows of the reduced frame.
dfCars.head(n=3)

Unnamed: 0,model,transmission,fuelType
0,T-Roc,Automatic,Diesel
1,T-Roc,Automatic,Diesel
2,T-Roc,Manual,Diesel


In [26]:
# Label-encode each categorical column: the column is cast to pandas'
# "category" dtype and its integer codes are stored in "<column>Encode".
for column in ("model", "transmission", "fuelType"):
    dfCars[f"{column}Encode"] = dfCars[column].astype("category").cat.codes

In [27]:
# Spot-check three randomly drawn rows to compare the original values with
# their codes. No random_state is passed, so a different sample is shown on
# every run.
dfCars.sample(3)

Unnamed: 0,model,transmission,fuelType,modelEncode,transmissionEncode,fuelTypeEncode
5462,Golf,Manual,Diesel,12,1,0
5034,Golf,Automatic,Petrol,12,0,3
7522,Polo,Manual,Petrol,16,1,3


In [28]:
# Features: encoded transmission and fuel type. Target: encoded car model.
X = dfCars.loc[:, ["transmissionEncode", "fuelTypeEncode"]]
y = dfCars.loc[:, "modelEncode"]

In [29]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Fit a decision tree on the training portion; only the seed is set explicitly.
dtc_model = DecisionTreeClassifier(random_state=0)
dtc_model.fit(X_train, y_train)

In [30]:
# Summarise the feature frame: column names, non-null counts, dtypes and memory use.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15157 entries, 0 to 15156
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   transmissionEncode  15157 non-null  int8 
 1   fuelTypeEncode      15157 non-null  int8 
dtypes: int8(2)
memory usage: 29.7 KB


In [31]:
# Evaluate the fitted tree on the held-out split (for scikit-learn classifiers
# `score` reports mean accuracy).
dtc_model.score(X_test, y_test)

0.39940633245382584

In [32]:
# Grid search over the maximum depth of a decision tree, fitted on the
# training split.
dtc_model2 = DecisionTreeClassifier(random_state=0)
param = {"max_depth": [2, 4, 6]}
grid_search = GridSearchCV(estimator=dtc_model2, param_grid=param)
grid_search.fit(X_train, y_train)



In [33]:
# Parameter setting selected by the most recently fitted `grid_search`.
grid_search.best_params_

{'max_depth': 4}

In [34]:
# Grid search over the number of neighbours of a k-nearest-neighbours
# classifier, fitted on the training split.
# NOTE: despite its `dtc_` prefix, `dtc_model3` is a KNeighborsClassifier.
# `param` and `grid_search` are rebound here, replacing the earlier objects.
dtc_model3 = KNeighborsClassifier()
param = {"n_neighbors": [5, 10, 15]}
grid_search = GridSearchCV(estimator=dtc_model3, param_grid=param)
grid_search.fit(X_train, y_train)



In [35]:
# Parameter setting selected by the most recently fitted `grid_search`.
grid_search.best_params_

{'n_neighbors': 15}

In [36]:
# Grid search over the kernel of a support vector classifier, fitted on the
# training split.
# NOTE: despite its `dtc_` prefix, `dtc_model4` is an SVC.
# `param` and `grid_search` are rebound here, replacing the earlier objects.
dtc_model4 = SVC()
param = {"kernel": ["linear", "poly", "rbf"]}
grid_search = GridSearchCV(estimator=dtc_model4, param_grid=param)
grid_search.fit(X_train, y_train)



In [37]:
# Parameter setting selected by the most recently fitted `grid_search`.
grid_search.best_params_

{'kernel': 'rbf'}