In [1]:
# General
import pandas as pd
import numpy as np
import warnings
import itertools
import pickle as pkl

from numcat import *
from outliers import *
from encoding import *

# Escalador
from sklearn.preprocessing import MinMaxScaler

# Clustering
from sklearn.cluster import KMeans

# Utilidades
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Modelo de regresión
from sklearn.ensemble import RandomForestRegressor

# Modelo de clasificación
from sklearn.neighbors import KNeighborsClassifier

# Métricas
from sklearn.metrics import r2_score, confusion_matrix, accuracy_score


# Opciones globales
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

In [2]:
# Load the raw listings; the relative path means vehicles.csv must sit in the working directory.
df = pd.read_csv('vehicles.csv')

In [3]:
# Vehicle age at posting time: the year taken from the first four characters of
# posting_date, minus the model year.
df['car_age'] = df['posting_date'].str.slice(stop = 4).astype(float) - df['year']
# Keep only rows with a non-negative age (rows with a missing age fail the test too)
# and discard the raw posting date.
df = df.loc[df['car_age'] >= 0].drop(columns = 'posting_date')

In [4]:
# Add a column that classifies each manufacturer into a market segment
# (Mainstream / Premium / Luxury).
# NOTE(review): 'rover' is tagged Mainstream while 'land rover' is tagged Premium —
# confirm that this difference is intended.
brand_segmentation = {
    'gmc': 'Mainstream',
    'chevrolet': 'Mainstream',
    'toyota': 'Mainstream',
    'ford': 'Mainstream',
    'jeep': 'Mainstream',
    'nissan': 'Mainstream',
    'ram': 'Mainstream',
    'mazda': 'Mainstream',
    'cadillac': 'Premium',
    'honda': 'Mainstream',
    'dodge': 'Mainstream',
    'lexus': 'Premium',
    'jaguar': 'Premium',
    'buick': 'Mainstream',
    'chrysler': 'Mainstream',
    'volvo': 'Premium',
    'audi': 'Premium',
    'infiniti': 'Premium',
    'lincoln': 'Premium',
    'alfa-romeo': 'Premium',
    'subaru': 'Mainstream',
    'acura': 'Premium',
    'hyundai': 'Mainstream',
    'mercedes-benz': 'Premium',
    'bmw': 'Premium',
    'mitsubishi': 'Mainstream',
    'volkswagen': 'Mainstream',
    'porsche': 'Premium',
    'kia': 'Mainstream',
    'rover': 'Mainstream',
    'ferrari': 'Luxury',
    'mini': 'Mainstream',
    'pontiac': 'Mainstream',
    'fiat': 'Mainstream',
    'tesla': 'Premium',
    'saturn': 'Mainstream',
    'mercury': 'Mainstream',
    'harley-davidson': 'Mainstream',
    'aston-martin': 'Luxury',
    'land rover': 'Premium',
    'morgan': 'Luxury'
}

# Manufacturers that are absent from the mapping (or missing altogether) become NaN here.
df['brand_segmentation'] = df['manufacturer'].map(brand_segmentation)

In [5]:
# Discard identifier, URL, free-text and other columns that are not used as features.
df = df.drop(
    columns = ['id', 'url', 'region_url', 'model', 'VIN', 'size', 'image_url', 'description', 'county']
)

In [6]:
# Split the frame into two parts with a star-imported local helper
# (presumably numeric vs. categorical columns — confirm in the helper's module).
df_num, df_cat = splitNumCat(df)

In [7]:
# Filter df_num with the threshold returned by getBestOutliersThreshold(df_num, 0.05).
# NOTE(review): both helpers are star-imported; the meaning of 0.05 is not visible here — confirm.
df_num = dropOutliers(df_num, getBestOutliersThreshold(df_num, 0.05))

In [8]:
# Recombine the two parts after outlier handling
# (presumably aligned on the index so removed rows disappear from both — confirm in mergeNumCat).
df_ = mergeNumCat(df_num, df_cat)

In [9]:
# Split the merged frame again so that both parts are derived from the same df_.
df_num, df_cat = splitNumCat(df_)

In [10]:
# Encode df_cat with the star-imported encode helper; the price column is passed along
# (presumably for a target-based encoding, with 'best' selecting the strategy — confirm).
# `encodings` is pickled at the end of the notebook.
df_cat, encodings = encode(df_cat, df_num['price'],'best', {})

In [11]:
# Recombine the numeric part with the encoded categorical part.
df_ = mergeNumCat(df_num, df_cat)

In [12]:
# Discard incomplete rows, then keep only listings whose price is above 1.
df_1 = df_.dropna().loc[lambda frame: frame['price'] > 1]

In [13]:
df_1

Unnamed: 0,region,manufacturer,fuel,title_status,transmission,type,state,brand_segmentation,paint_color,price,year,odometer,lat,long,car_age
0,140,16331.0,339610.0,386565.0,61705.0,41694.0,4727,323539.0,20973.077727,33590,2014.0,57923.0,32.590000,-85.480000,7.0
1,140,51845.0,339610.0,386565.0,61705.0,41694.0,4727,323539.0,16086.209312,22590,2010.0,71229.0,32.590000,-85.480000,11.0
2,140,51845.0,339610.0,386565.0,61705.0,41694.0,4727,323539.0,18205.241879,39590,2020.0,19160.0,32.590000,-85.480000,1.0
3,140,33367.0,339610.0,386565.0,61705.0,41694.0,4727,323539.0,18205.241879,30990,2017.0,41124.0,32.590000,-85.480000,4.0
4,140,67325.0,339610.0,386565.0,321732.0,33829.0,4727,323539.0,20432.461186,15000,2013.0,128000.0,32.592000,-85.518900,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406689,585,10447.0,339610.0,386565.0,321732.0,20281.0,585,68952.0,20973.077727,30990,2018.0,15080.0,33.779214,-84.411811,3.0
406690,585,7820.0,339610.0,386565.0,321732.0,83668.0,585,68952.0,20973.077727,33590,2018.0,30814.0,33.779214,-84.411811,3.0
406692,585,3204.0,339610.0,386565.0,61705.0,83668.0,585,68952.0,18205.241879,30590,2020.0,12029.0,33.786500,-84.445400,1.0
406693,585,6705.0,28286.0,386565.0,61705.0,16430.0,585,68952.0,20973.077727,34990,2020.0,4174.0,33.779214,-84.411811,1.0


In [14]:
# Baseline: one random forest on every feature, scored on a 20 % hold-out split.
y = df_1[['price']]
X = df_1.drop(columns = 'price')

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = .2,
    random_state = 43,
)

model = RandomForestRegressor(n_jobs = -1)
model.fit(X_train, y_train)

yhat = model.predict(X_test)
r2 = r2_score(y_test, yhat)

In [15]:
r2

0.9035645662824112

### Clustering

In [16]:
# Min-max scale every feature (price excluded) before running KMeans.
kmeans_scaler = MinMaxScaler()
X = kmeans_scaler.fit_transform(df_1.drop(columns = ['price']))

In [17]:
# Choose k with a normalised "closest point to the origin" heuristic on the inertia
# curve (the original note refers to it as "Dmitry's theorem").

def get_best_k(X, n_k, max_inertia, model = None):
    """Rank candidate cluster counts by how close (k, inertia) lies to the origin.

    Each candidate k is fitted on ``X`` and its inertia is recorded. Both k and
    the inertia are divided by their maximum over the candidates, and every
    candidate is scored by the Euclidean norm of that normalised pair: a small
    norm means few clusters and a low inertia at the same time.

    Parameters
    ----------
    X : array-like
        Data handed unchanged to ``model.fit``.
    n_k : iterable of int
        Candidate cluster counts. They no longer have to be ``1..N``: the real
        values are used both for the normalisation and for the ``K`` column.
    max_inertia : any
        Unused; kept so that existing calls keep working.
    model : estimator, optional
        Object exposing ``n_clusters``, ``fit`` and ``inertia_``. When omitted a
        fresh ``KMeans()`` is created on every call, so calls never share (and
        mutate) one default estimator created at definition time.

    Returns
    -------
    pandas.DataFrame
        Columns ``K`` and ``Distance``, sorted by ascending ``Distance``.

    Raises
    ------
    ValueError
        If ``n_k`` contains no candidates.
    """
    if model is None:
        model = KMeans()

    ks = list(n_k)
    if not ks:
        raise ValueError('n_k must contain at least one candidate k')

    inertias = []
    for k in ks:
        model.n_clusters = k
        model.fit(X)
        inertias.append(model.inertia_)

    k_i = np.array(list(zip(ks, inertias)), dtype = float)

    k_i_max = np.max(k_i, axis = 0)
    # A column whose maximum is 0 (e.g. every inertia is 0) would divide by zero.
    k_i_max[k_i_max == 0] = 1.0
    k_i_norm = k_i / k_i_max

    # Distance of every normalised (k, inertia) point to the origin.
    distances = np.linalg.norm(k_i_norm, axis = 1)

    return pd.DataFrame({'K': ks, 'Distance': distances}).sort_values('Distance')

# The frame comes back sorted by distance, so the first row holds the preferred k.
best_k = get_best_k(X, range(1, 11), 10)['K'].iloc[0]

In [18]:
# Fit KMeans with the selected number of clusters

kmeans = KMeans(n_clusters = best_k)
kmeans.fit(X)
labels = kmeans.labels_


# Attach the cluster labels to a copy of the data frame.
# A plain `df_cluster = df_1` only creates a second name for the same object, so the
# new 'cluster' column would also appear in df_1 (and be written onto a slice of df_).

df_cluster = df_1.copy()
df_cluster['cluster'] = labels

In [19]:
# Features and target for a classifier that predicts the KMeans cluster

y = df_cluster[['cluster']]
X = df_cluster.drop(columns = ['price', 'cluster'])

# 80 / 20 train / test split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = .2,
    random_state = 43,
)

In [20]:
# Fit a 5-nearest-neighbours classifier on the cluster labels and score it on the test split

knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)

yhat = knn.predict(X_test)
acc = accuracy_score(y_test, yhat)
conf_matrix = confusion_matrix(y_test, yhat)

print(conf_matrix)
print(acc)

[[ 7448     0     0     0]
 [    0 17800     0     7]
 [    0     0  8429     0]
 [    0     1     0 12608]]
0.999827187695764


In [23]:
def trainClusters(df_cluster, scaler, model, best_features = None, params = {}):
    """Grid-search one regressor per cluster and collect the cross-validated R2 scores.

    Parameters
    ----------
    df_cluster : pandas.DataFrame
        Needs a 'cluster' column (group label) and a 'price' column (target);
        every remaining column is used as a feature.
    scaler : object
        Not used; kept so existing calls keep working. Earlier code ran
        ``scaler.inverse_transform`` on the features, but the frame this
        notebook passes in holds values that were never scaled, so that step
        distorted them instead of restoring them.
    model : estimator
        Handed to ``GridSearchCV`` as ``estimator``.
    best_features : optional
        Indexed by cluster label; each entry is an iterable whose items carry a
        column name at position 0. When given, only those columns are used.
    params : dict
        ``param_grid`` for ``GridSearchCV``. It is never modified here; the
        default ``{}`` evaluates the estimator's current settings only.

    Returns
    -------
    tuple
        ``(scores, models)``: ``scores`` is a list of ``[n_rows, best_score_]``
        and ``models`` the fitted ``GridSearchCV`` objects, both in the order of
        ``df_cluster['cluster'].unique()``.
    """
    scores = []
    models = []
    for label in df_cluster['cluster'].unique():
        data = df_cluster[df_cluster['cluster'] == label].drop('cluster', axis = 1)

        X = data.drop('price', axis = 1)

        # `is not None` avoids an element-wise comparison when an array-like is passed.
        if best_features is not None:
            X = X[[entry[0] for entry in best_features[label]]]

        y = data['price']

        grid_solver = GridSearchCV(estimator = model,
                                   param_grid = params,
                                   scoring = 'r2',
                                   cv = 5,
                                   refit = 'r2',
                                   n_jobs = -1,
                                  )
        model_result = grid_solver.fit(X, y)

        scores.append([data.shape[0], grid_solver.best_score_])
        models.append(model_result)
    return scores, models

In [24]:
%%time
# One RandomForestRegressor per KMeans cluster; params keeps its default {}, so only the
# estimator's default hyper-parameters are cross-validated.
clusters = trainClusters(df_cluster, kmeans_scaler, RandomForestRegressor())

CPU times: user 1min 23s, sys: 1.31 s, total: 1min 25s
Wall time: 2min 38s


In [25]:
# R2 with all columns: size-weighted mean of the per-cluster scores, multiplied by the
# accuracy of the classifier that assigns rows to clusters.
sum([x[1]*x[0]/df_cluster.shape[0] for x in clusters[0]])*acc

0.7767213595186595

### brand_segmentation clustering

In [26]:
# Build the brand-segmentation frame as a new object instead of aliasing df_1.
# `df_brand = df_1` followed by an in-place drop removed 'brand_segmentation' from df_1
# itself and made this cell fail when run a second time.
df_brand = (
    df_1
    .drop(columns = 'cluster', errors = 'ignore')   # discard a 'cluster' column if df_1 carries one
    .rename(columns = {'brand_segmentation': 'cluster'})
)

X = df_brand.drop(['price', 'cluster'], axis = 1)
y = df_brand[['cluster']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 43)

In [27]:
from sklearn.ensemble import RandomForestClassifier
# Fit a random-forest classifier that predicts the brand segment and score it on the test split

rfc = RandomForestClassifier(n_jobs = -1)
rfc.fit(X_train, y_train)

yhat = rfc.predict(X_test)
acc = accuracy_score(y_test, yhat)
conf_matrix = confusion_matrix(y_test, yhat)

print(conf_matrix)
print(acc)

[[    2     1     4]
 [    0  8656   136]
 [    0    37 37457]]
0.9961549262307476


In [28]:
def trainClusters(df_cluster, scaler, model, best_features = None, params = {}):
    """Grid-search one regressor per cluster and collect the cross-validated R2 scores.

    NOTE(review): a function with this name is also defined earlier in the
    notebook; from this cell on, this definition is the one in effect.

    Parameters
    ----------
    df_cluster : pandas.DataFrame
        Needs a 'cluster' column (group label) and a 'price' column (target);
        every remaining column is used as a feature.
    scaler : object
        Not used by this function; kept so existing calls keep working.
    model : estimator
        Handed to ``GridSearchCV`` as ``estimator``.
    best_features : optional
        Indexed by cluster label; each entry is an iterable whose items carry a
        column name at position 0. When given, only those columns are used.
    params : dict
        ``param_grid`` for ``GridSearchCV``. It is never modified here; the
        default ``{}`` evaluates the estimator's current settings only.

    Returns
    -------
    tuple
        ``(scores, models)``: ``scores`` is a list of ``[n_rows, best_score_]``
        and ``models`` the fitted ``GridSearchCV`` objects, both in the order of
        ``df_cluster['cluster'].unique()``.
    """
    scores = []
    models = []
    for label in df_cluster['cluster'].unique():
        data = df_cluster[df_cluster['cluster'] == label].drop('cluster', axis = 1)

        X = data.drop('price', axis = 1)

        # `is not None` avoids an element-wise comparison when an array-like is passed;
        # X is already a DataFrame, so the columns can be selected directly.
        if best_features is not None:
            X = X[[entry[0] for entry in best_features[label]]]

        y = data['price']

        grid_solver = GridSearchCV(estimator = model,
                                   param_grid = params,
                                   scoring = 'r2',
                                   cv = 5,
                                   refit = 'r2',
                                   n_jobs = -1,
                                  )
        model_result = grid_solver.fit(X, y)

        scores.append([data.shape[0], grid_solver.best_score_])
        models.append(model_result)
    return scores, models

In [29]:
%%time
# One RandomForestRegressor per brand segment. kmeans_scaler is passed only to fill the
# signature: the trainClusters defined in the cell above never reads its scaler argument.
clusters = trainClusters(df_brand, kmeans_scaler, RandomForestRegressor())

CPU times: user 1min 37s, sys: 195 ms, total: 1min 37s
Wall time: 3min 3s


In [30]:
# Size-weighted mean of the per-segment R2 scores, multiplied by the accuracy of the
# segment classifier. The weights are normalised by df_brand, the frame these models
# were trained on, rather than by the KMeans frame of the previous section.
sum([x[1]*x[0]/df_brand.shape[0] for x in clusters[0]])*acc

0.824919709815228

### Approach selection
Nos quedamos con el modelo original puesto que es más eficiente y hacer clustering no mejora la métrica final

In [31]:
# Feature matrix and target for the single-model approach
y = df_1[['price']]
X = df_1.drop(columns = ['price'])

In [33]:
%%time

# An empty grid means a single candidate: the estimator's default hyper-parameters,
# evaluated with 5-fold cross-validation.
params = {}

grid_solver = GridSearchCV(
    estimator = RandomForestRegressor(),
    param_grid = params,
    scoring = 'r2',
    refit = 'r2',
    cv = 5,
    verbose = 3,
    n_jobs = -1,
)
model_result = grid_solver.fit(X, y['price'])

# Mean cross-validated R2 of the best (here: only) candidate
r2 = grid_solver.best_score_

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 2/5] END ..................................., score=0.806 total time= 1.5min
[CV 4/5] END ..................................., score=0.864 total time= 1.5min
[CV 3/5] END ..................................., score=0.846 total time= 1.5min
[CV 1/5] END ..................................., score=0.784 total time= 1.5min
[CV 5/5] END ..................................., score=0.859 total time= 1.5min
CPU times: user 1min 39s, sys: 320 ms, total: 1min 40s
Wall time: 3min 12s


In [34]:
r2

0.8319262196369202

### Reducción de dimensionalidad

In [35]:
# Fit one forest on all rows with the grid search's best parameters; its
# feature_importances_ are read in the next cell.
model = RandomForestRegressor(**model_result.best_params_)
model.fit(X, y)

In [36]:
# Pair every feature name with its importance. Mixing names and numbers in one NumPy
# array stores both as strings, which is why consumers convert the value with float().
feature_importances = np.array(
    [[name, weight] for name, weight in zip(X.columns, model.feature_importances_)]
)

In [37]:
def getMostImportantColumns(feature_importances, keep):
    """Select the top features whose cumulative importance stays within ``keep``.

    Features are visited from most to least important. A feature is accepted
    while the running total including it does not exceed ``keep``; the first
    feature that would exceed it ends the selection.

    Parameters
    ----------
    feature_importances : sequence of (name, importance) pairs
        ``importance`` may be a number or a numeric string.
    keep : float
        Upper bound for the summed importance of the selected features.

    Returns
    -------
    list
        Selected feature names, in the order they appear in ``feature_importances``.
    """
    # Most important first
    ranked = sorted(feature_importances, key=lambda pair: float(pair[1]), reverse=True)

    running_total = 0
    chosen = []
    for name, weight in ranked:
        candidate_total = running_total + float(weight)
        # Stop at the first feature that does not fit under the threshold
        if not (candidate_total <= keep):
            break
        running_total = candidate_total
        chosen.append(name)

    # Report the survivors in their original order
    return [row[0] for row in feature_importances if row[0] in chosen]

In [38]:
# Keep only the most important columns (cumulative importance of at most 0.9)
y = df_1[['price']]
X = df_1.loc[:, getMostImportantColumns(feature_importances, 0.9)]

In [39]:
getMostImportantColumns(feature_importances, 0.9)

['manufacturer', 'fuel', 'type', 'year', 'odometer', 'long', 'car_age']

In [40]:
%%time

# Same cross-validated evaluation as before, now on the reduced feature set.
params = {}

grid_solver = GridSearchCV(
    estimator = RandomForestRegressor(),
    param_grid = params,
    scoring = 'r2',
    refit = 'r2',
    cv = 5,
    verbose = 3,
    n_jobs = -1,
)
model_result = grid_solver.fit(X, y['price'])

# Mean cross-validated R2 of the best (here: only) candidate
r2 = grid_solver.best_score_

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 2/5] END ..................................., score=0.814 total time=  50.1s
[CV 3/5] END ..................................., score=0.840 total time=  50.1s
[CV 1/5] END ..................................., score=0.753 total time=  51.1s
[CV 4/5] END ..................................., score=0.857 total time=  52.0s
[CV 5/5] END ..................................., score=0.847 total time=  54.4s
CPU times: user 52.8 s, sys: 1.18 s, total: 54 s
Wall time: 1min 49s


In [41]:
r2

0.8223594801908496

### Guardado de modelo

In [42]:
# Save the regressor that GridSearchCV refitted on the full reduced feature set.
# The previous line built a new, unfitted RandomForestClassifier, so the pickle held an
# untrained model of the wrong kind for a price regression.
model = model_result.best_estimator_

In [43]:
# Persist the model and the encodings returned by encode() next to the notebook.
# 'bw' is binary write mode (equivalent to the more common 'wb' spelling).
with open('model.pkl', 'bw') as file:
    pkl.dump(model, file)
with open('encodings.pkl', 'bw') as file:
    pkl.dump(encodings, file)

In [None]:
############################################################################################################################################################################