# Modelos

In [106]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# metricas
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

#train test
from sklearn.model_selection import train_test_split

#Transformaciones
from sklearn.preprocessing import MinMaxScaler

In [90]:
vehiculos_model=pd.read_csv("vehiculos_Featuring.csv")
vehiculos_model.head()

Unnamed: 0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,lat,long
0,131,710.857886,2014.0,357,8,2,8.0,7622,201.976585,8589,1764,2541,6564,1096,2026,4238,32.59,-85.48
1,131,564.920851,2010.0,1371,135,2,8.0,7622,219.70152,8589,1764,2541,6564,1096,711,4238,32.59,-85.48
2,131,781.821235,2020.0,1371,20,2,8.0,7622,128.568722,8589,1764,2541,6564,1096,869,4238,32.59,-85.48
3,131,678.456336,2017.0,696,9,2,8.0,7622,175.673048,8589,1764,2541,6564,1096,869,4238,32.59,-85.48
4,131,445.568654,2013.0,1605,7,0,6.0,7622,278.743866,8589,6686,1120,1262,764,1336,4238,32.592,-85.5189


In [84]:
#vehiculos_model['condition']=vehiculos_model['condition']+1

In [91]:
# Target y Features
X= vehiculos_model.drop('condition', axis=1)
y=vehiculos_model['condition']

In [92]:
#Split para entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=2024, shuffle=True)

In [93]:
# creamos y aplicamos escalado de datos.
scaler=MinMaxScaler()

scaler.fit(X_train)

X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)

# Regresión Logística

#### Sin Hiperparametros

In [40]:
logit=LogisticRegression()
logit.fit(X_train_scaled,y_train)

logit_predicts=logit.predict(X_test_scaled)
acc = accuracy_score(logit_predicts, y_test)
print("Accuracy:",acc)

Accuracy: 0.5549718574108818


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Con hiperparametros

In [187]:
logit=LogisticRegression(C=5.0,
                         solver='lbfgs',
                         multi_class='multinomial',
                         max_iter=100,
                         penalty='l2')
logit.fit(X_train_scaled,y_train)

logit_predicts=logit.predict(X_test_scaled)
acc = accuracy_score(logit_predicts, y_test)
print("Accuracy:",acc)



Accuracy: 0.5651031894934334


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Naive Bayes

#### Sin hiperparametros

In [42]:
nb_classifier=GaussianNB()
nb_classifier.fit(X_train_scaled,  y_train)

nb_predicts=nb_classifier.predict(X_test_scaled)

acc=accuracy_score(nb_predicts, y_test)
print("Accuracy:", acc)


Accuracy: 0.5136960600375234


#### Con hiperparametros

In [204]:
nb_classifier=GaussianNB(var_smoothing=0.1)
nb_classifier.fit(X_train_scaled,  y_train)

nb_predicts=nb_classifier.predict(X_test_scaled)

acc=accuracy_score(nb_predicts, y_test)
print("Accuracy:", acc)

Accuracy: 0.5185741088180112


# RandomForest

#### Sin hiperparametros

In [44]:
rf_classifier=RandomForestClassifier()
rf_classifier.fit(X_train_scaled,  y_train)

rf_predicts=rf_classifier.predict(X_test_scaled)

acc=accuracy_score(rf_predicts, y_test)
print("Accuracy:", acc)


Accuracy: 0.8652908067542214


#### con Hyperparameters

In [45]:
from sklearn.model_selection import GridSearchCV

In [47]:
hyper_parameters={
    "n_estimators":[10,50,100,500,1000],
    "criterion":['gini','entropy'],
    "max_depth":[10,20]
}

rf_classifier=RandomForestClassifier()

rf_hyp_opt=GridSearchCV(estimator=rf_classifier,param_grid=hyper_parameters, cv=10, scoring='accuracy')

rf_hyp_opt.fit(X_train_scaled, y_train)


In [49]:
rf_hyp_opt.best_score_ , rf_hyp_opt.best_params_

(0.8560580642154807,
 {'criterion': 'gini', 'max_depth': 20, 'n_estimators': 500})

### Registro del modelo

In [50]:
rf_v1=RandomForestClassifier(criterion='gini', max_depth=20,n_estimators=500)

rf_v1.fit(X_train_scaled,  y_train)

rf_v1_predicts=rf_v1.predict(X_test_scaled)

acc=accuracy_score(rf_v1_predicts, y_test)
print("Accuracy:", acc)


Accuracy: 0.8641651031894935


# Decision Tree

#### Sin hiperparametros

In [207]:
dt_classifier=DecisionTreeClassifier()
dt_classifier.fit(X_train_scaled,  y_train)

dt_predicts=dt_classifier.predict(X_test_scaled)

acc=accuracy_score(dt_predicts, y_test)
print("Accuracy:", acc)

Accuracy: 0.8168855534709193


#### Con hiperparametros

In [284]:
dt_classifier=DecisionTreeClassifier(criterion='gini',
                                     max_depth=100000,
                                     splitter="best",
                                     max_features=25,
                                     min_samples_split=2)
dt_classifier.fit(X_train_scaled,  y_train)

dt_predicts=dt_classifier.predict(X_test_scaled)

acc=accuracy_score(dt_predicts, y_test)
print("Accuracy:", acc)

Accuracy: 0.8127579737335835


# K-Neareast Neighbors

#### Sin hiperparametros

In [52]:
KN_classifier=KNeighborsClassifier()
KN_classifier.fit(X_train_scaled,  y_train)

KN_predicts=KN_classifier.predict(X_test_scaled)

acc=accuracy_score(KN_predicts, y_test)
print("Accuracy:", acc)

found 0 physical cores < 1
  File "c:\Users\HP\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(


Accuracy: 0.6472795497185742


#### Con hiperparametros

In [301]:
KN_classifier=KNeighborsClassifier(n_neighbors=1,
                                   weights='distance',
                                   algorithm='ball_tree',
                                   leaf_size=20)
KN_classifier.fit(X_train_scaled,  y_train)

KN_predicts=KN_classifier.predict(X_test_scaled)

acc=accuracy_score(KN_predicts, y_test)
print("Accuracy:", acc)

Accuracy: 0.7095684803001876


# SVC

#### Sin Hiperparametros

In [302]:
svc_classifier=SVC()
svc_classifier.fit(X_train_scaled,  y_train)

svc_predicts=svc_classifier.predict(X_test_scaled)

acc=accuracy_score(svc_predicts,y_test)
print("Accuracy:", acc)

Accuracy: 0.651031894934334


#### Con hiperparametros

In [314]:
svc_classifier=SVC(kernel='poly', 
                   C=1.5,
                   degree=4,
                   gamma='scale',
                   tol=0.01)
svc_classifier.fit(X_train_scaled,  y_train)

svc_predicts=svc_classifier.predict(X_test_scaled)

acc=accuracy_score(svc_predicts,y_test)
print("Accuracy:", acc)

Accuracy: 0.7328330206378987


# Gradient Boosting

#### Sin hiperparametros

In [61]:
gd_classifier=GradientBoostingClassifier()
gd_classifier.fit(X_train_scaled,  y_train)

gd_predicts=gd_classifier.predict(X_test_scaled)

acc=accuracy_score(gd_predicts,y_test)
print("Accuracy:", acc)

Accuracy: 0.8401500938086304


#### Con hiperparametros

In [326]:
gd_classifier=GradientBoostingClassifier(n_estimators=1000, 
                                         learning_rate=0.1,
                                         min_samples_split=4)
gd_classifier.fit(X_train_scaled,  y_train)

gd_predicts=gd_classifier.predict(X_test_scaled)

acc=accuracy_score(gd_predicts,y_test)
print("Accuracy:", acc)

Accuracy: 0.8780487804878049


# Ada Boost

#### Sin hiperparametros

In [328]:
ab_classifier=AdaBoostClassifier()
ab_classifier.fit(X_train_scaled,  y_train)

ab_predicts=ab_classifier.predict(X_test_scaled)

acc=accuracy_score(ab_predicts,y_test)
print("Accuracy:", acc)



Accuracy: 0.22063789868667918


#### Con hiperparametros

In [332]:
ab_classifier=AdaBoostClassifier(n_estimators=1000,
                                 learning_rate=1.5,
                                 algorithm="SAMME",
                                 random_state=2024)
ab_classifier.fit(X_train_scaled,  y_train)

ab_predicts=ab_classifier.predict(X_test_scaled)

acc=accuracy_score(ab_predicts,y_test)
print("Accuracy:", acc)

Accuracy: 0.3883677298311445


# eXtreme Gradient Boost (XGBoost)

#### Sin hiperparametros

In [333]:
import xgboost as xgb
from xgboost import DMatrix

xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train_scaled, y_train)
xgb_predicts = xgb_classifier.predict(X_test_scaled)

acc = accuracy_score(y_test, xgb_predicts)
print("Accuracy:", acc)

Accuracy: 0.8712945590994371


#### Con hiperparametros

In [350]:
import xgboost as xgb
from xgboost import DMatrix

params={'objective': 'multi:softmax',
        'booster':'gbtree',
        'silent':1,
    'max_depth': 6,
    'learning_rate': 2.5,
    'n_estimators': 1000,
    'gamma': 0.5,
    }
#xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', n_estimators=10,seed=123)
xgb_classifier = xgb.XGBClassifier(params)
xgb_classifier.fit(X_train_scaled, y_train)
xgb_predicts = xgb_classifier.predict(X_test_scaled)

acc = accuracy_score(y_test, xgb_predicts)
print("Accuracy:", acc)



Accuracy: 0.8712945590994371


# LinearDiscriminantAnalysis (LDA)

#### Sin hiperparametros

In [107]:
lda_classifier=LinearDiscriminantAnalysis()
lda_classifier.fit(X_train_scaled,  y_train)

lda_predicts=lda_classifier.predict(X_test_scaled)

acc=accuracy_score(lda_predicts,y_test)
print("Accuracy:", acc)

Accuracy: 0.5459662288930581


#### Con hiperparametros

In [378]:
lda_classifier=LinearDiscriminantAnalysis(solver='lsqr',
                                          tol=1.0e-1)
lda_classifier.fit(X_train_scaled,  y_train)

lda_predicts=lda_classifier.predict(X_test_scaled)

acc=accuracy_score(lda_predicts,y_test)
print("Accuracy:", acc)

Accuracy: 0.5459662288930581


# Quadratic Discriminant Analysis (QDA)

#### Sin hiperparametros

In [379]:
qda_classifier=QuadraticDiscriminantAnalysis()
qda_classifier.fit(X_train_scaled,  y_train)

qda_predicts=qda_classifier.predict(X_test_scaled)

acc=accuracy_score(qda_predicts,y_test)
print("Accuracy:", acc)

Accuracy: 0.572983114446529


#### Con hiperparametros

In [382]:
qda_classifier=QuadraticDiscriminantAnalysis(tol=0.1)
qda_classifier.fit(X_train_scaled,  y_train)

qda_predicts=qda_classifier.predict(X_test_scaled)

acc=accuracy_score(qda_predicts,y_test)
print("Accuracy:", acc)

Accuracy: 0.572983114446529




# LGBM

In [None]:
#pip install lightgbm

#### Sin Hiperparametros

In [351]:
import lightgbm as lgb


lgbm_classifier = lgb.LGBMClassifier()
lgbm_classifier.fit(X_train_scaled, y_train)
lgbm_predicts = lgbm_classifier.predict(X_test_scaled)

acc = accuracy_score(y_test, lgbm_predicts)
print("Accuracy:", acc)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001004 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1235
[LightGBM] [Info] Number of data points in the train set: 6218, number of used features: 17
[LightGBM] [Info] Start training from score -1.593167
[LightGBM] [Info] Start training from score -2.022247
[LightGBM] [Info] Start training from score -0.871168
[LightGBM] [Info] Start training from score -1.947359
[LightGBM] [Info] Start training from score -5.516328
[LightGBM] [Info] Start training from score -2.310335
Accuracy: 0.876547842401501


#### Con hiperparametros

In [157]:
import lightgbm as lgb


lgbm_classifier = lgb.LGBMClassifier(objective='multiclass', 
                                     boosting_type='gbdt', 
                                     learning_rate=0.1,
                                     num_leaves=50,
                                     max_depth=25,
                                     force_col_wise=True)
lgbm_classifier.fit(X_train_scaled, y_train)
lgbm_predicts = lgbm_classifier.predict(X_test_scaled)

acc = accuracy_score(y_test, lgbm_predicts)
print("Accuracy:", acc)

[LightGBM] [Info] Total Bins 1235
[LightGBM] [Info] Number of data points in the train set: 6218, number of used features: 17
[LightGBM] [Info] Start training from score -1.593167
[LightGBM] [Info] Start training from score -2.022247
[LightGBM] [Info] Start training from score -0.871168
[LightGBM] [Info] Start training from score -1.947359
[LightGBM] [Info] Start training from score -5.516328
[LightGBM] [Info] Start training from score -2.310335
Accuracy: 0.8780487804878049


# Modelo Final

In [384]:
modelo_final=GradientBoostingClassifier(n_estimators=1000, 
                                         learning_rate=0.1,
                                         min_samples_split=4)

X_scaled=scaler.transform(X)
modelo_final.fit(X_scaled,y)
