# En este apartado vamos directamente a realizar las acciones enfocadas a Machine Learning

### Importamos librerías

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Cargamos CSV y desechamos las columnas que no influyen en el precio

#### Ya vimos en el EDA que este dataset no tenía valores nulos ni outliers destacados. Desechamos la columna precio porque es la que queremos predecir


In [6]:
df = pd.read_csv('../data/diamonds.csv')

df_ml= df.drop(["price", "'x'", "'z'", "'y'", "depth", "table"], axis= 1)
target_ml= df["price"]

df_ml.head()

Unnamed: 0,carat,cut,color,clarity
0,0.23,b'Ideal',b'E',b'SI2'
1,0.21,b'Premium',b'E',b'SI1'
2,0.23,b'Good',b'E',b'VS1'
3,0.29,b'Premium',b'I',b'VS2'
4,0.31,b'Good',b'J',b'SI2'


## Para poder realizar predicciones es necesario tranformar los valores a numéricos. Concretamente las columnas 'Cut', 'Color' y 'Clarity'

In [7]:
from sklearn.preprocessing import LabelEncoder

ml_cut= LabelEncoder()
ml_color= LabelEncoder()
ml_clarity= LabelEncoder()

df_ml['cut_ml']= ml_cut.fit_transform(df_ml['cut'])
df_ml['color_ml']= ml_color.fit_transform(df_ml['color'])
df_ml['clarity_ml']= ml_clarity.fit_transform(df_ml['clarity'])
df_ml.head()

Unnamed: 0,carat,cut,color,clarity,cut_ml,color_ml,clarity_ml
0,0.23,b'Ideal',b'E',b'SI2',2,1,3
1,0.21,b'Premium',b'E',b'SI1',3,1,2
2,0.23,b'Good',b'E',b'VS1',1,1,4
3,0.29,b'Premium',b'I',b'VS2',3,5,5
4,0.31,b'Good',b'J',b'SI2',1,6,3


## La codificación para la columna 'Cut' es: 
### 1: b'good, 2: b'ideal, 3: b'premium, 4: b'very good, 5: b'fair
## La codificación para la columna 'Color' es: 
### 0: b'D', 1: b'E', 2: b'F', 3: b'G', 4: b'H', 5: b'I', 6: b'J'
## La codificación para la columna 'Clarity' es: 
### 0: b'I1', 1: b'IF', 2: b'SI1', 3: b'SI2', 4: b'VS1', 5: b'VS2', 6: b'VVS1', 7: b'VVS2' 

## Eliminamos las columnas sin codificar

In [8]:
df_ml.drop(["cut", "color", "clarity"], axis= 1, inplace= True)
df_ml

Unnamed: 0,carat,cut_ml,color_ml,clarity_ml
0,0.23,2,1,3
1,0.21,3,1,2
2,0.23,1,1,4
3,0.29,3,5,5
4,0.31,1,6,3
...,...,...,...,...
53935,0.72,2,0,2
53936,0.72,1,0,2
53937,0.70,4,0,2
53938,0.86,3,4,3


## Importamos librerías

In [9]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [10]:
df_ml.head()

Unnamed: 0,carat,cut_ml,color_ml,clarity_ml
0,0.23,2,1,3
1,0.21,3,1,2
2,0.23,1,1,4
3,0.29,3,5,5
4,0.31,1,6,3


In [11]:
target_ml.head()

0    326.0
1    326.0
2    327.0
3    334.0
4    335.0
Name: price, dtype: float64

In [12]:
X = df_ml
y = target_ml

X_train, X_test, y_train, y_test = train_test_split(X, y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(40455, 4) (13485, 4) (40455,) (13485,)


In [13]:
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Creamos un baseline de los modelos a evaluar

In [16]:
# Este paso tarda 30 minutos en ejecutarse aprox

baseline = {
    'Linear': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'KNeighbors': KNeighborsRegressor(n_neighbors=2)
}

BL_names = []
BL_mean_mses = []
for model_name, model in baseline.items():
    scores = cross_val_score(model, X_train_scaled, y_train, scoring='neg_mean_squared_error')
    BL_names.append(model_name)
    BL_mean_mses.append(-scores.mean())
    print(f'Modelo {model_name} evaluado')



Modelo Linear evaluado
Modelo Random Forest evaluado
Modelo XGBoost evaluado
Modelo Decision Tree evaluado
Modelo KNeighbors evaluado


In [19]:
ml_scores = pd.DataFrame()
ml_scores['model'] = BL_names
ml_scores['mean mse'] = BL_mean_mses
ml_scores

Unnamed: 0,model,mean mse
0,Linear,1977304.0
1,Random Forest,361611.6
2,XGBoost,319711.5
3,Decision Tree,468343.7
4,KNeighbors,509282.7


### La media más baja es con el modelo de regresión lineal

In [55]:
ml_buenoL = LinearRegression()
ml_buenoL.fit(X_train_scaled, y_train)

## Momento de predecir precio para un diamante proporcionando los 4 valores C

#### Valores 'Carat' de 0 a 5 
#### Desde 0.00 a 5.01
#### Valores 'Cut' de 1 a 5 
#### 1: b'good, 2: b'ideal, 3: b'premium, 4: b'very good, 5: b'fair
#### Valores 'Color' de 0 a 6 
#### 0: b'D', 1: b'E', 2: b'F', 3: b'G', 4: b'H', 5: b'I', 6: b'J'
#### Valores 'Clarity' de 0 a 7: 
#### 0: b'I1', 1: b'IF', 2: b'SI1', 3: b'SI2', 4: b'VS1', 5: b'VS2', 6: b'VVS1', 7: b'VVS2' 


In [56]:
# Valores de diamante aleatorio
ml_buenoL.predict([[0.5 , 3, 4, 3]])

array([19420.28595216])

In [57]:
# Valores de diamante malo
ml_buenoL.predict([[2.0 , 2, 1, 6]])

array([89996.01355613])

In [58]:
# Valores de diamante bueno
ml_buenoL.predict([[2.0 , 3, 6, 3]])

array([75962.37832687])

### La media más alta es con el modelo de KNeighbors

In [59]:
ml_buenoK = KNeighborsRegressor()
ml_buenoK.fit(X_train_scaled, y_train)

In [63]:
# Valores de diamante aleatorio
ml_buenoK.predict([[0.5 , 3, 4, 3]])

array([12001.2])

In [61]:
# Valores de diamante malo
ml_buenoK.predict([[2.0 , 2, 1, 6]])

array([15346.8])

In [64]:
# Valores de diamante bueno
ml_buenoK.predict([[2.0 , 3, 6, 3]])

array([12001.2])